Repository: FEX-Emu/FEX Branch: main Commit: 5c4c468d1326 Files: 3243 Total size: 18.8 MB Directory structure: gitextract_1dff4jwg/ ├── .clang-format ├── .clang-format-ignore ├── .git-blame-ignore-revs ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ └── potential-game-bug.md │ └── workflows/ │ ├── ccpp.yml │ ├── glibc_fault.yml │ ├── hostrunner.yml │ ├── instcountci.yml │ ├── mingw_build.yml │ ├── pr-code-format.yml │ ├── setup-env/ │ │ └── action.yml │ ├── steamrt4.yml │ ├── test/ │ │ └── action.yml │ ├── vixl_simulator.yml │ ├── wine_build/ │ │ └── action.yml │ └── wine_dll_artifacts.yml ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── CMakeLists.txt ├── CMakeSettings.json ├── CODE_OF_CONDUCT.md ├── CodeEmitter/ │ ├── CMakeLists.txt │ └── CodeEmitter/ │ ├── ALUOps.inl │ ├── ASIMDOps.inl │ ├── BranchOps.inl │ ├── Buffer.h │ ├── Emitter.h │ ├── LoadstoreOps.inl │ ├── Registers.h │ ├── SVEOps.inl │ ├── ScalarOps.inl │ ├── SystemOps.inl │ └── VixlUtils.inl ├── Data/ │ ├── AppConfig/ │ │ ├── CMakeLists.txt │ │ ├── client.json │ │ └── steamwebhelper.json │ ├── CI/ │ │ ├── FEXLinuxTestsThunks.json │ │ ├── GLThunks.json │ │ └── VulkanThunks.json │ ├── CMake/ │ │ ├── FindZycore.cmake │ │ ├── FindZydis.cmake │ │ ├── Findxxhash.cmake │ │ ├── LinkerGC.cmake │ │ ├── cmake_uninstall.cmake.in │ │ ├── toolchain_aarch64.cmake │ │ ├── toolchain_mingw.cmake │ │ ├── toolchain_x86_32.cmake │ │ ├── toolchain_x86_64.cmake │ │ └── version_to_variables.cmake │ ├── Dockerfile │ ├── ThunksDB.json │ ├── binfmts/ │ │ ├── CMakeLists.txt │ │ ├── FEX-x86.conf.in │ │ ├── FEX-x86.in │ │ ├── FEX-x86_64.conf.in │ │ └── FEX-x86_64.in │ └── nix/ │ ├── FEXLinuxTests/ │ │ └── shell.nix │ ├── LibraryForwarding/ │ │ └── shell.nix │ ├── WineOnArm/ │ │ └── shell.nix │ ├── cmake_configure_woa32.sh │ ├── cmake_configure_woa64.sh │ ├── cmake_enable_flt.sh │ └── cmake_enable_libfwd.sh ├── External/ │ ├── .clang-format │ ├── SoftFloat-3e/ │ │ ├── CMakeLists.txt │ │ ├── include/ │ │ │ └── SoftFloat-3e/ │ │ │ ├── 
opts-GCC.h │ │ │ ├── platform.h │ │ │ ├── primitiveTypes.h │ │ │ ├── softfloat.h │ │ │ └── softfloat_types.h │ │ └── src/ │ │ ├── extF80_add.c │ │ ├── extF80_div.c │ │ ├── extF80_eq.c │ │ ├── extF80_le.c │ │ ├── extF80_lt.c │ │ ├── extF80_mul.c │ │ ├── extF80_rem.c │ │ ├── extF80_roundToInt.c │ │ ├── extF80_sqrt.c │ │ ├── extF80_sub.c │ │ ├── extF80_to_f128.c │ │ ├── extF80_to_f32.c │ │ ├── extF80_to_f64.c │ │ ├── extF80_to_i32.c │ │ ├── extF80_to_i64.c │ │ ├── extF80_to_ui64.c │ │ ├── f128_add.c │ │ ├── f128_div.c │ │ ├── f128_eq.c │ │ ├── f128_eq_signaling.c │ │ ├── f128_isSignalingNaN.c │ │ ├── f128_le.c │ │ ├── f128_le_quiet.c │ │ ├── f128_lt.c │ │ ├── f128_lt_quiet.c │ │ ├── f128_mul.c │ │ ├── f128_mulAdd.c │ │ ├── f128_rem.c │ │ ├── f128_sqrt.c │ │ ├── f128_sub.c │ │ ├── f128_to_extF80.c │ │ ├── f128_to_f16.c │ │ ├── f128_to_f32.c │ │ ├── f128_to_f64.c │ │ ├── f128_to_i32.c │ │ ├── f128_to_i64.c │ │ ├── f128_to_ui32.c │ │ ├── f128_to_ui64.c │ │ ├── f32_to_extF80.c │ │ ├── f32_to_f128.c │ │ ├── f64_to_extF80.c │ │ ├── i32_to_extF80.c │ │ ├── i32_to_f128.c │ │ ├── internals.h │ │ ├── primitives.h │ │ ├── s_add128.c │ │ ├── s_addMagsExtF80.c │ │ ├── s_addMagsF128.c │ │ ├── s_approxRecip32_1.c │ │ ├── s_approxRecipSqrt32_1.c │ │ ├── s_approxRecipSqrt_1Ks.c │ │ ├── s_approxRecip_1Ks.c │ │ ├── s_commonNaNToExtF80UI.c │ │ ├── s_commonNaNToF128UI.c │ │ ├── s_commonNaNToF32UI.c │ │ ├── s_commonNaNToF64UI.c │ │ ├── s_countLeadingZeros32.c │ │ ├── s_countLeadingZeros64.c │ │ ├── s_countLeadingZeros8.c │ │ ├── s_extF80UIToCommonNaN.c │ │ ├── s_f128UIToCommonNaN.c │ │ ├── s_f32UIToCommonNaN.c │ │ ├── s_f64UIToCommonNaN.c │ │ ├── s_le128.c │ │ ├── s_lt128.c │ │ ├── s_mul64ByShifted32To128.c │ │ ├── s_mul64To128.c │ │ ├── s_normRoundPackToExtF80.c │ │ ├── s_normRoundPackToF128.c │ │ ├── s_normSubnormalExtF80Sig.c │ │ ├── s_normSubnormalF128Sig.c │ │ ├── s_normSubnormalF128SigM.c │ │ ├── s_normSubnormalF32Sig.c │ │ ├── s_normSubnormalF64Sig.c │ │ ├── s_propagateNaNExtF80UI.c 
│ │ ├── s_propagateNaNF128UI.c │ │ ├── s_roundPackToExtF80.c │ │ ├── s_roundPackToF128.c │ │ ├── s_roundPackToF32.c │ │ ├── s_roundPackToF64.c │ │ ├── s_roundToI32.c │ │ ├── s_roundToI64.c │ │ ├── s_roundToUI64.c │ │ ├── s_shiftRightJam128.c │ │ ├── s_shiftRightJam128Extra.c │ │ ├── s_shiftRightJam32.c │ │ ├── s_shiftRightJam64.c │ │ ├── s_shiftRightJam64Extra.c │ │ ├── s_shortShiftLeft128.c │ │ ├── s_shortShiftRight128.c │ │ ├── s_shortShiftRightJam64.c │ │ ├── s_shortShiftRightJam64Extra.c │ │ ├── s_sub128.c │ │ ├── s_subMagsExtF80.c │ │ ├── s_subMagsF128.c │ │ ├── softfloat_raiseFlags.c │ │ ├── specialize.h │ │ └── ui64_to_extF80.c │ ├── cephes/ │ │ ├── CMakeLists.txt │ │ ├── LICENSE │ │ ├── include/ │ │ │ └── cephes_128bit.h │ │ └── src/ │ │ └── 128bit/ │ │ ├── Impl.cpp │ │ ├── atanll.c │ │ ├── constll.c │ │ ├── exp2ll.c │ │ ├── floorll.c │ │ ├── log2ll.c │ │ ├── mconf.h │ │ ├── mtherr.c │ │ ├── polevll.c │ │ ├── sinll.c │ │ └── tanll.c │ ├── code-format-helper/ │ │ ├── code-format-helper.py │ │ ├── requirements_formatting.txt │ │ └── requirements_formatting.txt.in │ └── tiny-json/ │ ├── CMakeLists.txt │ ├── LICENSE │ ├── tiny-json.c │ └── tiny-json.h ├── FEXCore/ │ ├── CMakeLists.txt │ ├── LICENSE │ ├── Readme.md │ ├── Scripts/ │ │ ├── config_generator.py │ │ ├── json_ir_doc_generator.py │ │ └── json_ir_generator.py │ ├── Source/ │ │ ├── CMakeLists.txt │ │ ├── Common/ │ │ │ ├── BitSet.h │ │ │ ├── JitSymbols.cpp │ │ │ ├── JitSymbols.h │ │ │ ├── SoftFloat.h │ │ │ ├── StringConv.h │ │ │ └── VectorRegType.h │ │ ├── Interface/ │ │ │ ├── Config/ │ │ │ │ ├── Config.cpp │ │ │ │ └── Config.json.in │ │ │ ├── Context/ │ │ │ │ ├── Context.cpp │ │ │ │ └── Context.h │ │ │ ├── Core/ │ │ │ │ ├── Addressing.cpp │ │ │ │ ├── Addressing.h │ │ │ │ ├── ArchHelpers/ │ │ │ │ │ ├── Arm64Emitter.cpp │ │ │ │ │ └── Arm64Emitter.h │ │ │ │ ├── CPUBackend.cpp │ │ │ │ ├── CPUBackend.h │ │ │ │ ├── CPUID.cpp │ │ │ │ ├── CPUID.h │ │ │ │ ├── CodeCache.cpp │ │ │ │ ├── Core.cpp │ │ │ │ ├── 
Dispatcher/ │ │ │ │ │ ├── Dispatcher.cpp │ │ │ │ │ └── Dispatcher.h │ │ │ │ ├── Frontend.cpp │ │ │ │ ├── Frontend.h │ │ │ │ ├── Interpreter/ │ │ │ │ │ ├── Fallbacks/ │ │ │ │ │ │ ├── F80Fallbacks.h │ │ │ │ │ │ ├── FallbackOpHandler.h │ │ │ │ │ │ ├── InterpreterFallbacks.cpp │ │ │ │ │ │ ├── StringCompareFallbacks.cpp │ │ │ │ │ │ └── VectorFallbacks.h │ │ │ │ │ └── InterpreterOps.h │ │ │ │ ├── JIT/ │ │ │ │ │ ├── ALUOps.cpp │ │ │ │ │ ├── Arm64Relocations.cpp │ │ │ │ │ ├── AtomicOps.cpp │ │ │ │ │ ├── BranchOps.cpp │ │ │ │ │ ├── ConversionOps.cpp │ │ │ │ │ ├── DebugData.h │ │ │ │ │ ├── EncryptionOps.cpp │ │ │ │ │ ├── JIT.cpp │ │ │ │ │ ├── JITClass.h │ │ │ │ │ ├── MemoryOps.cpp │ │ │ │ │ ├── MiscOps.cpp │ │ │ │ │ ├── MoveOps.cpp │ │ │ │ │ ├── Relocations.h │ │ │ │ │ └── VectorOps.cpp │ │ │ │ ├── LookupCache.cpp │ │ │ │ ├── LookupCache.h │ │ │ │ ├── OpcodeDispatcher/ │ │ │ │ │ ├── AVX_128.cpp │ │ │ │ │ ├── BaseTables.h │ │ │ │ │ ├── Crypto.cpp │ │ │ │ │ ├── DDDTables.h │ │ │ │ │ ├── Flags.cpp │ │ │ │ │ ├── H0F38Tables.h │ │ │ │ │ ├── H0F3ATables.h │ │ │ │ │ ├── PrimaryGroupTables.h │ │ │ │ │ ├── SecondaryGroupTables.h │ │ │ │ │ ├── SecondaryModRMTables.h │ │ │ │ │ ├── SecondaryTables.h │ │ │ │ │ ├── VEXTables.h │ │ │ │ │ ├── Vector.cpp │ │ │ │ │ ├── X87.cpp │ │ │ │ │ └── X87F64.cpp │ │ │ │ ├── OpcodeDispatcher.cpp │ │ │ │ ├── OpcodeDispatcher.h │ │ │ │ ├── VSyscall/ │ │ │ │ │ └── VSyscall.inc │ │ │ │ └── X86Tables/ │ │ │ │ ├── BaseTables.cpp │ │ │ │ ├── DDDTables.cpp │ │ │ │ ├── H0F38Tables.cpp │ │ │ │ ├── H0F3ATables.cpp │ │ │ │ ├── PrimaryGroupTables.cpp │ │ │ │ ├── SecondaryGroupTables.cpp │ │ │ │ ├── SecondaryModRMTables.cpp │ │ │ │ ├── SecondaryTables.cpp │ │ │ │ ├── VEXTables.cpp │ │ │ │ ├── X86Tables.h │ │ │ │ └── X87Tables.cpp │ │ │ ├── GDBJIT/ │ │ │ │ ├── GDBJIT.cpp │ │ │ │ └── GDBJIT.h │ │ │ └── IR/ │ │ │ ├── IR.h │ │ │ ├── IR.json │ │ │ ├── IRDumper.cpp │ │ │ ├── IREmitter.cpp │ │ │ ├── IREmitter.h │ │ │ ├── IntrusiveIRList.h │ │ │ ├── PassManager.cpp │ │ │ ├── 
PassManager.h │ │ │ ├── Passes/ │ │ │ │ ├── IRDumperPass.cpp │ │ │ │ ├── IRValidation.cpp │ │ │ │ ├── IRValidation.h │ │ │ │ ├── RedundantFlagCalculationElimination.cpp │ │ │ │ ├── RegisterAllocationPass.cpp │ │ │ │ ├── RegisterAllocationPass.h │ │ │ │ └── x87StackOptimizationPass.cpp │ │ │ ├── Passes.h │ │ │ └── RegisterAllocationData.h │ │ └── Utils/ │ │ ├── Allocator/ │ │ │ ├── 64BitAllocator.cpp │ │ │ ├── FlexBitSet.h │ │ │ ├── HostAllocator.h │ │ │ └── IntrusiveArenaAllocator.h │ │ ├── Allocator.cpp │ │ ├── Allocator.h │ │ ├── AllocatorHooks.cpp │ │ ├── AllocatorOverride.cpp │ │ ├── ArchHelpers/ │ │ │ ├── Arm64.cpp │ │ │ └── Arm64_stubs.cpp │ │ ├── BucketList.h │ │ ├── Config.h │ │ ├── FileLoading.cpp │ │ ├── ForcedAssert.cpp │ │ ├── LogManager.cpp │ │ ├── LongJump.cpp │ │ ├── MemberFunctionToPointer.h │ │ ├── Profiler.cpp │ │ ├── SpinWaitLock.cpp │ │ ├── SpinWaitLock.h │ │ ├── Telemetry.cpp │ │ ├── Threads.cpp │ │ ├── WritePriorityMutex.h │ │ └── variable_length_integer.h │ ├── docs/ │ │ ├── CPUBackends.md │ │ ├── CustomCPUBackend.md │ │ ├── Frontend.md │ │ ├── IR.md │ │ ├── MemoryModelEmulation.md │ │ └── OpDispatcher.md │ ├── include/ │ │ ├── FEXCore/ │ │ │ ├── Config/ │ │ │ │ └── Config.h │ │ │ ├── Core/ │ │ │ │ ├── CPUID.h │ │ │ │ ├── CodeCache.h │ │ │ │ ├── Context.h │ │ │ │ ├── CoreState.h │ │ │ │ ├── HostFeatures.h │ │ │ │ ├── SignalDelegator.h │ │ │ │ ├── Thunks.h │ │ │ │ └── X86Enums.h │ │ │ ├── Debug/ │ │ │ │ ├── GDBReaderInterface.h │ │ │ │ └── InternalThreadState.h │ │ │ ├── HLE/ │ │ │ │ ├── SourcecodeResolver.h │ │ │ │ └── SyscallHandler.h │ │ │ ├── IR/ │ │ │ │ └── IR.h │ │ │ ├── Utils/ │ │ │ │ ├── Allocator.h │ │ │ │ ├── AllocatorHooks.h │ │ │ │ ├── ArchHelpers/ │ │ │ │ │ └── Arm64.h │ │ │ │ ├── CompilerDefs.h │ │ │ │ ├── EnumOperators.h │ │ │ │ ├── EnumUtils.h │ │ │ │ ├── Event.h │ │ │ │ ├── FPState.h │ │ │ │ ├── File.h │ │ │ │ ├── FileLoading.h │ │ │ │ ├── InterruptableConditionVariable.h │ │ │ │ ├── IntervalList.h │ │ │ │ ├── LogManager.h │ │ 
│ │ ├── LongJump.h │ │ │ │ ├── MathUtils.h │ │ │ │ ├── PrctlUtils.h │ │ │ │ ├── Profiler.h │ │ │ │ ├── SHMStats.h │ │ │ │ ├── SignalScopeGuards.h │ │ │ │ ├── StringUtils.h │ │ │ │ ├── Telemetry.h │ │ │ │ ├── ThreadPoolAllocator.h │ │ │ │ ├── Threads.h │ │ │ │ └── TypeDefines.h │ │ │ └── fextl/ │ │ │ ├── allocator.h │ │ │ ├── deque.h │ │ │ ├── fmt.h │ │ │ ├── forward_list.h │ │ │ ├── functional.h │ │ │ ├── list.h │ │ │ ├── map.h │ │ │ ├── memory.h │ │ │ ├── memory_resource.h │ │ │ ├── queue.h │ │ │ ├── robin_map.h │ │ │ ├── robin_set.h │ │ │ ├── set.h │ │ │ ├── sstream.h │ │ │ ├── stack.h │ │ │ ├── string.h │ │ │ ├── unordered_map.h │ │ │ ├── unordered_set.h │ │ │ └── vector.h │ │ └── git_version.h.in │ └── unittests/ │ ├── APITests/ │ │ ├── Allocator.cpp │ │ ├── CMakeLists.txt │ │ ├── FileLoading.cpp │ │ ├── FlexBitSet.cpp │ │ ├── FutexSpinTest.cpp │ │ ├── ILog2.cpp │ │ └── vl_integer.cpp │ ├── CMakeLists.txt │ └── Emitter/ │ ├── ALU_Tests.cpp │ ├── ASIMD_Tests.cpp │ ├── Branch_Tests.cpp │ ├── CMakeLists.txt │ ├── Loadstore_Tests.cpp │ ├── SVE_Tests.cpp │ ├── Scalar_Tests.cpp │ ├── System_Tests.cpp │ └── TestDisassembler.h ├── FEXHeaderUtils/ │ ├── CMakeLists.txt │ └── FEXHeaderUtils/ │ ├── BitUtils.h │ ├── Filesystem.h │ ├── RingBuffer.h │ ├── StringArgumentParser.h │ ├── SymlinkChecks.h │ └── Syscalls.h ├── LICENSE ├── Readme.md ├── Scripts/ │ ├── CI_FetchRootFS.py │ ├── CheckBinfmtNotInstall.sh │ ├── ClassifyCPU.py │ ├── DefinitionExtract.py │ ├── FEXUpdateAOTIRCache.sh │ ├── GenerateSyscallNumbers.py │ ├── InstallFEX.py │ ├── InstructionCountParser.py │ ├── NeedDisabledSVE.py │ ├── StructPackVerifier.py │ ├── Threaded_Lockstep_Runner.py │ ├── UpdateInstructionCountJson.py │ ├── aarch64_fit_native.py │ ├── changelog_generator.py │ ├── doc_outline_generator.py │ ├── generate_changelog.sh │ ├── generate_doc_outline.sh │ ├── generate_release.sh │ ├── guest_test_runner.py │ ├── json_asm_config_parse.py │ ├── json_config_parse.py │ ├── json_ir_config_parse.py │ ├── 
reformat.sh │ ├── testharness_runner.py │ └── update_instcountci.sh ├── Source/ │ ├── CMakeLists.txt │ ├── Common/ │ │ ├── ArgumentLoader.cpp │ │ ├── ArgumentLoader.h │ │ ├── Async.h │ │ ├── AsyncNet.h │ │ ├── CMakeLists.txt │ │ ├── CPUInfo.cpp │ │ ├── CPUInfo.h │ │ ├── Config.cpp │ │ ├── Config.h │ │ ├── FDUtils.h │ │ ├── FEXServerClient.cpp │ │ ├── FEXServerClient.h │ │ ├── FileFormatCheck.cpp │ │ ├── FileFormatCheck.h │ │ ├── FileMappingBaseAddress.h │ │ ├── HostFeatures.cpp │ │ ├── HostFeatures.h │ │ ├── JSONPool.cpp │ │ ├── JSONPool.h │ │ ├── Linux/ │ │ │ ├── SBRKAllocations.cpp │ │ │ └── SBRKAllocations.h │ │ ├── SHMStats.cpp │ │ ├── SHMStats.h │ │ ├── VolatileMetadata.cpp │ │ ├── VolatileMetadata.h │ │ └── X86Features.h │ ├── Steam/ │ │ ├── CMakeLists.txt │ │ ├── CompatTool.cpp │ │ ├── ConfigTemplate.json │ │ ├── ServerManager.cpp │ │ ├── VERSIONS.txt.in │ │ ├── emulator.json │ │ └── toolmanifest.vdf │ ├── Tools/ │ │ ├── CMakeLists.txt │ │ ├── CodeSizeValidation/ │ │ │ ├── CMakeLists.txt │ │ │ └── Main.cpp │ │ ├── CommonTools/ │ │ │ ├── CMakeLists.txt │ │ │ ├── CodeLoader.h │ │ │ ├── DummyHandlers.cpp │ │ │ ├── DummyHandlers.h │ │ │ ├── HarnessHelpers.h │ │ │ ├── Linux/ │ │ │ │ └── Utils/ │ │ │ │ ├── ELFContainer.cpp │ │ │ │ ├── ELFContainer.h │ │ │ │ └── ELFParser.h │ │ │ └── PortabilityInfo.h │ │ ├── FEXBash/ │ │ │ ├── CMakeLists.txt │ │ │ └── FEXBash.cpp │ │ ├── FEXConfig/ │ │ │ ├── CMakeLists.txt │ │ │ ├── Main.cpp │ │ │ ├── Main.h │ │ │ ├── main.qml │ │ │ ├── qml5.qrc │ │ │ ├── qml6.qrc │ │ │ ├── qt5/ │ │ │ │ ├── FileDialog.qml │ │ │ │ ├── FolderDialog.qml │ │ │ │ └── MessageDialog.qml │ │ │ └── qt6/ │ │ │ ├── FileDialog.qml │ │ │ ├── FolderDialog.qml │ │ │ └── MessageDialog.qml │ │ ├── FEXGDBReader/ │ │ │ ├── CMakeLists.txt │ │ │ └── FEXGDBReader.cpp │ │ ├── FEXGetConfig/ │ │ │ ├── CMakeLists.txt │ │ │ └── Main.cpp │ │ ├── FEXInterpreter/ │ │ │ ├── AOT/ │ │ │ │ ├── AOTGenerator.cpp │ │ │ │ └── AOTGenerator.h │ │ │ ├── CMakeLists.txt │ │ │ ├── 
ELFCodeLoader.h │ │ │ └── FEXInterpreter.cpp │ │ ├── FEXOfflineCompiler/ │ │ │ ├── CMakeLists.txt │ │ │ └── Main.cpp │ │ ├── FEXRootFSFetcher/ │ │ │ ├── CMakeLists.txt │ │ │ ├── Main.cpp │ │ │ ├── XXFileHash.cpp │ │ │ └── XXFileHash.h │ │ ├── FEXServer/ │ │ │ ├── ArgumentLoader.cpp │ │ │ ├── ArgumentLoader.h │ │ │ ├── CMakeLists.txt │ │ │ ├── Logger.cpp │ │ │ ├── Logger.h │ │ │ ├── Main.cpp │ │ │ ├── PipeScanner.cpp │ │ │ ├── PipeScanner.h │ │ │ ├── ProcessPipe.cpp │ │ │ ├── ProcessPipe.h │ │ │ ├── SquashFS.cpp │ │ │ └── SquashFS.h │ │ ├── LinuxEmulation/ │ │ │ ├── ArchHelpers/ │ │ │ │ ├── MContext.cpp │ │ │ │ ├── MContext.h │ │ │ │ ├── UContext.h │ │ │ │ └── WinContext.h │ │ │ ├── CMakeLists.txt │ │ │ ├── GdbServer/ │ │ │ │ ├── Info.cpp │ │ │ │ └── Info.h │ │ │ ├── LinuxSyscalls/ │ │ │ │ ├── Arm64/ │ │ │ │ │ └── SyscallsEnum.h │ │ │ │ ├── EmulatedFiles/ │ │ │ │ │ ├── EmulatedFiles.cpp │ │ │ │ │ └── EmulatedFiles.h │ │ │ │ ├── FaultSafeUserMemAccess.cpp │ │ │ │ ├── FileManagement.cpp │ │ │ │ ├── FileManagement.h │ │ │ │ ├── GdbServer.cpp │ │ │ │ ├── GdbServer.h │ │ │ │ ├── LinuxAllocator.cpp │ │ │ │ ├── LinuxAllocator.h │ │ │ │ ├── Seccomp/ │ │ │ │ │ ├── BPFEmitter.cpp │ │ │ │ │ ├── BPFEmitter.h │ │ │ │ │ ├── Dumper.cpp │ │ │ │ │ ├── SeccompEmulator.cpp │ │ │ │ │ └── SeccompEmulator.h │ │ │ │ ├── SignalDelegator/ │ │ │ │ │ └── GuestFramesManagement.cpp │ │ │ │ ├── SignalDelegator.cpp │ │ │ │ ├── SignalDelegator.h │ │ │ │ ├── Syscalls/ │ │ │ │ │ ├── EPoll.cpp │ │ │ │ │ ├── FD.cpp │ │ │ │ │ ├── FS.cpp │ │ │ │ │ ├── IO.cpp │ │ │ │ │ ├── Info.cpp │ │ │ │ │ ├── Memory.cpp │ │ │ │ │ ├── NotImplemented.cpp │ │ │ │ │ ├── Passthrough.cpp │ │ │ │ │ ├── Signals.cpp │ │ │ │ │ ├── Stubs.cpp │ │ │ │ │ ├── Thread.cpp │ │ │ │ │ ├── Thread.h │ │ │ │ │ └── Timer.cpp │ │ │ │ ├── Syscalls.cpp │ │ │ │ ├── Syscalls.h │ │ │ │ ├── SyscallsSMCTracking.cpp │ │ │ │ ├── SyscallsVMATracking.cpp │ │ │ │ ├── SyscallsVMATracking.h │ │ │ │ ├── ThreadManager.cpp │ │ │ │ ├── ThreadManager.h │ │ │ │ 
├── Types.h │ │ │ │ ├── Utils/ │ │ │ │ │ ├── Threads.cpp │ │ │ │ │ └── Threads.h │ │ │ │ ├── x32/ │ │ │ │ │ ├── EPoll.cpp │ │ │ │ │ ├── FD.cpp │ │ │ │ │ ├── FS.cpp │ │ │ │ │ ├── IO.cpp │ │ │ │ │ ├── Info.cpp │ │ │ │ │ ├── Ioctl/ │ │ │ │ │ │ ├── HelperDefines.h │ │ │ │ │ │ ├── amdgpu_drm.inl │ │ │ │ │ │ ├── asahi_drm.inl │ │ │ │ │ │ ├── asound.h │ │ │ │ │ │ ├── asound.inl │ │ │ │ │ │ ├── drm.h │ │ │ │ │ │ ├── drm.inl │ │ │ │ │ │ ├── ext_fs.h │ │ │ │ │ │ ├── ext_fs.inl │ │ │ │ │ │ ├── f2fs.h │ │ │ │ │ │ ├── f2fs.inl │ │ │ │ │ │ ├── i915_drm.inl │ │ │ │ │ │ ├── input.h │ │ │ │ │ │ ├── input.inl │ │ │ │ │ │ ├── joystick.h │ │ │ │ │ │ ├── joystick.inl │ │ │ │ │ │ ├── lima_drm.inl │ │ │ │ │ │ ├── msdos_fs.h │ │ │ │ │ │ ├── msdos_fs.inl │ │ │ │ │ │ ├── msm_drm.inl │ │ │ │ │ │ ├── nouveau_drm.inl │ │ │ │ │ │ ├── nova_drm.inl │ │ │ │ │ │ ├── panfrost_drm.inl │ │ │ │ │ │ ├── panthor_drm.inl │ │ │ │ │ │ ├── pvr_drm.inl │ │ │ │ │ │ ├── radeon_drm.inl │ │ │ │ │ │ ├── sockios.h │ │ │ │ │ │ ├── sockios.inl │ │ │ │ │ │ ├── streams.h │ │ │ │ │ │ ├── streams.inl │ │ │ │ │ │ ├── usbdev.h │ │ │ │ │ │ ├── usbdev.inl │ │ │ │ │ │ ├── v3d_drm.inl │ │ │ │ │ │ ├── v4l2.h │ │ │ │ │ │ ├── v4l2.inl │ │ │ │ │ │ ├── vc4_drm.inl │ │ │ │ │ │ ├── virtio_drm.inl │ │ │ │ │ │ ├── wireless.h │ │ │ │ │ │ ├── wireless.inl │ │ │ │ │ │ └── xe_drm.inl │ │ │ │ │ ├── IoctlEmulation.cpp │ │ │ │ │ ├── IoctlEmulation.h │ │ │ │ │ ├── Ioctls.inl │ │ │ │ │ ├── Memory.cpp │ │ │ │ │ ├── Msg.cpp │ │ │ │ │ ├── NotImplemented.cpp │ │ │ │ │ ├── Sched.cpp │ │ │ │ │ ├── Semaphore.cpp │ │ │ │ │ ├── Signals.cpp │ │ │ │ │ ├── Socket.cpp │ │ │ │ │ ├── Stubs.cpp │ │ │ │ │ ├── Syscalls.cpp │ │ │ │ │ ├── Syscalls.h │ │ │ │ │ ├── SyscallsEnum.h │ │ │ │ │ ├── SyscallsNames.inl │ │ │ │ │ ├── Thread.cpp │ │ │ │ │ ├── Thread.h │ │ │ │ │ ├── Time.cpp │ │ │ │ │ ├── Timer.cpp │ │ │ │ │ └── Types.h │ │ │ │ └── x64/ │ │ │ │ ├── EPoll.cpp │ │ │ │ ├── FD.cpp │ │ │ │ ├── Info.cpp │ │ │ │ ├── Ioctl/ │ │ │ │ │ ├── HelperDefines.h │ │ │ │ │ ├── 
amdgpu_drm.inl │ │ │ │ │ ├── asound.h │ │ │ │ │ ├── asound.inl │ │ │ │ │ ├── drm.h │ │ │ │ │ ├── drm.inl │ │ │ │ │ ├── ext_fs.h │ │ │ │ │ ├── ext_fs.inl │ │ │ │ │ ├── f2fs.h │ │ │ │ │ ├── f2fs.inl │ │ │ │ │ ├── input.h │ │ │ │ │ ├── input.inl │ │ │ │ │ ├── joystick.h │ │ │ │ │ ├── joystick.inl │ │ │ │ │ ├── msdos_fs.h │ │ │ │ │ ├── msdos_fs.inl │ │ │ │ │ ├── msm_drm.inl │ │ │ │ │ ├── sockios.h │ │ │ │ │ ├── sockios.inl │ │ │ │ │ ├── wireless.h │ │ │ │ │ └── wireless.inl │ │ │ │ ├── Memory.cpp │ │ │ │ ├── NotImplemented.cpp │ │ │ │ ├── Semaphore.cpp │ │ │ │ ├── Signals.cpp │ │ │ │ ├── Syscalls.cpp │ │ │ │ ├── Syscalls.h │ │ │ │ ├── SyscallsEnum.h │ │ │ │ ├── SyscallsNames.inl │ │ │ │ ├── Thread.cpp │ │ │ │ ├── Thread.h │ │ │ │ ├── Time.cpp │ │ │ │ └── Types.h │ │ │ ├── Thunks.cpp │ │ │ ├── Thunks.h │ │ │ ├── VDSO_Emulation.cpp │ │ │ └── VDSO_Emulation.h │ │ ├── TestHarnessRunner/ │ │ │ ├── CMakeLists.txt │ │ │ ├── TestHarnessRunner/ │ │ │ │ ├── HostRunner.cpp │ │ │ │ └── HostRunner.h │ │ │ └── TestHarnessRunner.cpp │ │ └── pidof/ │ │ ├── CMakeLists.txt │ │ └── pidof.cpp │ └── Windows/ │ ├── ARM64EC/ │ │ ├── BTInterface.h │ │ ├── CMakeLists.txt │ │ ├── Module.S │ │ ├── Module.cpp │ │ └── libarm64ecfex.def │ ├── CMakeLists.txt │ ├── Common/ │ │ ├── CMakeLists.txt │ │ ├── CPUFeatures.cpp │ │ ├── CPUFeatures.h │ │ ├── CRT/ │ │ │ ├── Alloc.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── CRT.cpp │ │ │ ├── CRT.h │ │ │ ├── IO.cpp │ │ │ ├── Math.cpp │ │ │ ├── Misc.cpp │ │ │ ├── Musl/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── __math_divzero.c │ │ │ │ ├── __math_invalid.c │ │ │ │ ├── __math_oflow.c │ │ │ │ ├── __math_uflow.c │ │ │ │ ├── __math_xflow.c │ │ │ │ ├── exp2.c │ │ │ │ ├── exp_data.c │ │ │ │ ├── exp_data.h │ │ │ │ ├── fmod.c │ │ │ │ ├── isnan.c │ │ │ │ ├── libm.h │ │ │ │ ├── log2.c │ │ │ │ ├── log2_data.c │ │ │ │ ├── log2_data.h │ │ │ │ ├── remainder.c │ │ │ │ ├── remquo.c │ │ │ │ ├── strtoimax.c │ │ │ │ ├── strtoll.c │ │ │ │ ├── strtoull.c │ │ │ │ └── strtoumax.c │ │ │ └── 
String.cpp │ │ ├── CallRetStack.h │ │ ├── Exception.h │ │ ├── Handle.h │ │ ├── ImageTracker.cpp │ │ ├── ImageTracker.h │ │ ├── InvalidationTracker.cpp │ │ ├── InvalidationTracker.h │ │ ├── JITGuardPage.h │ │ ├── LoadConfig.S │ │ ├── Logging.cpp │ │ ├── Logging.h │ │ ├── Module.h │ │ ├── OvercommitTracker.h │ │ ├── PortabilityInfo.h │ │ ├── Priv.h │ │ ├── SHMStats.cpp │ │ ├── SHMStats.h │ │ ├── TSOHandlerConfig.h │ │ └── WinAPI/ │ │ ├── Alloc.cpp │ │ ├── CMakeLists.txt │ │ ├── IO.cpp │ │ ├── Misc.cpp │ │ └── Sync.cpp │ ├── Defs/ │ │ ├── ntdll.def │ │ └── wow64.def │ ├── WOW64/ │ │ ├── BTInterface.h │ │ ├── CMakeLists.txt │ │ ├── Module.cpp │ │ └── libwow64fex.def │ └── include/ │ ├── wine/ │ │ ├── debug.h │ │ └── unixlib.h │ ├── winnt.h │ └── winternl.h ├── ThunkLibs/ │ ├── Generator/ │ │ ├── CMakeLists.txt │ │ ├── analysis.cpp │ │ ├── analysis.h │ │ ├── data_layout.cpp │ │ ├── data_layout.h │ │ ├── diagnostics.h │ │ ├── gen.cpp │ │ ├── interface.h │ │ └── main.cpp │ ├── GuestLibs/ │ │ └── CMakeLists.txt │ ├── HostLibs/ │ │ └── CMakeLists.txt │ ├── README.md │ ├── include/ │ │ └── common/ │ │ ├── GeneratorInterface.h │ │ ├── Guest.h │ │ ├── Host.h │ │ ├── PackedArguments.h │ │ └── X11Manager.h │ ├── libEGL/ │ │ ├── libEGL_Guest.cpp │ │ ├── libEGL_Host.cpp │ │ └── libEGL_interface.cpp │ ├── libGL/ │ │ ├── glcorearb.h │ │ ├── libGL_Guest.cpp │ │ ├── libGL_Host.cpp │ │ └── libGL_interface.cpp │ ├── libSDL2/ │ │ ├── libSDL2_Guest.cpp │ │ └── libSDL2_Host.cpp │ ├── libVDSO/ │ │ ├── Types.h │ │ ├── libVDSO_Guest.cpp │ │ ├── libVDSO_Guest.lds │ │ ├── libVDSO_Guest_32.lds │ │ └── libVDSO_interface.cpp │ ├── libX11/ │ │ └── libX11_NativeGuest.cpp │ ├── libasound/ │ │ ├── libasound_Guest.cpp │ │ ├── libasound_Host.cpp │ │ └── libasound_interface.cpp │ ├── libdrm/ │ │ ├── Guest.cpp │ │ ├── Host.cpp │ │ └── libdrm_interface.cpp │ ├── libfex_malloc/ │ │ ├── Guest.cpp │ │ ├── Host.cpp │ │ └── Types.h │ ├── libfex_malloc_loader/ │ │ └── Guest.cpp │ ├── libfex_malloc_symbols/ │ │ 
└── Host.cpp │ ├── libfex_thunk_test/ │ │ ├── Guest.cpp │ │ ├── Host.cpp │ │ ├── api.h │ │ ├── lib.cpp │ │ └── libfex_thunk_test_interface.cpp │ ├── libvulkan/ │ │ ├── Guest.cpp │ │ ├── Host.cpp │ │ └── libvulkan_interface.cpp │ ├── libwayland-client/ │ │ ├── Guest.cpp │ │ ├── Host.cpp │ │ └── libwayland-client_interface.cpp │ └── libxshmfence/ │ ├── Guest.cpp │ ├── Host.cpp │ └── libxshmfence_interface.cpp ├── docs/ │ ├── CPUID.md │ ├── DeferredSignals.md │ ├── ProgrammingConcerns.md │ ├── Readme_CN.md │ ├── ReleaseProcess.md │ ├── SourceOutline.md │ └── allocator_usage.md └── unittests/ ├── 32Bit_ASM/ │ ├── CMakeLists.txt │ ├── Disabled_Tests │ ├── Disabled_Tests_Simulator │ ├── Disabled_Tests_host │ ├── FEX_bugs/ │ │ ├── GOT_calculation.asm │ │ ├── IMUL_garbagedata.asm │ │ ├── InlineSyscall.asm │ │ ├── InvertedCarrySet.asm │ │ ├── LoopAddressSizeCheck.asm │ │ ├── SignExtendBug.asm │ │ ├── SubAddrBug.asm │ │ ├── TelemetryFlags.asm │ │ ├── VEXW_Bug.asm │ │ ├── adc.asm │ │ ├── rep_lods_bug.asm │ │ └── x87_unordered_cmp_fix_32.asm │ ├── Known_Failures │ ├── Primary/ │ │ ├── Loops.asm │ │ ├── Pop_Segments.asm │ │ ├── Primary_00.asm │ │ ├── Primary_00_2.asm │ │ ├── Primary_00_3.asm │ │ ├── Primary_27.asm │ │ ├── Primary_2F.asm │ │ ├── Primary_37.asm │ │ ├── Primary_3F.asm │ │ ├── Primary_60.asm │ │ ├── Primary_60_2.asm │ │ ├── Primary_61.asm │ │ ├── Primary_61_2.asm │ │ ├── Primary_8C.asm │ │ ├── Primary_8C_2.asm │ │ ├── Primary_8D.asm │ │ ├── Primary_A0.asm │ │ ├── Primary_A2.asm │ │ ├── Primary_A6.asm │ │ ├── Primary_A6_REP.asm │ │ ├── Primary_A6_REPNE.asm │ │ ├── Primary_A6_REPNE_Equal.asm │ │ ├── Primary_A6_REP_Equal.asm │ │ ├── Primary_A6_REP_Smaller.asm │ │ ├── Primary_A6_REP_down.asm │ │ ├── Primary_A6_REP_down_Equal.asm │ │ ├── Primary_A6_down.asm │ │ ├── Primary_A7_dword.asm │ │ ├── Primary_A7_dword_down.asm │ │ ├── Primary_A7_word.asm │ │ ├── Primary_A7_word_down.asm │ │ ├── Primary_AE.asm │ │ ├── Primary_AE_REP.asm │ │ ├── Primary_AE_REPNE.asm │ │ ├── 
Primary_AE_REPNE_down.asm │ │ ├── Primary_AE_REP_down.asm │ │ ├── Primary_AF_REP_dword.asm │ │ ├── Primary_AF_REP_word.asm │ │ ├── Primary_C9.asm │ │ ├── Primary_CE.asm │ │ ├── Primary_CF.asm │ │ ├── Primary_D4.asm │ │ ├── Primary_D5.asm │ │ ├── Primary_D6.asm │ │ ├── Primary_E3.asm │ │ ├── Primary_E8.asm │ │ ├── Primary_E8_2.asm │ │ ├── Primary_E9.asm │ │ ├── Primary_E9_2.asm │ │ └── Push_Segments.asm │ ├── PrimaryGroup/ │ │ ├── 3_F6_05.asm │ │ ├── 5_FF_02.asm │ │ ├── 5_FF_02_2.asm │ │ └── 5_FF_02_3.asm │ ├── Secondary/ │ │ ├── 07_XX_00.asm │ │ ├── 07_XX_04.asm │ │ └── 15_XX_0.asm │ ├── SecondaryModRM/ │ │ ├── Reg_7_1.asm │ │ └── Reg_7_4_2.asm │ ├── TwoByte/ │ │ ├── 0F_82.asm │ │ └── 0F_82_2.asm │ ├── VEX/ │ │ ├── vgather_qpd_128bit_1xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_2xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_4xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_8xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_1xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_2xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_4xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_8xdisp_overflow.asm │ │ ├── vgather_qps_128bit_1xdisp_overflow.asm │ │ ├── vgather_qps_128bit_2xdisp_overflow.asm │ │ ├── vgather_qps_128bit_4xdisp_overflow.asm │ │ ├── vgather_qps_128bit_8xdisp_overflow.asm │ │ ├── vgather_qps_256bit_1xdisp_overflow.asm │ │ ├── vgather_qps_256bit_2xdisp_overflow.asm │ │ ├── vgather_qps_256bit_4xdisp_overflow.asm │ │ ├── vgather_qps_256bit_8xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_1xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_2xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_4xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_8xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_1xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_2xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_4xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_8xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_1xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_2xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_4xdisp_overflow.asm │ │ ├── 
vpgather_qq_128bit_8xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_1xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_2xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_4xdisp_overflow.asm │ │ └── vpgather_qq_256bit_8xdisp_overflow.asm │ ├── X87/ │ │ ├── D8_00.asm │ │ ├── D8_01.asm │ │ ├── D8_04.asm │ │ ├── D8_05.asm │ │ ├── D8_06.asm │ │ ├── D8_07.asm │ │ ├── D8_C0.asm │ │ ├── D8_C8.asm │ │ ├── D8_E0.asm │ │ ├── D8_E8.asm │ │ ├── D8_F0.asm │ │ ├── D8_F0_2.asm │ │ ├── D8_F8.asm │ │ ├── D9_00.asm │ │ ├── D9_02.asm │ │ ├── D9_03.asm │ │ ├── D9_05.asm │ │ ├── D9_06.asm │ │ ├── D9_06_2.asm │ │ ├── D9_07.asm │ │ ├── D9_C0.asm │ │ ├── D9_C8.asm │ │ ├── D9_D0.asm │ │ ├── D9_E0.asm │ │ ├── D9_E1.asm │ │ ├── D9_E8.asm │ │ ├── D9_E9.asm │ │ ├── D9_EA.asm │ │ ├── D9_EB.asm │ │ ├── D9_EC.asm │ │ ├── D9_ED.asm │ │ ├── D9_EE.asm │ │ ├── D9_F0.asm │ │ ├── D9_F1.asm │ │ ├── D9_F2.asm │ │ ├── D9_F3.asm │ │ ├── D9_F4.asm │ │ ├── D9_F5.asm │ │ ├── D9_F6.asm │ │ ├── D9_F7.asm │ │ ├── D9_F8.asm │ │ ├── D9_F9.asm │ │ ├── D9_FA.asm │ │ ├── D9_FB.asm │ │ ├── D9_FC.asm │ │ ├── D9_FD.asm │ │ ├── D9_FE.asm │ │ ├── D9_FF.asm │ │ ├── DA_00.asm │ │ ├── DA_01.asm │ │ ├── DA_04.asm │ │ ├── DA_05.asm │ │ ├── DA_06.asm │ │ ├── DA_07.asm │ │ ├── DA_C0.asm │ │ ├── DA_C8.asm │ │ ├── DA_D0.asm │ │ ├── DA_D8.asm │ │ ├── DB_00.asm │ │ ├── DB_01.asm │ │ ├── DB_02.asm │ │ ├── DB_03.asm │ │ ├── DB_05.asm │ │ ├── DB_07.asm │ │ ├── DB_C0.asm │ │ ├── DB_C8.asm │ │ ├── DB_D0.asm │ │ ├── DB_D8.asm │ │ ├── DB_E2.asm │ │ ├── DB_E3.asm │ │ ├── DB_E3_2.asm │ │ ├── DC_00.asm │ │ ├── DC_01.asm │ │ ├── DC_04.asm │ │ ├── DC_05.asm │ │ ├── DC_06.asm │ │ ├── DC_07.asm │ │ ├── DC_C0.asm │ │ ├── DC_C8.asm │ │ ├── DC_E0.asm │ │ ├── DC_E8.asm │ │ ├── DC_F0.asm │ │ ├── DC_F8.asm │ │ ├── DD_00.asm │ │ ├── DD_01.asm │ │ ├── DD_02.asm │ │ ├── DD_03.asm │ │ ├── DD_04.asm │ │ ├── DD_04_2.asm │ │ ├── DD_07.asm │ │ ├── DD_C0.asm │ │ ├── DD_D0.asm │ │ ├── DD_D8.asm │ │ ├── DE_00.asm │ │ ├── DE_01.asm │ │ ├── DE_04.asm │ │ ├── DE_05.asm │ │ ├── 
DE_06.asm │ │ ├── DE_07.asm │ │ ├── DE_C0.asm │ │ ├── DE_C8.asm │ │ ├── DE_E0.asm │ │ ├── DE_E8.asm │ │ ├── DE_F0.asm │ │ ├── DE_F8.asm │ │ ├── DF_00.asm │ │ ├── DF_01.asm │ │ ├── DF_02.asm │ │ ├── DF_03.asm │ │ ├── DF_05.asm │ │ ├── DF_07.asm │ │ ├── DF_C0.asm │ │ ├── DF_E0.asm │ │ ├── FST_AddrModes.asm │ │ ├── RoundingNeg.asm │ │ ├── RoundingPos.asm │ │ ├── invalid_div_zero.asm │ │ ├── invalid_fcos_infinity.asm │ │ ├── invalid_fist_nan.asm │ │ ├── invalid_fist_overflow.asm │ │ ├── invalid_fist_overflow_16bit.asm │ │ ├── invalid_fist_overflow_32bit.asm │ │ ├── invalid_fist_overflow_64bit.asm │ │ ├── invalid_fprem_infinity.asm │ │ ├── invalid_fptan_infinity.asm │ │ ├── invalid_fsin_infinity.asm │ │ ├── invalid_fsincos_infinity.asm │ │ ├── invalid_infinity_fsub_memory.asm │ │ ├── invalid_infinity_fsubr_infinity.asm │ │ ├── invalid_infinity_mul_zero.asm │ │ ├── invalid_infinity_ops.asm │ │ ├── invalid_infinity_sub_infinity.asm │ │ ├── invalid_neg_infinity_sub_neg_infinity.asm │ │ ├── invalid_reduced_precision.asm │ │ ├── invalid_simple_test.asm │ │ ├── invalid_sqrt_negative.asm │ │ ├── valid_fist_16bit.asm │ │ └── valid_operation.asm │ ├── arpl.asm │ └── arpl_2.asm ├── APITests/ │ ├── Allocator.cpp │ ├── ArgumentParser.cpp │ ├── CMakeLists.txt │ ├── ExtendedVolatileMetadata.cpp │ ├── FileMappingBaseAddress.cpp │ ├── Filesystem.cpp │ ├── InterruptableConditionVariable.cpp │ ├── StringUtils.cpp │ └── fextl_function.cpp ├── ASM/ │ ├── 3DNow/ │ │ ├── 0C.asm │ │ ├── 0D.asm │ │ ├── 0E.asm │ │ ├── 1C.asm │ │ ├── 1D.asm │ │ ├── 86.asm │ │ ├── 87.asm │ │ ├── 8A.asm │ │ ├── 8E.asm │ │ ├── 90.asm │ │ ├── 94.asm │ │ ├── 96.asm │ │ ├── 97.asm │ │ ├── 9A.asm │ │ ├── 9E.asm │ │ ├── A0.asm │ │ ├── A4.asm │ │ ├── A6.asm │ │ ├── A7.asm │ │ ├── AA.asm │ │ ├── AE.asm │ │ ├── B0.asm │ │ ├── B4.asm │ │ ├── B6.asm │ │ ├── B7.asm │ │ ├── BB.asm │ │ └── BF.asm │ ├── Atomics/ │ │ ├── adc_atomic16.asm │ │ ├── adc_atomic32.asm │ │ ├── adc_atomic64.asm │ │ ├── neg_atomic16.asm │ │ ├── 
neg_atomic32.asm │ │ ├── neg_atomic64.asm │ │ ├── not_atomic16.asm │ │ ├── not_atomic32.asm │ │ ├── not_atomic64.asm │ │ ├── sbb_atomic16.asm │ │ ├── sbb_atomic32.asm │ │ └── sbb_atomic64.asm │ ├── CALL.asm │ ├── CMakeLists.txt │ ├── ConstProp/ │ │ └── ConstPooling.asm │ ├── DAZTest.asm │ ├── Disabled_Tests │ ├── Disabled_Tests_ARMv8.0 │ ├── Disabled_Tests_ARMv8.2 │ ├── Disabled_Tests_ARMv8.4 │ ├── Disabled_Tests_Simulator │ ├── Disabled_Tests_host │ ├── Disabled_Tests_x64 │ ├── Displacement_Encoding.asm │ ├── FEX_bugs/ │ │ ├── 32bit_syscall.asm │ │ ├── 3DNow_ModRMSIBDecode.asm │ │ ├── BEXTR_flags.asm │ │ ├── BLSI_flags.asm │ │ ├── BLSMSK_flags.asm │ │ ├── BLSR_flags.asm │ │ ├── BT_flags.asm │ │ ├── BZHI_Sign.asm │ │ ├── BitConditionCheck.asm │ │ ├── Blake3.asm │ │ ├── BranchConditionCheck.asm │ │ ├── CodeBufferOverflow.asm │ │ ├── Divide32.asm │ │ ├── H0F3AREXBug.asm │ │ ├── IMUL_garbagedata_negative.asm │ │ ├── InitialPFFlag.asm │ │ ├── LargeRotatesForSmallSizes.asm │ │ ├── LargeRotatesForSmallSizes_More.asm │ │ ├── LoadAtBoundary_LowerPrecision.asm │ │ ├── LongSignedDivide.asm │ │ ├── LoopAddressSizeCheck.asm │ │ ├── MinMaxNaN.asm │ │ ├── MoveMerging.asm │ │ ├── NegativeCallAddressSizeOverride.asm │ │ ├── OptSizeConfusion.asm │ │ ├── PSRLDQBuf.asm │ │ ├── Push.asm │ │ ├── REX/ │ │ │ ├── 0F_38.asm │ │ │ ├── 0F_3A.asm │ │ │ ├── DDDNow.asm │ │ │ ├── Primary.asm │ │ │ ├── Primary_2.asm │ │ │ └── TwoByte.asm │ │ ├── RegCacheMMX.asm │ │ ├── SBCSmall.asm │ │ ├── SHRD_OF.asm │ │ ├── SIBScaleTranspose.asm │ │ ├── SegmentAddressOverride.asm │ │ ├── SelfPop.asm │ │ ├── ShiftConstantBug.asm │ │ ├── ShiftPF.asm │ │ ├── ShiftZeroFlagsUpdate.asm │ │ ├── SmallShiftFlags.asm │ │ ├── Test_CmpSelect_Merge.asm │ │ ├── Test_CmpSelect_Merge_Float.asm │ │ ├── Test_CmpSelect_Merge_Float_branch.asm │ │ ├── Test_CmpSelect_Merge_branch.asm │ │ ├── Test_JP.asm │ │ ├── Test_PF_Zero_Shift.asm │ │ ├── TrickyRA.asm │ │ ├── UnalignedLoadStoreSIGBUS.asm │ │ ├── VectorLoadCrash.asm │ │ ├── 
VectorShift_zero.asm │ │ ├── VectorShift_zero_256.asm │ │ ├── VectorShift_zero_avx_128.asm │ │ ├── X87MMXNZCV.asm │ │ ├── XeSS_quadratic.asm │ │ ├── adcx_size.asm │ │ ├── add_sub_carry.asm │ │ ├── add_sub_carry_2.asm │ │ ├── cmpxchg.asm │ │ ├── fnsave_fnrstor_size.asm │ │ ├── fxrstor_bug.asm │ │ ├── fxsave_bug.asm │ │ ├── issue5084_crossblock_const.asm │ │ ├── mmx_x87_register_conflating.asm │ │ ├── mov_address_size_override.asm │ │ ├── non_fatal_syscall.asm │ │ ├── nzcv_implicit_clobber.asm │ │ ├── nzcv_rmw.asm │ │ ├── nzcv_spill_enderlilies.asm │ │ ├── overlapping_memcpy_bug.asm │ │ ├── pcmpestri_garbage_rcx.asm │ │ ├── repeat_on_incdec.asm │ │ ├── repeat_stringops_crash.asm │ │ ├── rex_b_mmx.asm │ │ ├── rotate_zero_extend_with_zero.asm │ │ ├── sbbNZCVBug.asm │ │ ├── smallvectorload_regreg.asm │ │ ├── tls_vector_element.asm │ │ ├── vcvtdq2ps_incorrect_size.asm │ │ ├── vgather_xmm4.asm │ │ ├── vmov_size_test.asm │ │ ├── vroundscalar_sve256.asm │ │ ├── x87DecrementStackBug.asm │ │ ├── x87IncrementStackBug.asm │ │ ├── x87_fprem.asm │ │ ├── x87_integer_indefinite.asm │ │ ├── x87_unordered_cmp_fix.asm │ │ ├── xor_flags.asm │ │ └── zero-ah.asm │ ├── Flags/ │ │ └── Shift.asm │ ├── GameTests/ │ │ └── EnderLiliesFlash.asm │ ├── H0F38/ │ │ ├── 0_F0.asm │ │ ├── 0_F1.asm │ │ ├── 66_00.asm │ │ ├── 66_00_2.asm │ │ ├── 66_01.asm │ │ ├── 66_02.asm │ │ ├── 66_03.asm │ │ ├── 66_04.asm │ │ ├── 66_05.asm │ │ ├── 66_06.asm │ │ ├── 66_07.asm │ │ ├── 66_08.asm │ │ ├── 66_09.asm │ │ ├── 66_0A.asm │ │ ├── 66_0B.asm │ │ ├── 66_10.asm │ │ ├── 66_14.asm │ │ ├── 66_15.asm │ │ ├── 66_17.asm │ │ ├── 66_17_2.asm │ │ ├── 66_1C.asm │ │ ├── 66_1D.asm │ │ ├── 66_1E.asm │ │ ├── 66_20.asm │ │ ├── 66_21.asm │ │ ├── 66_22.asm │ │ ├── 66_23.asm │ │ ├── 66_24.asm │ │ ├── 66_25.asm │ │ ├── 66_28.asm │ │ ├── 66_29.asm │ │ ├── 66_2A.asm │ │ ├── 66_2B.asm │ │ ├── 66_30.asm │ │ ├── 66_31.asm │ │ ├── 66_32.asm │ │ ├── 66_33.asm │ │ ├── 66_34.asm │ │ ├── 66_35.asm │ │ ├── 66_37.asm │ │ ├── 66_38.asm │ │ ├── 
66_39.asm │ │ ├── 66_3A.asm │ │ ├── 66_3B.asm │ │ ├── 66_3C.asm │ │ ├── 66_3D.asm │ │ ├── 66_3E.asm │ │ ├── 66_3F.asm │ │ ├── 66_40.asm │ │ ├── 66_41.asm │ │ ├── 66_DB.asm │ │ ├── 66_DC.asm │ │ ├── 66_DD.asm │ │ ├── 66_DE.asm │ │ ├── 66_DF.asm │ │ ├── 66_F0.asm │ │ ├── 66_F0_2.asm │ │ ├── 66_F1.asm │ │ ├── 66_F1_2.asm │ │ ├── 66_F1_3.asm │ │ ├── F2_F0.asm │ │ ├── F2_F1.asm │ │ ├── XX_00.asm │ │ ├── XX_00_2.asm │ │ ├── XX_01.asm │ │ ├── XX_02.asm │ │ ├── XX_03.asm │ │ ├── XX_04.asm │ │ ├── XX_05.asm │ │ ├── XX_06.asm │ │ ├── XX_07.asm │ │ ├── XX_08.asm │ │ ├── XX_09.asm │ │ ├── XX_0A.asm │ │ ├── XX_0B.asm │ │ ├── XX_1C.asm │ │ ├── XX_1D.asm │ │ ├── XX_1E.asm │ │ ├── adcx.asm │ │ ├── adox.asm │ │ ├── sha1msg1.asm │ │ ├── sha1msg2.asm │ │ ├── sha1nexte.asm │ │ ├── sha256msg1.asm │ │ ├── sha256msg2.asm │ │ └── sha256rnds2.asm │ ├── H0F3A/ │ │ ├── 0_66_0F.asm │ │ ├── 0_66_21.asm │ │ ├── 0_66_DF.asm │ │ ├── 0_XX_0F.asm │ │ ├── 66_08.asm │ │ ├── 66_09.asm │ │ ├── 66_0A.asm │ │ ├── 66_0B.asm │ │ ├── 66_0C.asm │ │ ├── 66_0D.asm │ │ ├── 66_0E.asm │ │ ├── 66_14.asm │ │ ├── 66_14_2.asm │ │ ├── 66_15.asm │ │ ├── 66_16.asm │ │ ├── 66_16_1.asm │ │ ├── 66_17.asm │ │ ├── 66_20.asm │ │ ├── 66_20_1.asm │ │ ├── 66_22.asm │ │ ├── 66_22_1.asm │ │ ├── 66_22_2.asm │ │ ├── 66_40.asm │ │ ├── 66_40_2.asm │ │ ├── 66_41.asm │ │ ├── 66_41_2.asm │ │ ├── 66_42.asm │ │ ├── pclmulqdq.asm │ │ ├── pcmpestri_equal_any.asm │ │ ├── pcmpestri_equal_each.asm │ │ ├── pcmpestri_equal_ordered.asm │ │ ├── pcmpestri_ranges.asm │ │ ├── pcmpestrm_equal_any.asm │ │ ├── pcmpestrm_equal_each.asm │ │ ├── pcmpestrm_equal_ordered.asm │ │ ├── pcmpestrm_ranges.asm │ │ ├── pcmpistri_equal_any.asm │ │ ├── pcmpistri_equal_each.asm │ │ ├── pcmpistri_equal_ordered.asm │ │ ├── pcmpistri_ranges.asm │ │ ├── pcmpistrm_equal_any.asm │ │ ├── pcmpistrm_equal_each.asm │ │ ├── pcmpistrm_equal_ordered.asm │ │ ├── pcmpistrm_ranges.asm │ │ └── sha1rnds4.asm │ ├── Includes/ │ │ ├── checkprecision.mac │ │ ├── modrm_oob_macros.mac │ │ ├── 
x87cw.mac │ │ └── xsave_macros.mac │ ├── JMP.asm │ ├── Known_Failures │ ├── Known_Failures_host │ ├── Known_Failures_jit │ ├── MOVHPD.asm │ ├── MemoryData.asm │ ├── Multiblock/ │ │ └── ReachableInvalidCode.asm │ ├── OpSize/ │ │ ├── 15_BYTE.asm │ │ ├── 66_10.asm │ │ ├── 66_11.asm │ │ ├── 66_12.asm │ │ ├── 66_13.asm │ │ ├── 66_14.asm │ │ ├── 66_15.asm │ │ ├── 66_16.asm │ │ ├── 66_17.asm │ │ ├── 66_28.asm │ │ ├── 66_29.asm │ │ ├── 66_2A.asm │ │ ├── 66_2B.asm │ │ ├── 66_2C.asm │ │ ├── 66_2D.asm │ │ ├── 66_2E.asm │ │ ├── 66_2F.asm │ │ ├── 66_50.asm │ │ ├── 66_51.asm │ │ ├── 66_54.asm │ │ ├── 66_55.asm │ │ ├── 66_56.asm │ │ ├── 66_57.asm │ │ ├── 66_58.asm │ │ ├── 66_59.asm │ │ ├── 66_5A.asm │ │ ├── 66_5A_1.asm │ │ ├── 66_5B.asm │ │ ├── 66_5B_1.asm │ │ ├── 66_5C.asm │ │ ├── 66_5D.asm │ │ ├── 66_5E.asm │ │ ├── 66_5F.asm │ │ ├── 66_60.asm │ │ ├── 66_61.asm │ │ ├── 66_62.asm │ │ ├── 66_63.asm │ │ ├── 66_64.asm │ │ ├── 66_65.asm │ │ ├── 66_66.asm │ │ ├── 66_67.asm │ │ ├── 66_68.asm │ │ ├── 66_69.asm │ │ ├── 66_6A.asm │ │ ├── 66_6B.asm │ │ ├── 66_6C.asm │ │ ├── 66_6D.asm │ │ ├── 66_6E.asm │ │ ├── 66_6F.asm │ │ ├── 66_70.asm │ │ ├── 66_74.asm │ │ ├── 66_75.asm │ │ ├── 66_76.asm │ │ ├── 66_7C.asm │ │ ├── 66_7D.asm │ │ ├── 66_7E.asm │ │ ├── 66_7F.asm │ │ ├── 66_C2.asm │ │ ├── 66_C4.asm │ │ ├── 66_C4_2.asm │ │ ├── 66_C5.asm │ │ ├── 66_C5_2.asm │ │ ├── 66_C6.asm │ │ ├── 66_D0.asm │ │ ├── 66_D1.asm │ │ ├── 66_D2.asm │ │ ├── 66_D3.asm │ │ ├── 66_D4.asm │ │ ├── 66_D5.asm │ │ ├── 66_D6.asm │ │ ├── 66_D7.asm │ │ ├── 66_D8.asm │ │ ├── 66_D9.asm │ │ ├── 66_DA.asm │ │ ├── 66_DB.asm │ │ ├── 66_DC.asm │ │ ├── 66_DD.asm │ │ ├── 66_DE.asm │ │ ├── 66_DF.asm │ │ ├── 66_E0.asm │ │ ├── 66_E1.asm │ │ ├── 66_E2.asm │ │ ├── 66_E3.asm │ │ ├── 66_E4.asm │ │ ├── 66_E5.asm │ │ ├── 66_E6.asm │ │ ├── 66_E6_1.asm │ │ ├── 66_E7.asm │ │ ├── 66_E8.asm │ │ ├── 66_E9.asm │ │ ├── 66_EA.asm │ │ ├── 66_EB.asm │ │ ├── 66_EC.asm │ │ ├── 66_ED.asm │ │ ├── 66_EE.asm │ │ ├── 66_EF.asm │ │ ├── 66_F1.asm │ │ ├── 66_F2.asm 
│ │ ├── 66_F3.asm │ │ ├── 66_F4.asm │ │ ├── 66_F5.asm │ │ ├── 66_F6.asm │ │ ├── 66_F7.asm │ │ ├── 66_F8.asm │ │ ├── 66_F9.asm │ │ ├── 66_FA.asm │ │ ├── 66_FB.asm │ │ ├── 66_FC.asm │ │ ├── 66_FD.asm │ │ └── 66_FE.asm │ ├── Primary/ │ │ ├── Pause.asm │ │ ├── Primary_00.asm │ │ ├── Primary_01_Atomic16.asm │ │ ├── Primary_01_Atomic32.asm │ │ ├── Primary_01_Atomic64.asm │ │ ├── Primary_08.asm │ │ ├── Primary_09_Atomic16.asm │ │ ├── Primary_09_Atomic32.asm │ │ ├── Primary_09_Atomic64.asm │ │ ├── Primary_10.asm │ │ ├── Primary_10_2.asm │ │ ├── Primary_10_3.asm │ │ ├── Primary_10_4.asm │ │ ├── Primary_18.asm │ │ ├── Primary_18_2.asm │ │ ├── Primary_18_3.asm │ │ ├── Primary_18_4.asm │ │ ├── Primary_20.asm │ │ ├── Primary_23_Atomic16.asm │ │ ├── Primary_23_Atomic32.asm │ │ ├── Primary_23_Atomic64.asm │ │ ├── Primary_28.asm │ │ ├── Primary_29_Atomic16.asm │ │ ├── Primary_29_Atomic32.asm │ │ ├── Primary_29_Atomic64.asm │ │ ├── Primary_30.asm │ │ ├── Primary_31_Atomic16.asm │ │ ├── Primary_31_Atomic32.asm │ │ ├── Primary_31_Atomic64.asm │ │ ├── Primary_38.asm │ │ ├── Primary_39.asm │ │ ├── Primary_3A.asm │ │ ├── Primary_3B.asm │ │ ├── Primary_3C.asm │ │ ├── Primary_3D.asm │ │ ├── Primary_50.asm │ │ ├── Primary_50_2.asm │ │ ├── Primary_63.asm │ │ ├── Primary_63_2.asm │ │ ├── Primary_68.asm │ │ ├── Primary_69.asm │ │ ├── Primary_6A.asm │ │ ├── Primary_6A_2.asm │ │ ├── Primary_6B.asm │ │ ├── Primary_84.asm │ │ ├── Primary_84_2.asm │ │ ├── Primary_85.asm │ │ ├── Primary_86.asm │ │ ├── Primary_87.asm │ │ ├── Primary_87_2.asm │ │ ├── Primary_87_3.asm │ │ ├── Primary_87_Atomic16.asm │ │ ├── Primary_87_Atomic32.asm │ │ ├── Primary_87_Atomic64.asm │ │ ├── Primary_8C.asm │ │ ├── Primary_8C_2.asm │ │ ├── Primary_8D.asm │ │ ├── Primary_8D_2.asm │ │ ├── Primary_90.asm │ │ ├── Primary_90_2.asm │ │ ├── Primary_90_3.asm │ │ ├── Primary_90_4.asm │ │ ├── Primary_98.asm │ │ ├── Primary_98_2.asm │ │ ├── Primary_99.asm │ │ ├── Primary_99_2.asm │ │ ├── Primary_9B.asm │ │ ├── Primary_9C.asm │ │ ├── 
Primary_9D.asm │ │ ├── Primary_9E.asm │ │ ├── Primary_A0.asm │ │ ├── Primary_A2.asm │ │ ├── Primary_A4.asm │ │ ├── Primary_A4_REP.asm │ │ ├── Primary_A4_REPNE.asm │ │ ├── Primary_A4_REPNE_Down.asm │ │ ├── Primary_A4_REPNE_many.asm │ │ ├── Primary_A4_REP_Down.asm │ │ ├── Primary_A4_REP_Down_Overlapping.asm │ │ ├── Primary_A4_REP_Overlapping.asm │ │ ├── Primary_A4_REP_many.asm │ │ ├── Primary_A5.asm │ │ ├── Primary_A5_REP.asm │ │ ├── Primary_A5_REPNE.asm │ │ ├── Primary_A5_REPNE_Down.asm │ │ ├── Primary_A5_REP_Down.asm │ │ ├── Primary_A5_dword.asm │ │ ├── Primary_A5_dword_REP.asm │ │ ├── Primary_A5_dword_REPNE.asm │ │ ├── Primary_A5_dword_REPNE_Down.asm │ │ ├── Primary_A5_dword_REP_Down.asm │ │ ├── Primary_A5_qword.asm │ │ ├── Primary_A5_qword_REP.asm │ │ ├── Primary_A5_qword_REPNE.asm │ │ ├── Primary_A5_qword_REPNE_Down.asm │ │ ├── Primary_A5_qword_REP_Down.asm │ │ ├── Primary_A6.asm │ │ ├── Primary_A6_REP.asm │ │ ├── Primary_A6_REPNE.asm │ │ ├── Primary_A6_REPNE_Equal.asm │ │ ├── Primary_A6_REP_Equal.asm │ │ ├── Primary_A6_REP_Smaller.asm │ │ ├── Primary_A6_REP_addrmod.asm │ │ ├── Primary_A6_REP_down.asm │ │ ├── Primary_A6_REP_down_Equal.asm │ │ ├── Primary_A6_addrmod.asm │ │ ├── Primary_A6_down.asm │ │ ├── Primary_A7_dword.asm │ │ ├── Primary_A7_dword_down.asm │ │ ├── Primary_A7_qword.asm │ │ ├── Primary_A7_qword_down.asm │ │ ├── Primary_A7_word.asm │ │ ├── Primary_A7_word_down.asm │ │ ├── Primary_A8.asm │ │ ├── Primary_A9.asm │ │ ├── Primary_AA.asm │ │ ├── Primary_AA_REP.asm │ │ ├── Primary_AA_REPNE.asm │ │ ├── Primary_AA_REPNE_down.asm │ │ ├── Primary_AA_REP_down.asm │ │ ├── Primary_AB_dword.asm │ │ ├── Primary_AB_dword_REP.asm │ │ ├── Primary_AB_dword_REPNE.asm │ │ ├── Primary_AB_dword_REPNE_down.asm │ │ ├── Primary_AB_dword_REP_down.asm │ │ ├── Primary_AB_qword.asm │ │ ├── Primary_AB_qword_REP.asm │ │ ├── Primary_AB_qword_REPNE.asm │ │ ├── Primary_AB_qword_REPNE_down.asm │ │ ├── Primary_AB_qword_REP_down.asm │ │ ├── Primary_AB_word.asm │ │ ├── 
Primary_AB_word_REP.asm │ │ ├── Primary_AB_word_REPNE.asm │ │ ├── Primary_AB_word_REPNE_down.asm │ │ ├── Primary_AB_word_REP_down.asm │ │ ├── Primary_AC.asm │ │ ├── Primary_AC_REP.asm │ │ ├── Primary_AC_REPNE.asm │ │ ├── Primary_AC_REPNE_down.asm │ │ ├── Primary_AC_REP_down.asm │ │ ├── Primary_AD_REPNE_dword.asm │ │ ├── Primary_AD_REPNE_dword_down.asm │ │ ├── Primary_AD_REPNE_qword.asm │ │ ├── Primary_AD_REPNE_qword_down.asm │ │ ├── Primary_AD_REPNE_word.asm │ │ ├── Primary_AD_REPNE_word_down.asm │ │ ├── Primary_AD_REP_dword.asm │ │ ├── Primary_AD_REP_dword_down.asm │ │ ├── Primary_AD_REP_qword.asm │ │ ├── Primary_AD_REP_qword_down.asm │ │ ├── Primary_AD_REP_word.asm │ │ ├── Primary_AD_REP_word_down.asm │ │ ├── Primary_AD_dword.asm │ │ ├── Primary_AD_qword.asm │ │ ├── Primary_AD_word.asm │ │ ├── Primary_AE.asm │ │ ├── Primary_AE_REP.asm │ │ ├── Primary_AE_REPNE.asm │ │ ├── Primary_AE_REPNE_down.asm │ │ ├── Primary_AE_REP_down.asm │ │ ├── Primary_AE_addrmod.asm │ │ ├── Primary_AF_REP_dword.asm │ │ ├── Primary_AF_REP_qword.asm │ │ ├── Primary_AF_REP_word.asm │ │ ├── Primary_B0.asm │ │ ├── Primary_B8.asm │ │ ├── Primary_B8_2.asm │ │ ├── Primary_B8_3.asm │ │ ├── Primary_C2.asm │ │ ├── Primary_C3.asm │ │ ├── Primary_C8.asm │ │ ├── Primary_C8_2.asm │ │ ├── Primary_C8_o16.asm │ │ ├── Primary_C9.asm │ │ ├── Primary_C9_o16.asm │ │ ├── Primary_CF.asm │ │ ├── Primary_D7.asm │ │ ├── Primary_E0.asm │ │ ├── Primary_E1.asm │ │ ├── Primary_E2.asm │ │ ├── Primary_E3.asm │ │ ├── Primary_E8.asm │ │ ├── Primary_E9.asm │ │ ├── Primary_EB.asm │ │ ├── Primary_F5.asm │ │ ├── Primary_F8.asm │ │ ├── Primary_F9.asm │ │ ├── Primary_FC.asm │ │ ├── Primary_FD.asm │ │ ├── Primary_FF_0_Atomic16.asm │ │ ├── Primary_FF_0_Atomic32.asm │ │ ├── Primary_FF_0_Atomic64.asm │ │ ├── Primary_FF_1_Atomic16.asm │ │ ├── Primary_FF_1_Atomic32.asm │ │ ├── Primary_FF_1_Atomic64.asm │ │ ├── ROL_Flags.asm │ │ ├── ROL_OF.asm │ │ ├── ROR_Flags.asm │ │ ├── ROR_OF.asm │ │ ├── SHL.asm │ │ └── SHR.asm │ ├── PrimaryGroup/ 
│ │ ├── 1_80_00.asm │ │ ├── 1_80_01.asm │ │ ├── 1_80_02.asm │ │ ├── 1_80_02_2.asm │ │ ├── 1_80_03.asm │ │ ├── 1_80_03_2.asm │ │ ├── 1_80_04.asm │ │ ├── 1_80_05.asm │ │ ├── 1_80_06.asm │ │ ├── 1_80_07.asm │ │ ├── 1_81_00.asm │ │ ├── 1_81_01.asm │ │ ├── 1_81_02.asm │ │ ├── 1_81_02_2.asm │ │ ├── 1_81_03.asm │ │ ├── 1_81_03_2.asm │ │ ├── 1_81_04.asm │ │ ├── 1_81_05.asm │ │ ├── 1_81_06.asm │ │ ├── 1_81_07.asm │ │ ├── 1_83_00.asm │ │ ├── 1_83_01.asm │ │ ├── 1_83_02.asm │ │ ├── 1_83_02_2.asm │ │ ├── 1_83_03.asm │ │ ├── 1_83_03_2.asm │ │ ├── 1_83_04.asm │ │ ├── 1_83_05.asm │ │ ├── 1_83_06.asm │ │ ├── 1_83_07.asm │ │ ├── 2_C0_00.asm │ │ ├── 2_C0_01.asm │ │ ├── 2_C0_02.asm │ │ ├── 2_C0_02_2.asm │ │ ├── 2_C0_02_3.asm │ │ ├── 2_C0_02_4.asm │ │ ├── 2_C0_03.asm │ │ ├── 2_C0_03_2.asm │ │ ├── 2_C0_03_3.asm │ │ ├── 2_C0_03_4.asm │ │ ├── 2_C0_04.asm │ │ ├── 2_C0_05.asm │ │ ├── 2_C0_07.asm │ │ ├── 2_C0_07_2.asm │ │ ├── 2_C1_00.asm │ │ ├── 2_C1_01.asm │ │ ├── 2_C1_04.asm │ │ ├── 2_C1_05.asm │ │ ├── 2_C1_05_2.asm │ │ ├── 2_C1_07.asm │ │ ├── 2_D0_00.asm │ │ ├── 2_D0_01.asm │ │ ├── 2_D0_02.asm │ │ ├── 2_D0_02_2.asm │ │ ├── 2_D0_03.asm │ │ ├── 2_D0_03_2.asm │ │ ├── 2_D0_04.asm │ │ ├── 2_D0_05.asm │ │ ├── 2_D0_07.asm │ │ ├── 2_D1_00.asm │ │ ├── 2_D1_01.asm │ │ ├── 2_D1_02.asm │ │ ├── 2_D1_02_2.asm │ │ ├── 2_D1_02_3.asm │ │ ├── 2_D1_02_4.asm │ │ ├── 2_D1_02_5.asm │ │ ├── 2_D1_02_6.asm │ │ ├── 2_D1_03.asm │ │ ├── 2_D1_03_2.asm │ │ ├── 2_D1_03_3.asm │ │ ├── 2_D1_03_4.asm │ │ ├── 2_D1_03_5.asm │ │ ├── 2_D1_03_6.asm │ │ ├── 2_D1_04.asm │ │ ├── 2_D1_05.asm │ │ ├── 2_D1_07.asm │ │ ├── 2_D1_07_2.asm │ │ ├── 2_D2_02.asm │ │ ├── 2_D2_02_2.asm │ │ ├── 2_D2_02_3.asm │ │ ├── 2_D2_03.asm │ │ ├── 2_D2_03_2.asm │ │ ├── 2_D2_03_3.asm │ │ ├── 2_D3_00.asm │ │ ├── 2_D3_00_2.asm │ │ ├── 2_D3_00_3.asm │ │ ├── 2_D3_01.asm │ │ ├── 2_D3_01_2.asm │ │ ├── 2_D3_01_3.asm │ │ ├── 2_D3_02.asm │ │ ├── 2_D3_02_2.asm │ │ ├── 2_D3_02_3.asm │ │ ├── 2_D3_02_4.asm │ │ ├── 2_D3_02_5.asm │ │ ├── 2_D3_03.asm │ │ ├── 2_D3_03_2.asm 
│ │ ├── 2_D3_03_3.asm │ │ ├── 2_D3_03_4.asm │ │ ├── 2_D3_03_5.asm │ │ ├── 2_D3_03_6.asm │ │ ├── 2_D3_03_7.asm │ │ ├── 2_D3_03_8.asm │ │ ├── 2_D3_04.asm │ │ ├── 2_D3_05.asm │ │ ├── 2_D3_07.asm │ │ ├── 2_D3_07_2.asm │ │ ├── 3_F6_00.asm │ │ ├── 3_F6_02.asm │ │ ├── 3_F6_02_2.asm │ │ ├── 3_F6_03.asm │ │ ├── 3_F6_03_2.asm │ │ ├── 3_F6_04.asm │ │ ├── 3_F6_05.asm │ │ ├── 3_F6_05_2.asm │ │ ├── 3_F6_05_3.asm │ │ ├── 3_F6_05_4.asm │ │ ├── 3_F6_05_5.asm │ │ ├── 3_F6_06.asm │ │ ├── 3_F6_07.asm │ │ ├── 3_F6_07_2.asm │ │ ├── 3_F7_00.asm │ │ ├── 3_F7_00_2.asm │ │ ├── 3_F7_02.asm │ │ ├── 3_F7_02_2.asm │ │ ├── 3_F7_02_3.asm │ │ ├── 3_F7_03.asm │ │ ├── 3_F7_03_2.asm │ │ ├── 3_F7_04.asm │ │ ├── 3_F7_05.asm │ │ ├── 3_F7_05_2.asm │ │ ├── 3_F7_06.asm │ │ ├── 3_F7_06_2.asm │ │ ├── 3_F7_07.asm │ │ ├── 3_F7_07_2.asm │ │ ├── 4_FE_00.asm │ │ ├── 4_FE_01.asm │ │ ├── 5_FF_00.asm │ │ ├── 5_FF_00_2.asm │ │ ├── 5_FF_00_3.asm │ │ ├── 5_FF_01.asm │ │ ├── 5_FF_01_2.asm │ │ ├── 5_FF_01_3.asm │ │ ├── 5_FF_02.asm │ │ ├── 5_FF_04.asm │ │ ├── 5_FF_05.asm │ │ ├── 5_FF_05_03_o32.asm │ │ ├── 5_FF_05_03_o32_imm.asm │ │ ├── 5_FF_05_03_o64.asm │ │ ├── 5_FF_05_03_o64_imm.asm │ │ ├── 5_FF_06.asm │ │ ├── 6_C6_00.asm │ │ └── 6_C7_00.asm │ ├── REP/ │ │ ├── F3_10.asm │ │ ├── F3_10_1.asm │ │ ├── F3_11.asm │ │ ├── F3_11_1.asm │ │ ├── F3_12.asm │ │ ├── F3_16.asm │ │ ├── F3_2A.asm │ │ ├── F3_2A_1.asm │ │ ├── F3_2A_2.asm │ │ ├── F3_2B.asm │ │ ├── F3_2C.asm │ │ ├── F3_2D.asm │ │ ├── F3_51.asm │ │ ├── F3_52.asm │ │ ├── F3_52_2.asm │ │ ├── F3_53.asm │ │ ├── F3_58.asm │ │ ├── F3_59.asm │ │ ├── F3_5A.asm │ │ ├── F3_5A_1.asm │ │ ├── F3_5B.asm │ │ ├── F3_5B_1.asm │ │ ├── F3_5C.asm │ │ ├── F3_5D.asm │ │ ├── F3_5E.asm │ │ ├── F3_5F.asm │ │ ├── F3_6F.asm │ │ ├── F3_70.asm │ │ ├── F3_7E.asm │ │ ├── F3_7F.asm │ │ ├── F3_B8.asm │ │ ├── F3_BC.asm │ │ ├── F3_BD.asm │ │ ├── F3_BD_2.asm │ │ ├── F3_BD_3.asm │ │ ├── F3_BD_4.asm │ │ ├── F3_C2.asm │ │ ├── F3_D6.asm │ │ ├── F3_E6.asm │ │ └── F3_E6_1.asm │ ├── REPNE/ │ │ ├── F2_10.asm │ │ ├── 
F2_11.asm │ │ ├── F2_12.asm │ │ ├── F2_2A.asm │ │ ├── F2_2A_1.asm │ │ ├── F2_2B.asm │ │ ├── F2_2C.asm │ │ ├── F2_2D.asm │ │ ├── F2_2D_1.asm │ │ ├── F2_51.asm │ │ ├── F2_58.asm │ │ ├── F2_59.asm │ │ ├── F2_5A.asm │ │ ├── F2_5A_1.asm │ │ ├── F2_5C.asm │ │ ├── F2_5D.asm │ │ ├── F2_5E.asm │ │ ├── F2_5F.asm │ │ ├── F2_70.asm │ │ ├── F2_7C.asm │ │ ├── F2_7D.asm │ │ ├── F2_C2.asm │ │ ├── F2_D0.asm │ │ ├── F2_D6.asm │ │ ├── F2_E6.asm │ │ ├── F2_E6_1.asm │ │ └── F2_F0.asm │ ├── SSE4a/ │ │ ├── extrq_imm.asm │ │ ├── extrq_variable.asm │ │ ├── insertq_imm.asm │ │ └── insertq_variable.asm │ ├── STOS.asm │ ├── STOSQ.asm │ ├── STOSQ2.asm │ ├── STOSQ2_REPNE.asm │ ├── STOSQ_REPNE.asm │ ├── STOS_REPNE.asm │ ├── Secondary/ │ │ ├── 07_XX_00.asm │ │ ├── 07_XX_04.asm │ │ ├── 08_66_04.asm │ │ ├── 08_66_04_2.asm │ │ ├── 08_F2_04.asm │ │ ├── 08_F2_04_2.asm │ │ ├── 08_F2_07.asm │ │ ├── 08_F3_04.asm │ │ ├── 08_F3_04_2.asm │ │ ├── 08_XX_04.asm │ │ ├── 08_XX_04_2.asm │ │ ├── 08_XX_04_3.asm │ │ ├── 08_XX_05.asm │ │ ├── 08_XX_05_2.asm │ │ ├── 08_XX_05_3.asm │ │ ├── 08_XX_05_3_Atomic.asm │ │ ├── 08_XX_05_Atomic.asm │ │ ├── 08_XX_06.asm │ │ ├── 08_XX_06_2.asm │ │ ├── 08_XX_06_3.asm │ │ ├── 08_XX_06_3_Atomic.asm │ │ ├── 08_XX_06_Atomic.asm │ │ ├── 08_XX_07.asm │ │ ├── 08_XX_07_2.asm │ │ ├── 08_XX_07_3.asm │ │ ├── 08_XX_07_3_Atomic.asm │ │ ├── 08_XX_07_Atomic.asm │ │ ├── 09_F3_07.asm │ │ ├── 09_XX_01.asm │ │ ├── 09_XX_01_10.asm │ │ ├── 09_XX_01_11.asm │ │ ├── 09_XX_01_12.asm │ │ ├── 09_XX_01_13.asm │ │ ├── 09_XX_01_14.asm │ │ ├── 09_XX_01_15.asm │ │ ├── 09_XX_01_16.asm │ │ ├── 09_XX_01_17.asm │ │ ├── 09_XX_01_18.asm │ │ ├── 09_XX_01_19.asm │ │ ├── 09_XX_01_2.asm │ │ ├── 09_XX_01_3.asm │ │ ├── 09_XX_01_4.asm │ │ ├── 09_XX_01_5.asm │ │ ├── 09_XX_01_6.asm │ │ ├── 09_XX_01_7.asm │ │ ├── 09_XX_01_8.asm │ │ ├── 09_XX_01_9.asm │ │ ├── 09_XX_06.asm │ │ ├── 09_XX_07.asm │ │ ├── 12_66_02.asm │ │ ├── 12_66_04.asm │ │ ├── 12_66_06.asm │ │ ├── 13_66_02.asm │ │ ├── 13_66_04.asm │ │ ├── 13_66_06.asm │ │ ├── 
14_66_02.asm │ │ ├── 14_66_06.asm │ │ ├── 14_66_07.asm │ │ ├── 14_XX_02.asm │ │ ├── 15_F3_00.asm │ │ ├── 15_F3_01.asm │ │ ├── 15_F3_02.asm │ │ ├── 15_F3_02_2.asm │ │ ├── 15_F3_03.asm │ │ ├── 15_F3_03_2.asm │ │ ├── 15_XX_0.asm │ │ ├── 15_XX_5.asm │ │ ├── 15_XX_6.asm │ │ ├── 15_XX_7.asm │ │ ├── 15_XX_7_2.asm │ │ ├── CLFLUSHOPT.asm │ │ ├── CLWB.asm │ │ ├── Prefetch.asm │ │ ├── shufps_optimization.asm │ │ ├── shufps_optimization_2.asm │ │ └── xsave/ │ │ ├── xsave.asm │ │ ├── xsave_avx.asm │ │ ├── xsave_avx_x87.asm │ │ ├── xsave_sse.asm │ │ └── xsave_x87.asm │ ├── SecondaryModRM/ │ │ ├── Reg_2_0.asm │ │ ├── Reg_7_1.asm │ │ ├── Reg_7_4.asm │ │ └── Reg_7_4_2.asm │ ├── SelfModifyingCode/ │ │ ├── Delinking.asm │ │ ├── DifferentBlock.asm │ │ └── SameBlock.asm │ ├── TwoByte/ │ │ ├── 0F_02.asm │ │ ├── 0F_0E.asm │ │ ├── 0F_10.asm │ │ ├── 0F_10_2.asm │ │ ├── 0F_11.asm │ │ ├── 0F_12.asm │ │ ├── 0F_13.asm │ │ ├── 0F_13_2.asm │ │ ├── 0F_14.asm │ │ ├── 0F_15.asm │ │ ├── 0F_16.asm │ │ ├── 0F_17.asm │ │ ├── 0F_19.asm │ │ ├── 0F_28.asm │ │ ├── 0F_29.asm │ │ ├── 0F_2A.asm │ │ ├── 0F_2B.asm │ │ ├── 0F_2C.asm │ │ ├── 0F_2D.asm │ │ ├── 0F_2E.asm │ │ ├── 0F_2F.asm │ │ ├── 0F_31.asm │ │ ├── 0F_40.asm │ │ ├── 0F_41.asm │ │ ├── 0F_42.asm │ │ ├── 0F_43.asm │ │ ├── 0F_44.asm │ │ ├── 0F_45.asm │ │ ├── 0F_46.asm │ │ ├── 0F_47.asm │ │ ├── 0F_48.asm │ │ ├── 0F_49.asm │ │ ├── 0F_4A.asm │ │ ├── 0F_4B.asm │ │ ├── 0F_4C.asm │ │ ├── 0F_4D.asm │ │ ├── 0F_4E.asm │ │ ├── 0F_4F.asm │ │ ├── 0F_50.asm │ │ ├── 0F_51.asm │ │ ├── 0F_52.asm │ │ ├── 0F_53.asm │ │ ├── 0F_54.asm │ │ ├── 0F_55.asm │ │ ├── 0F_56.asm │ │ ├── 0F_57.asm │ │ ├── 0F_58.asm │ │ ├── 0F_59.asm │ │ ├── 0F_5A.asm │ │ ├── 0F_5A_1.asm │ │ ├── 0F_5B.asm │ │ ├── 0F_5B_1.asm │ │ ├── 0F_5C.asm │ │ ├── 0F_5D.asm │ │ ├── 0F_5E.asm │ │ ├── 0F_5F.asm │ │ ├── 0F_60.asm │ │ ├── 0F_61.asm │ │ ├── 0F_62.asm │ │ ├── 0F_63.asm │ │ ├── 0F_64.asm │ │ ├── 0F_65.asm │ │ ├── 0F_66.asm │ │ ├── 0F_67.asm │ │ ├── 0F_68.asm │ │ ├── 0F_69.asm │ │ ├── 0F_6A.asm │ │ ├── 
0F_6B.asm │ │ ├── 0F_6E.asm │ │ ├── 0F_6E_2.asm │ │ ├── 0F_6F.asm │ │ ├── 0F_70.asm │ │ ├── 0F_74.asm │ │ ├── 0F_75.asm │ │ ├── 0F_76.asm │ │ ├── 0F_77.asm │ │ ├── 0F_7E.asm │ │ ├── 0F_7F.asm │ │ ├── 0F_80.asm │ │ ├── 0F_81.asm │ │ ├── 0F_82.asm │ │ ├── 0F_83.asm │ │ ├── 0F_84.asm │ │ ├── 0F_85.asm │ │ ├── 0F_86.asm │ │ ├── 0F_87.asm │ │ ├── 0F_88.asm │ │ ├── 0F_89.asm │ │ ├── 0F_8A.asm │ │ ├── 0F_8B.asm │ │ ├── 0F_8B_16.asm │ │ ├── 0F_8B_32.asm │ │ ├── 0F_8B_64.asm │ │ ├── 0F_8C.asm │ │ ├── 0F_8D.asm │ │ ├── 0F_8E.asm │ │ ├── 0F_8F.asm │ │ ├── 0F_90.asm │ │ ├── 0F_91.asm │ │ ├── 0F_92.asm │ │ ├── 0F_93.asm │ │ ├── 0F_94.asm │ │ ├── 0F_95.asm │ │ ├── 0F_96.asm │ │ ├── 0F_97.asm │ │ ├── 0F_98.asm │ │ ├── 0F_99.asm │ │ ├── 0F_9A.asm │ │ ├── 0F_9B.asm │ │ ├── 0F_9C.asm │ │ ├── 0F_9D.asm │ │ ├── 0F_9E.asm │ │ ├── 0F_9F.asm │ │ ├── 0F_A2.asm │ │ ├── 0F_A3.asm │ │ ├── 0F_A3_2.asm │ │ ├── 0F_A4.asm │ │ ├── 0F_A4_2.asm │ │ ├── 0F_A5.asm │ │ ├── 0F_A5_2.asm │ │ ├── 0F_A5_3.asm │ │ ├── 0F_A5_4.asm │ │ ├── 0F_A5_5.asm │ │ ├── 0F_A5_6.asm │ │ ├── 0F_A5_7.asm │ │ ├── 0F_AB.asm │ │ ├── 0F_AB_2.asm │ │ ├── 0F_AB_2_Atomic.asm │ │ ├── 0F_AB_Atomic.asm │ │ ├── 0F_AC.asm │ │ ├── 0F_AC_2.asm │ │ ├── 0F_AD.asm │ │ ├── 0F_AD_2.asm │ │ ├── 0F_AD_3.asm │ │ ├── 0F_AD_4.asm │ │ ├── 0F_AD_5.asm │ │ ├── 0F_AD_6.asm │ │ ├── 0F_AD_7.asm │ │ ├── 0F_AF.asm │ │ ├── 0F_AF_2.asm │ │ ├── 0F_B0.asm │ │ ├── 0F_B0_10.asm │ │ ├── 0F_B0_11.asm │ │ ├── 0F_B0_2.asm │ │ ├── 0F_B0_3.asm │ │ ├── 0F_B0_4.asm │ │ ├── 0F_B0_5.asm │ │ ├── 0F_B0_6.asm │ │ ├── 0F_B0_7.asm │ │ ├── 0F_B0_8.asm │ │ ├── 0F_B0_9.asm │ │ ├── 0F_B3.asm │ │ ├── 0F_B3_2.asm │ │ ├── 0F_B3_2_Atomic.asm │ │ ├── 0F_B3_Atomic.asm │ │ ├── 0F_B6.asm │ │ ├── 0F_B7.asm │ │ ├── 0F_BB.asm │ │ ├── 0F_BB_2.asm │ │ ├── 0F_BB_2_Atomic.asm │ │ ├── 0F_BB_Atomic.asm │ │ ├── 0F_BC.asm │ │ ├── 0F_BD.asm │ │ ├── 0F_BE.asm │ │ ├── 0F_BF.asm │ │ ├── 0F_C0.asm │ │ ├── 0F_C0_2.asm │ │ ├── 0F_C0_Atomic16.asm │ │ ├── 0F_C0_Atomic32.asm │ │ ├── 0F_C0_Atomic64.asm │ │ 
├── 0F_C2.asm │ │ ├── 0F_C3.asm │ │ ├── 0F_C4.asm │ │ ├── 0F_C4_2.asm │ │ ├── 0F_C5.asm │ │ ├── 0F_C5_2.asm │ │ ├── 0F_C6.asm │ │ ├── 0F_D1.asm │ │ ├── 0F_D2.asm │ │ ├── 0F_D3.asm │ │ ├── 0F_D4.asm │ │ ├── 0F_D5.asm │ │ ├── 0F_D7.asm │ │ ├── 0F_D8.asm │ │ ├── 0F_D9.asm │ │ ├── 0F_DA.asm │ │ ├── 0F_DB.asm │ │ ├── 0F_DC.asm │ │ ├── 0F_DD.asm │ │ ├── 0F_DE.asm │ │ ├── 0F_DF.asm │ │ ├── 0F_E0.asm │ │ ├── 0F_E1.asm │ │ ├── 0F_E2.asm │ │ ├── 0F_E3.asm │ │ ├── 0F_E4.asm │ │ ├── 0F_E5.asm │ │ ├── 0F_E7.asm │ │ ├── 0F_E8.asm │ │ ├── 0F_E9.asm │ │ ├── 0F_EA.asm │ │ ├── 0F_EB.asm │ │ ├── 0F_EC.asm │ │ ├── 0F_ED.asm │ │ ├── 0F_EE.asm │ │ ├── 0F_EF.asm │ │ ├── 0F_F1.asm │ │ ├── 0F_F2.asm │ │ ├── 0F_F3.asm │ │ ├── 0F_F4.asm │ │ ├── 0F_F5.asm │ │ ├── 0F_F6.asm │ │ ├── 0F_F6_2.asm │ │ ├── 0F_F7.asm │ │ ├── 0F_F8.asm │ │ ├── 0F_F9.asm │ │ ├── 0F_FA.asm │ │ ├── 0F_FB.asm │ │ ├── 0F_FC.asm │ │ ├── 0F_FD.asm │ │ └── 0F_FE.asm │ ├── VEX/ │ │ ├── andn.asm │ │ ├── bextr.asm │ │ ├── blsi.asm │ │ ├── blsmsk.asm │ │ ├── blsr.asm │ │ ├── bzhi.asm │ │ ├── fma_fmadd_pd.asm │ │ ├── fma_fmadd_ps.asm │ │ ├── fma_fmadd_sd.asm │ │ ├── fma_fmadd_ss.asm │ │ ├── fma_fmaddsub_pd.asm │ │ ├── fma_fmaddsub_ps.asm │ │ ├── fma_fmsub_pd.asm │ │ ├── fma_fmsub_ps.asm │ │ ├── fma_fmsub_sd.asm │ │ ├── fma_fmsub_ss.asm │ │ ├── fma_fmsubadd_pd.asm │ │ ├── fma_fmsubadd_ps.asm │ │ ├── fma_fnmadd_pd.asm │ │ ├── fma_fnmadd_ps.asm │ │ ├── fma_fnmadd_sd.asm │ │ ├── fma_fnmadd_ss.asm │ │ ├── fma_fnmsub_pd.asm │ │ ├── fma_fnmsub_ps.asm │ │ ├── fma_fnmsub_sd.asm │ │ ├── fma_fnmsub_ss.asm │ │ ├── full_vpermq_imm.asm │ │ ├── mulx.asm │ │ ├── pdep.asm │ │ ├── pext.asm │ │ ├── rorx.asm │ │ ├── sarx.asm │ │ ├── shlx.asm │ │ ├── shrx.asm │ │ ├── vaddpd.asm │ │ ├── vaddps.asm │ │ ├── vaddsd.asm │ │ ├── vaddss.asm │ │ ├── vaddsubpd.asm │ │ ├── vaddsubps.asm │ │ ├── vaesdec.asm │ │ ├── vaesdec256.asm │ │ ├── vaesdeclast.asm │ │ ├── vaesdeclast256.asm │ │ ├── vaesenc.asm │ │ ├── vaesenc256.asm │ │ ├── vaesenclast.asm │ │ ├── 
vaesenclast256.asm │ │ ├── vaesimc.asm │ │ ├── vaeskeygenassist.asm │ │ ├── vandnpd.asm │ │ ├── vandnps.asm │ │ ├── vandpd.asm │ │ ├── vandps.asm │ │ ├── vblendpd.asm │ │ ├── vblendps.asm │ │ ├── vblendvpd.asm │ │ ├── vblendvps.asm │ │ ├── vbroadcastf128.asm │ │ ├── vbroadcasti128.asm │ │ ├── vbroadcastsd.asm │ │ ├── vbroadcastss.asm │ │ ├── vcmppd.asm │ │ ├── vcmppd_256.asm │ │ ├── vcmppd_full.asm │ │ ├── vcmpps.asm │ │ ├── vcmpps_256.asm │ │ ├── vcmpps_full.asm │ │ ├── vcmpsd.asm │ │ ├── vcmpsd_full.asm │ │ ├── vcmpss.asm │ │ ├── vcmpss_full.asm │ │ ├── vcomisd.asm │ │ ├── vcomiss.asm │ │ ├── vcvtdq2pd.asm │ │ ├── vcvtdq2ps.asm │ │ ├── vcvtpd2dq.asm │ │ ├── vcvtpd2dq_inexact.asm │ │ ├── vcvtpd2ps.asm │ │ ├── vcvtph2ps.asm │ │ ├── vcvtps2dq.asm │ │ ├── vcvtps2dq_inexact.asm │ │ ├── vcvtps2pd.asm │ │ ├── vcvtps2ph_rd.asm │ │ ├── vcvtps2ph_rd_mxcsr.asm │ │ ├── vcvtps2ph_rtne.asm │ │ ├── vcvtps2ph_rtne_mxcsr.asm │ │ ├── vcvtps2ph_ru.asm │ │ ├── vcvtps2ph_ru_mxcsr.asm │ │ ├── vcvtps2ph_trunc.asm │ │ ├── vcvtps2ph_trunc_mxcsr.asm │ │ ├── vcvtsd2si.asm │ │ ├── vcvtsd2ss.asm │ │ ├── vcvtsi2sd.asm │ │ ├── vcvtsi2ss.asm │ │ ├── vcvtss2sd.asm │ │ ├── vcvtss2si.asm │ │ ├── vcvttpd2dq.asm │ │ ├── vcvttps2dq.asm │ │ ├── vcvttsd2si.asm │ │ ├── vcvttss2si.asm │ │ ├── vdivpd.asm │ │ ├── vdivps.asm │ │ ├── vdivsd.asm │ │ ├── vdivss.asm │ │ ├── vdppd.asm │ │ ├── vdpps_128.asm │ │ ├── vdpps_256.asm │ │ ├── vextractf128.asm │ │ ├── vextracti128.asm │ │ ├── vextractps.asm │ │ ├── vgather_dpd_128bit_1xdisp.asm │ │ ├── vgather_dpd_128bit_2xdisp.asm │ │ ├── vgather_dpd_128bit_4xdisp.asm │ │ ├── vgather_dpd_128bit_8xdisp.asm │ │ ├── vgather_dpd_256bit_1xdisp.asm │ │ ├── vgather_dpd_256bit_2xdisp.asm │ │ ├── vgather_dpd_256bit_4xdisp.asm │ │ ├── vgather_dpd_256bit_8xdisp.asm │ │ ├── vgather_dps_128bit_1xdisp.asm │ │ ├── vgather_dps_128bit_2xdisp.asm │ │ ├── vgather_dps_128bit_4xdisp.asm │ │ ├── vgather_dps_128bit_8xdisp.asm │ │ ├── vgather_dps_256bit_1xdisp.asm │ │ ├── 
vgather_dps_256bit_2xdisp.asm │ │ ├── vgather_dps_256bit_4xdisp.asm │ │ ├── vgather_dps_256bit_8xdisp.asm │ │ ├── vgather_qpd_128bit_1xdisp.asm │ │ ├── vgather_qpd_128bit_1xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_2xdisp.asm │ │ ├── vgather_qpd_128bit_2xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_4xdisp.asm │ │ ├── vgather_qpd_128bit_4xdisp_overflow.asm │ │ ├── vgather_qpd_128bit_8xdisp.asm │ │ ├── vgather_qpd_128bit_8xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_1xdisp.asm │ │ ├── vgather_qpd_256bit_1xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_2xdisp.asm │ │ ├── vgather_qpd_256bit_2xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_4xdisp.asm │ │ ├── vgather_qpd_256bit_4xdisp_overflow.asm │ │ ├── vgather_qpd_256bit_8xdisp.asm │ │ ├── vgather_qpd_256bit_8xdisp_overflow.asm │ │ ├── vgather_qps_128bit_1xdisp.asm │ │ ├── vgather_qps_128bit_1xdisp_overflow.asm │ │ ├── vgather_qps_128bit_2xdisp.asm │ │ ├── vgather_qps_128bit_2xdisp_overflow.asm │ │ ├── vgather_qps_128bit_4xdisp.asm │ │ ├── vgather_qps_128bit_4xdisp_overflow.asm │ │ ├── vgather_qps_128bit_8xdisp.asm │ │ ├── vgather_qps_128bit_8xdisp_overflow.asm │ │ ├── vgather_qps_256bit_1xdisp.asm │ │ ├── vgather_qps_256bit_1xdisp_overflow.asm │ │ ├── vgather_qps_256bit_2xdisp.asm │ │ ├── vgather_qps_256bit_2xdisp_overflow.asm │ │ ├── vgather_qps_256bit_4xdisp.asm │ │ ├── vgather_qps_256bit_4xdisp_overflow.asm │ │ ├── vgather_qps_256bit_8xdisp.asm │ │ ├── vgather_qps_256bit_8xdisp_overflow.asm │ │ ├── vhaddpd.asm │ │ ├── vhaddps.asm │ │ ├── vhsubpd.asm │ │ ├── vhsubps.asm │ │ ├── vinsertf128.asm │ │ ├── vinserti128.asm │ │ ├── vinsertps.asm │ │ ├── vlddqu.asm │ │ ├── vldmxcsr.asm │ │ ├── vmaskmovdqu.asm │ │ ├── vmaskmovpd_load.asm │ │ ├── vmaskmovpd_store.asm │ │ ├── vmaskmovps_load.asm │ │ ├── vmaskmovps_store.asm │ │ ├── vmaxpd.asm │ │ ├── vmaxps.asm │ │ ├── vmaxsd.asm │ │ ├── vmaxss.asm │ │ ├── vminpd.asm │ │ ├── vminps.asm │ │ ├── vminsd.asm │ │ ├── vminss.asm │ │ ├── vmovapd.asm │ │ ├── vmovapd_mem.asm │ │ ├── 
vmovaps.asm │ │ ├── vmovaps_mem.asm │ │ ├── vmovddup.asm │ │ ├── vmovdqa.asm │ │ ├── vmovdqu.asm │ │ ├── vmovhlps.asm │ │ ├── vmovhpd.asm │ │ ├── vmovhps.asm │ │ ├── vmovlhps.asm │ │ ├── vmovlpd.asm │ │ ├── vmovlps.asm │ │ ├── vmovmskpd.asm │ │ ├── vmovmskps.asm │ │ ├── vmovntdq.asm │ │ ├── vmovntdqa.asm │ │ ├── vmovntpd.asm │ │ ├── vmovntps.asm │ │ ├── vmovq.asm │ │ ├── vmovq_vmovd_reg.asm │ │ ├── vmovsd_from_mem.asm │ │ ├── vmovsd_to_mem.asm │ │ ├── vmovsd_vectors.asm │ │ ├── vmovshdup.asm │ │ ├── vmovsldup.asm │ │ ├── vmovss_from_mem.asm │ │ ├── vmovss_to_mem.asm │ │ ├── vmovss_vectors.asm │ │ ├── vmovupd.asm │ │ ├── vmovupd_mem.asm │ │ ├── vmovups.asm │ │ ├── vmovups_mem.asm │ │ ├── vmpsadbw_128.asm │ │ ├── vmpsadbw_256.asm │ │ ├── vmulpd.asm │ │ ├── vmulps.asm │ │ ├── vmulsd.asm │ │ ├── vmulss.asm │ │ ├── vorpd.asm │ │ ├── vorps.asm │ │ ├── vpabsb.asm │ │ ├── vpabsd.asm │ │ ├── vpabsw.asm │ │ ├── vpackssdw.asm │ │ ├── vpacksswb.asm │ │ ├── vpackusdw.asm │ │ ├── vpackuswb.asm │ │ ├── vpaddb.asm │ │ ├── vpaddd.asm │ │ ├── vpaddq.asm │ │ ├── vpaddsb.asm │ │ ├── vpaddsw.asm │ │ ├── vpaddusb.asm │ │ ├── vpaddusw.asm │ │ ├── vpaddw.asm │ │ ├── vpalignr.asm │ │ ├── vpand.asm │ │ ├── vpandn.asm │ │ ├── vpavgb.asm │ │ ├── vpavgb_aliasing.asm │ │ ├── vpavgw.asm │ │ ├── vpavgw_aliasing.asm │ │ ├── vpblendd.asm │ │ ├── vpblendvb.asm │ │ ├── vpblendw.asm │ │ ├── vpbroadcastb.asm │ │ ├── vpbroadcastd.asm │ │ ├── vpbroadcastq.asm │ │ ├── vpbroadcastw.asm │ │ ├── vpclmulqdq.asm │ │ ├── vpclmulqdq_256.asm │ │ ├── vpcmpeqb.asm │ │ ├── vpcmpeqd.asm │ │ ├── vpcmpeqq.asm │ │ ├── vpcmpeqw.asm │ │ ├── vpcmpestri_equal_any.asm │ │ ├── vpcmpestri_equal_each.asm │ │ ├── vpcmpestri_equal_ordered.asm │ │ ├── vpcmpestri_ranges.asm │ │ ├── vpcmpestrm_equal_any.asm │ │ ├── vpcmpestrm_equal_each.asm │ │ ├── vpcmpestrm_equal_ordered.asm │ │ ├── vpcmpestrm_ranges.asm │ │ ├── vpcmpgtb.asm │ │ ├── vpcmpgtd.asm │ │ ├── vpcmpgtq.asm │ │ ├── vpcmpgtw.asm │ │ ├── vpcmpistri_equal_any.asm │ │ ├── 
vpcmpistri_equal_each.asm │ │ ├── vpcmpistri_equal_ordered.asm │ │ ├── vpcmpistri_ranges.asm │ │ ├── vpcmpistrm_equal_any.asm │ │ ├── vpcmpistrm_equal_each.asm │ │ ├── vpcmpistrm_equal_ordered.asm │ │ ├── vpcmpistrm_ranges.asm │ │ ├── vperm2f128.asm │ │ ├── vperm2i128.asm │ │ ├── vpermd.asm │ │ ├── vpermilpd_imm.asm │ │ ├── vpermilpd_reg.asm │ │ ├── vpermilps_imm.asm │ │ ├── vpermilps_reg.asm │ │ ├── vpermpd.asm │ │ ├── vpermps.asm │ │ ├── vpermq.asm │ │ ├── vpextrb.asm │ │ ├── vpextrd.asm │ │ ├── vpextrq.asm │ │ ├── vpextrw.asm │ │ ├── vpgather_dd_128bit_1xdisp.asm │ │ ├── vpgather_dd_128bit_2xdisp.asm │ │ ├── vpgather_dd_128bit_4xdisp.asm │ │ ├── vpgather_dd_128bit_8xdisp.asm │ │ ├── vpgather_dd_256bit_1xdisp.asm │ │ ├── vpgather_dd_256bit_2xdisp.asm │ │ ├── vpgather_dd_256bit_4xdisp.asm │ │ ├── vpgather_dd_256bit_8xdisp.asm │ │ ├── vpgather_dq_128bit_1xdisp.asm │ │ ├── vpgather_dq_128bit_2xdisp.asm │ │ ├── vpgather_dq_128bit_4xdisp.asm │ │ ├── vpgather_dq_128bit_8xdisp.asm │ │ ├── vpgather_dq_256bit_1xdisp.asm │ │ ├── vpgather_dq_256bit_2xdisp.asm │ │ ├── vpgather_dq_256bit_4xdisp.asm │ │ ├── vpgather_dq_256bit_8xdisp.asm │ │ ├── vpgather_qd_128bit_1xdisp.asm │ │ ├── vpgather_qd_128bit_1xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_2xdisp.asm │ │ ├── vpgather_qd_128bit_2xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_4xdisp.asm │ │ ├── vpgather_qd_128bit_4xdisp_overflow.asm │ │ ├── vpgather_qd_128bit_8xdisp.asm │ │ ├── vpgather_qd_128bit_8xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_1xdisp.asm │ │ ├── vpgather_qd_256bit_1xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_2xdisp.asm │ │ ├── vpgather_qd_256bit_2xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_4xdisp.asm │ │ ├── vpgather_qd_256bit_4xdisp_overflow.asm │ │ ├── vpgather_qd_256bit_8xdisp.asm │ │ ├── vpgather_qd_256bit_8xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_1xdisp.asm │ │ ├── vpgather_qq_128bit_1xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_2xdisp.asm │ │ ├── vpgather_qq_128bit_2xdisp_overflow.asm │ │ 
├── vpgather_qq_128bit_4xdisp.asm │ │ ├── vpgather_qq_128bit_4xdisp_overflow.asm │ │ ├── vpgather_qq_128bit_8xdisp.asm │ │ ├── vpgather_qq_128bit_8xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_1xdisp.asm │ │ ├── vpgather_qq_256bit_1xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_2xdisp.asm │ │ ├── vpgather_qq_256bit_2xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_4xdisp.asm │ │ ├── vpgather_qq_256bit_4xdisp_overflow.asm │ │ ├── vpgather_qq_256bit_8xdisp.asm │ │ ├── vpgather_qq_256bit_8xdisp_overflow.asm │ │ ├── vphaddd.asm │ │ ├── vphaddsw.asm │ │ ├── vphaddsw_256.asm │ │ ├── vphaddw.asm │ │ ├── vphminposuw.asm │ │ ├── vphsubd.asm │ │ ├── vphsubsw.asm │ │ ├── vphsubsw_256.asm │ │ ├── vphsubw.asm │ │ ├── vpinsrb.asm │ │ ├── vpinsrd.asm │ │ ├── vpinsrq.asm │ │ ├── vpinsrw.asm │ │ ├── vpmaddubsw.asm │ │ ├── vpmaddubsw_256.asm │ │ ├── vpmaddwd.asm │ │ ├── vpmaskmovd_load.asm │ │ ├── vpmaskmovd_store.asm │ │ ├── vpmaskmovq_load.asm │ │ ├── vpmaskmovq_store.asm │ │ ├── vpmaxsb.asm │ │ ├── vpmaxsd.asm │ │ ├── vpmaxsw.asm │ │ ├── vpmaxub.asm │ │ ├── vpmaxud.asm │ │ ├── vpmaxuw.asm │ │ ├── vpminsb.asm │ │ ├── vpminsd.asm │ │ ├── vpminsw.asm │ │ ├── vpminub.asm │ │ ├── vpminud.asm │ │ ├── vpminuw.asm │ │ ├── vpmovmskb.asm │ │ ├── vpmovsxbd.asm │ │ ├── vpmovsxbq.asm │ │ ├── vpmovsxbw.asm │ │ ├── vpmovsxdq.asm │ │ ├── vpmovsxwd.asm │ │ ├── vpmovsxwq.asm │ │ ├── vpmovzxbd.asm │ │ ├── vpmovzxbq.asm │ │ ├── vpmovzxbw.asm │ │ ├── vpmovzxdq.asm │ │ ├── vpmovzxwd.asm │ │ ├── vpmovzxwq.asm │ │ ├── vpmuldq.asm │ │ ├── vpmuldq_256.asm │ │ ├── vpmulhrsw.asm │ │ ├── vpmulhuw.asm │ │ ├── vpmulhw.asm │ │ ├── vpmulld.asm │ │ ├── vpmullw.asm │ │ ├── vpmuludq.asm │ │ ├── vpor.asm │ │ ├── vpsadbw.asm │ │ ├── vpsadbw_256.asm │ │ ├── vpshufb.asm │ │ ├── vpshufd.asm │ │ ├── vpshufhw.asm │ │ ├── vpshuflw.asm │ │ ├── vpsignb.asm │ │ ├── vpsignd.asm │ │ ├── vpsignw.asm │ │ ├── vpslld.asm │ │ ├── vpslld_imm.asm │ │ ├── vpslldq.asm │ │ ├── vpsllq.asm │ │ ├── vpsllq_imm.asm │ │ ├── vpsllvd.asm │ │ ├── 
vpsllvq.asm │ │ ├── vpsllw.asm │ │ ├── vpsllw_imm.asm │ │ ├── vpsrad.asm │ │ ├── vpsrad_imm.asm │ │ ├── vpsravd.asm │ │ ├── vpsraw.asm │ │ ├── vpsraw_imm.asm │ │ ├── vpsrld.asm │ │ ├── vpsrld_imm.asm │ │ ├── vpsrldq.asm │ │ ├── vpsrlq.asm │ │ ├── vpsrlq_imm.asm │ │ ├── vpsrlvd.asm │ │ ├── vpsrlvq.asm │ │ ├── vpsrlw.asm │ │ ├── vpsrlw_imm.asm │ │ ├── vpsubb.asm │ │ ├── vpsubd.asm │ │ ├── vpsubq.asm │ │ ├── vpsubsb.asm │ │ ├── vpsubsw.asm │ │ ├── vpsubusb.asm │ │ ├── vpsubusw.asm │ │ ├── vpsubw.asm │ │ ├── vptest.asm │ │ ├── vpunpckhbw.asm │ │ ├── vpunpckhdq.asm │ │ ├── vpunpckhqdq.asm │ │ ├── vpunpckhwd.asm │ │ ├── vpunpcklbw.asm │ │ ├── vpunpckldq.asm │ │ ├── vpunpcklqdq.asm │ │ ├── vpunpcklwd.asm │ │ ├── vpxor.asm │ │ ├── vrcpps.asm │ │ ├── vrcpss.asm │ │ ├── vroundpd.asm │ │ ├── vroundps.asm │ │ ├── vroundsd.asm │ │ ├── vroundss.asm │ │ ├── vrsqrtps.asm │ │ ├── vrsqrtss.asm │ │ ├── vshufpd.asm │ │ ├── vshufps.asm │ │ ├── vsqrtpd.asm │ │ ├── vsqrtps.asm │ │ ├── vsqrtsd.asm │ │ ├── vsqrtss.asm │ │ ├── vsubpd.asm │ │ ├── vsubps.asm │ │ ├── vsubsd.asm │ │ ├── vsubss.asm │ │ ├── vtestpd.asm │ │ ├── vtestps.asm │ │ ├── vucomisd.asm │ │ ├── vucomiss.asm │ │ ├── vunpckhpd.asm │ │ ├── vunpckhps.asm │ │ ├── vunpcklpd.asm │ │ ├── vunpcklps.asm │ │ ├── vxorpd.asm │ │ ├── vxorps.asm │ │ ├── vzeroall.asm │ │ └── vzeroupper.asm │ ├── X87/ │ │ ├── D8_00.asm │ │ ├── D8_01.asm │ │ ├── D8_04.asm │ │ ├── D8_05.asm │ │ ├── D8_06.asm │ │ ├── D8_07.asm │ │ ├── D8_C0.asm │ │ ├── D8_C8.asm │ │ ├── D8_D0.asm │ │ ├── D8_D9.asm │ │ ├── D8_E0.asm │ │ ├── D8_E8.asm │ │ ├── D8_F0.asm │ │ ├── D8_F0_2.asm │ │ ├── D8_F8.asm │ │ ├── D9_00.asm │ │ ├── D9_02.asm │ │ ├── D9_03.asm │ │ ├── D9_05.asm │ │ ├── D9_06.asm │ │ ├── D9_06_2.asm │ │ ├── D9_07.asm │ │ ├── D9_C0.asm │ │ ├── D9_C8.asm │ │ ├── D9_D0.asm │ │ ├── D9_E0.asm │ │ ├── D9_E1.asm │ │ ├── D9_E4.asm │ │ ├── D9_E8.asm │ │ ├── D9_E9.asm │ │ ├── D9_EA.asm │ │ ├── D9_EB.asm │ │ ├── D9_EC.asm │ │ ├── D9_ED.asm │ │ ├── D9_EE.asm │ │ ├── D9_F0.asm 
│ │ ├── D9_F1.asm │ │ ├── D9_F2.asm │ │ ├── D9_F3.asm │ │ ├── D9_F4.asm │ │ ├── D9_F4_02.asm │ │ ├── D9_F5.asm │ │ ├── D9_F5_2.asm │ │ ├── D9_F5_3.asm │ │ ├── D9_F6.asm │ │ ├── D9_F7.asm │ │ ├── D9_F8.asm │ │ ├── D9_F9.asm │ │ ├── D9_FA.asm │ │ ├── D9_FB.asm │ │ ├── D9_FC.asm │ │ ├── D9_FD.asm │ │ ├── D9_FD_2.asm │ │ ├── D9_FE.asm │ │ ├── D9_FF.asm │ │ ├── DA_00.asm │ │ ├── DA_01.asm │ │ ├── DA_02.asm │ │ ├── DA_04.asm │ │ ├── DA_05.asm │ │ ├── DA_06.asm │ │ ├── DA_07.asm │ │ ├── DA_C0.asm │ │ ├── DA_C8.asm │ │ ├── DA_D0.asm │ │ ├── DA_D8.asm │ │ ├── DA_D9.asm │ │ ├── DA_E9.asm │ │ ├── DB_00.asm │ │ ├── DB_01.asm │ │ ├── DB_02.asm │ │ ├── DB_03.asm │ │ ├── DB_05.asm │ │ ├── DB_07.asm │ │ ├── DB_07_2.asm │ │ ├── DB_C0.asm │ │ ├── DB_C8.asm │ │ ├── DB_D0.asm │ │ ├── DB_D8.asm │ │ ├── DB_E2.asm │ │ ├── DB_E3.asm │ │ ├── DB_E3_2.asm │ │ ├── DB_E8.asm │ │ ├── DB_F0.asm │ │ ├── DC_00.asm │ │ ├── DC_01.asm │ │ ├── DC_04.asm │ │ ├── DC_05.asm │ │ ├── DC_06.asm │ │ ├── DC_07.asm │ │ ├── DC_C0.asm │ │ ├── DC_C8.asm │ │ ├── DC_D0.asm │ │ ├── DC_D9.asm │ │ ├── DC_E0.asm │ │ ├── DC_E8.asm │ │ ├── DC_F0.asm │ │ ├── DC_F8.asm │ │ ├── DD_00.asm │ │ ├── DD_01.asm │ │ ├── DD_02.asm │ │ ├── DD_03.asm │ │ ├── DD_04.asm │ │ ├── DD_04_2.asm │ │ ├── DD_07.asm │ │ ├── DD_C0.asm │ │ ├── DD_C8.asm │ │ ├── DD_D0.asm │ │ ├── DD_D0_2.asm │ │ ├── DD_D8.asm │ │ ├── DD_E9.asm │ │ ├── DE_00.asm │ │ ├── DE_01.asm │ │ ├── DE_02.asm │ │ ├── DE_04.asm │ │ ├── DE_05.asm │ │ ├── DE_06.asm │ │ ├── DE_07.asm │ │ ├── DE_C0.asm │ │ ├── DE_C8.asm │ │ ├── DE_D0.asm │ │ ├── DE_E0.asm │ │ ├── DE_E8.asm │ │ ├── DE_F0.asm │ │ ├── DE_F8.asm │ │ ├── DF_00.asm │ │ ├── DF_01.asm │ │ ├── DF_02.asm │ │ ├── DF_03.asm │ │ ├── DF_04.asm │ │ ├── DF_05.asm │ │ ├── DF_07.asm │ │ ├── DF_C0.asm │ │ ├── DF_C8.asm │ │ ├── DF_D0.asm │ │ ├── DF_D8.asm │ │ ├── DF_E0.asm │ │ ├── DF_E8.asm │ │ ├── DF_F0.asm │ │ ├── FISTTP_16bit.asm │ │ ├── FISTTP_16bit_neg.asm │ │ ├── FISTTP_32bit.asm │ │ ├── FISTTP_32bit_neg.asm │ │ ├── 
FISTTP_64bit.asm │ │ ├── FISTTP_64bit_neg.asm │ │ ├── FPREM1_Flags.asm │ │ ├── FPREM_Flags.asm │ │ ├── FST_AddrModes.asm │ │ ├── FScale-Zero.asm │ │ ├── FScaleFXtract.asm │ │ ├── FXAM_Push.asm │ │ ├── FXAM_Push_2.asm │ │ ├── FXAM_Push_Simple.asm │ │ ├── FXAM_Push_Simple_2.asm │ │ ├── FXAM_Simple.asm │ │ ├── LoadAtBoundary.asm │ │ ├── Memcopy.asm │ │ ├── MemcopyWithCPUID.asm │ │ ├── Rounding.asm │ │ ├── StoreAtBoundary.asm │ │ ├── X87MMXInteraction.asm │ │ ├── invalid_div_zero.asm │ │ ├── invalid_fcos_infinity.asm │ │ ├── invalid_fist_nan.asm │ │ ├── invalid_fist_overflow.asm │ │ ├── invalid_fist_overflow_16bit.asm │ │ ├── invalid_fist_overflow_32bit.asm │ │ ├── invalid_fist_overflow_64bit.asm │ │ ├── invalid_fprem_infinity.asm │ │ ├── invalid_fptan_infinity.asm │ │ ├── invalid_fsin_infinity.asm │ │ ├── invalid_fsin_neg_infinity.asm │ │ ├── invalid_fsincos_infinity.asm │ │ ├── invalid_infinity_fsubr_infinity.asm │ │ ├── invalid_infinity_mul_zero.asm │ │ ├── invalid_infinity_ops.asm │ │ ├── invalid_infinity_sub_infinity.asm │ │ ├── invalid_neg_infinity_sub_neg_infinity.asm │ │ ├── invalid_reduced_precision.asm │ │ ├── invalid_simple_test.asm │ │ ├── invalid_sqrt_negative.asm │ │ ├── precision_test_fabs.asm │ │ ├── precision_test_fadd.asm │ │ ├── precision_test_fcos.asm │ │ ├── precision_test_fdiv.asm │ │ ├── precision_test_fdivr.asm │ │ ├── precision_test_fmul.asm │ │ ├── precision_test_fprem.asm │ │ ├── precision_test_fprem1.asm │ │ ├── precision_test_fscale.asm │ │ ├── precision_test_fsin.asm │ │ ├── precision_test_fsqrt.asm │ │ ├── precision_test_fsub.asm │ │ ├── precision_test_fsubr.asm │ │ ├── precision_test_ftan.asm │ │ ├── precision_test_fyl2x.asm │ │ ├── precision_test_fyl2xp1.asm │ │ ├── precision_test_neg_fabs.asm │ │ ├── precision_test_neg_fadd.asm │ │ ├── precision_test_neg_fcos.asm │ │ ├── precision_test_neg_fdiv.asm │ │ ├── precision_test_neg_fdivr.asm │ │ ├── precision_test_neg_fmul.asm │ │ ├── precision_test_neg_fprem.asm │ │ ├── 
precision_test_neg_fprem1.asm │ │ ├── precision_test_neg_fscale.asm │ │ ├── precision_test_neg_fsin.asm │ │ ├── precision_test_neg_fsub.asm │ │ ├── precision_test_neg_fsubr.asm │ │ ├── precision_test_neg_ftan.asm │ │ ├── precision_test_neg_fyl2x.asm │ │ ├── precision_test_neg_fyl2xp1.asm │ │ ├── valid_fist_16bit.asm │ │ └── valid_operation.asm │ ├── X87_F64/ │ │ ├── D8_00_F64.asm │ │ ├── D8_01_F64.asm │ │ ├── D8_04_F64.asm │ │ ├── D8_05_F64.asm │ │ ├── D8_06_F64.asm │ │ ├── D8_07_F64.asm │ │ ├── D8_C0_F64.asm │ │ ├── D8_C8_F64.asm │ │ ├── D8_D9_F64.asm │ │ ├── D8_E0_F64.asm │ │ ├── D8_E8_F64.asm │ │ ├── D8_F0_2_F64.asm │ │ ├── D8_F0_F64.asm │ │ ├── D8_F8_F64.asm │ │ ├── D9_00_F64.asm │ │ ├── D9_02_F64.asm │ │ ├── D9_03_F64.asm │ │ ├── D9_05_F64.asm │ │ ├── D9_06_2_F64.asm │ │ ├── D9_06_F64.asm │ │ ├── D9_07_F64.asm │ │ ├── D9_C0_F64.asm │ │ ├── D9_C8_F64.asm │ │ ├── D9_D0_F64.asm │ │ ├── D9_E0_F64.asm │ │ ├── D9_E1_F64.asm │ │ ├── D9_E4_F64.asm │ │ ├── D9_E8_F64.asm │ │ ├── D9_E9_F64.asm │ │ ├── D9_EA_F64.asm │ │ ├── D9_EB_F64.asm │ │ ├── D9_EC_F64.asm │ │ ├── D9_ED_F64.asm │ │ ├── D9_EE_F64.asm │ │ ├── D9_F0_F64.asm │ │ ├── D9_F1_F64.asm │ │ ├── D9_F2_F64.asm │ │ ├── D9_F3_F64.asm │ │ ├── D9_F4_02_F64.asm │ │ ├── D9_F4_F64.asm │ │ ├── D9_F5_F64.asm │ │ ├── D9_F6_F64.asm │ │ ├── D9_F7_F64.asm │ │ ├── D9_F8_F64.asm │ │ ├── D9_F9_F64.asm │ │ ├── D9_FA_F64.asm │ │ ├── D9_FB_F64.asm │ │ ├── D9_FC_F64.asm │ │ ├── D9_FD_2_F64.asm │ │ ├── D9_FD_F64.asm │ │ ├── D9_FE_F64.asm │ │ ├── D9_FF_F64.asm │ │ ├── DA_01_F64.asm │ │ ├── DA_02_F64.asm │ │ ├── DA_04_F64.asm │ │ ├── DA_05_F64.asm │ │ ├── DA_06_F64.asm │ │ ├── DA_07_F64.asm │ │ ├── DA_C0_F64.asm │ │ ├── DA_C8_F64.asm │ │ ├── DA_D0_F64.asm │ │ ├── DA_D8_F64.asm │ │ ├── DA_D9_F64.asm │ │ ├── DA_E9_F64.asm │ │ ├── DB_00_F64.asm │ │ ├── DB_01_F64.asm │ │ ├── DB_02_F64.asm │ │ ├── DB_03_F64.asm │ │ ├── DB_05_F64.asm │ │ ├── DB_07_F64.asm │ │ ├── DB_C0_F64.asm │ │ ├── DB_C8_F64.asm │ │ ├── DB_D0_F64.asm │ │ ├── DB_D8_F64.asm │ 
│ ├── DB_E3.asm │ │ ├── DC_00_F64.asm │ │ ├── DC_01_F64.asm │ │ ├── DC_04_F64.asm │ │ ├── DC_05_F64.asm │ │ ├── DC_06_F64.asm │ │ ├── DC_07_F64.asm │ │ ├── DC_C0_F64.asm │ │ ├── DC_C8_F64.asm │ │ ├── DC_E0_F64.asm │ │ ├── DC_E8_F64.asm │ │ ├── DC_F0_F64.asm │ │ ├── DC_F8_F64.asm │ │ ├── DD_00_F64.asm │ │ ├── DD_01_F64.asm │ │ ├── DD_02_F64.asm │ │ ├── DD_03_F64.asm │ │ ├── DD_04_2_F64.asm │ │ ├── DD_04_F64.asm │ │ ├── DD_07_F64.asm │ │ ├── DD_C0_F64.asm │ │ ├── DD_D0_2_F64.asm │ │ ├── DD_D0_F64.asm │ │ ├── DD_D8_F64.asm │ │ ├── DD_E9_F64.asm │ │ ├── DE_00_F64.asm │ │ ├── DE_01_F64.asm │ │ ├── DE_02_F64.asm │ │ ├── DE_04_F64.asm │ │ ├── DE_05_F64.asm │ │ ├── DE_06_F64.asm │ │ ├── DE_07_F64.asm │ │ ├── DE_C0_F64.asm │ │ ├── DE_C8_F64.asm │ │ ├── DE_E0_F64.asm │ │ ├── DE_E8_F64.asm │ │ ├── DE_F0_F64.asm │ │ ├── DE_F8_F64.asm │ │ ├── DF_00_F64.asm │ │ ├── DF_01_F64.asm │ │ ├── DF_02_F64.asm │ │ ├── DF_03_F64.asm │ │ ├── DF_04_F64.asm │ │ ├── DF_05_F64.asm │ │ ├── DF_07_F64.asm │ │ ├── DF_E0_F64.asm │ │ ├── FCOM_F64.asm │ │ ├── FILD_NEG_F64.asm │ │ ├── FIST_F64.asm │ │ ├── FLDCW_F64.asm │ │ ├── FLD_F64.asm │ │ ├── FPREM1_Flags_F64.asm │ │ ├── FPREM_Flags_F64.asm │ │ ├── FScale-Zero_F64.asm │ │ ├── FScaleFXtract_F64.asm │ │ ├── FXAM_Push_2_F64.asm │ │ ├── FXAM_Push_F64.asm │ │ ├── Rounding_F64.asm │ │ ├── fptan_neg_zero_F64.asm │ │ ├── fptan_pos_zero_F64.asm │ │ ├── fsin_neg_zero_F64.asm │ │ ├── fsin_pos_zero_F64.asm │ │ └── fsincos_neg_zero_F64.asm │ ├── fadd.asm │ ├── fld.asm │ ├── full_pshufd_imm.asm │ ├── full_vpblendw_imm.asm │ ├── jump.asm │ ├── lea.asm │ ├── modrm_oob/ │ │ ├── DDD.asm │ │ ├── H0F38.asm │ │ ├── H0F3A.asm │ │ ├── Primary.asm │ │ ├── PrimaryGroup.asm │ │ ├── Secondary.asm │ │ ├── SecondaryGroup.asm │ │ ├── SecondaryModRM.asm │ │ ├── SecondaryOpSize.asm │ │ ├── SecondaryREP.asm │ │ ├── SecondaryREPNE.asm │ │ ├── VEX.asm │ │ ├── VEXGroup.asm │ │ ├── X87.asm │ │ └── X87_Reduced.asm │ ├── mov.asm │ ├── movups.asm │ ├── movzx.asm │ ├── pslldq.asm │ └── 
x87_stack.asm ├── CMakeLists.txt ├── Example.asm ├── FEXLinuxTests/ │ ├── CMakeLists.txt │ ├── Disabled_Tests │ ├── Disabled_Tests_Host │ ├── Expected_Output │ ├── Flake_Tests │ ├── Known_Failures │ └── tests/ │ ├── CMakeLists.txt │ ├── cpu/ │ │ └── cpu_count.cpp │ ├── fd/ │ │ └── test_close_range.cpp │ ├── fs/ │ │ └── self_symlink.cpp │ ├── include/ │ │ ├── fpstate.h │ │ └── simple_x86.h │ ├── signal/ │ │ ├── Syscall_state.32.cpp │ │ ├── Syscall_state.64.cpp │ │ ├── SystemInstructions.64.cpp │ │ ├── eflags_signal.cpp │ │ ├── into.32.cpp │ │ ├── invalid_hlt.cpp │ │ ├── invalid_int.cpp │ │ ├── invalid_int1.cpp │ │ ├── invalid_int3.cpp │ │ ├── invalid_ud2.cpp │ │ ├── invalid_util.h │ │ ├── invalid_vex.32.cpp │ │ ├── noexec_protect.64.cpp │ │ ├── pthread_cancel.cpp │ │ ├── sigill_flags.cpp │ │ ├── sigill_xstate_magic.cpp │ │ ├── signal_df_reset.64.cpp │ │ ├── signal_flags.cpp │ │ ├── signal_order.cpp │ │ ├── sigtest_defer.cpp │ │ ├── sigtest_no_defer.cpp │ │ ├── sigtest_samask.cpp │ │ ├── sigtest_siginfo.32.cpp │ │ ├── sigtest_siginfo.64.cpp │ │ ├── sigtest_sigmask.cpp │ │ ├── synchronous-signal-block.cpp │ │ ├── timer-sigev-thread.cpp │ │ ├── trap_flag.cpp │ │ └── x87_state.64.cpp │ ├── smc/ │ │ ├── smc-1-dynamic.cpp │ │ ├── smc-2.cpp │ │ ├── smc-common.h │ │ ├── smc-exec-stack.cpp │ │ ├── smc-missing-gnustack.cpp │ │ ├── smc-mt-1.cpp │ │ ├── smc-mt-2.cpp │ │ ├── smc-shared-1.cpp │ │ ├── smc-shared-2.cpp │ │ └── smc-unexec-stack.cpp │ ├── syscalls/ │ │ ├── execveat_memfd.cpp │ │ ├── futimesat.cpp │ │ ├── personality.cpp │ │ ├── syscall_exit.cpp │ │ ├── syscall_sigaltstack.cpp │ │ └── syscalls_efault.cpp │ ├── thunks/ │ │ └── thunk_testlib.cpp │ └── vdso/ │ └── vdso_test.cpp ├── InstructionCountCI/ │ ├── AFP/ │ │ ├── H0F3A.json │ │ ├── SVE256/ │ │ │ ├── Secondary.json │ │ │ ├── Secondary_REP.json │ │ │ └── Secondary_REPNE.json │ │ ├── Secondary.json │ │ ├── Secondary_REP.json │ │ ├── Secondary_REPNE.json │ │ ├── VEX_map1.json │ │ └── VEX_map3.json │ ├── AVX128/ │ │ 
├── FMA4.json │ │ ├── VEX_map1.json │ │ ├── VEX_map1_FCMA.json │ │ ├── VEX_map1_SVE128.json │ │ ├── VEX_map1_flagm.json │ │ ├── VEX_map2.json │ │ ├── VEX_map2_AFP.json │ │ ├── VEX_map2_SVE128.json │ │ ├── VEX_map2_flagm.json │ │ ├── VEX_map3.json │ │ ├── VEX_map3_SVE128.json │ │ └── VEX_map_group.json │ ├── Atomics.json │ ├── CMakeLists.txt │ ├── Crypto/ │ │ ├── H0F38.json │ │ └── H0F3A.json │ ├── DDD.json │ ├── FEXOpt/ │ │ ├── AddressingLimitations.json │ │ ├── AddressingLimitations_32Bit.json │ │ ├── MultiInst.json │ │ ├── MultiInst_32bit.json │ │ ├── MultiInst_AFP.json │ │ ├── MultiInst_TSO.json │ │ ├── MultiInst_TSO_32bit.json │ │ └── libnss.json │ ├── FlagM/ │ │ ├── Atomics.json │ │ ├── FlagOpts.json │ │ ├── H0F38.json │ │ ├── HotBlocks.json │ │ ├── HotBlocks_32Bit.json │ │ ├── HotBlocks_AFP.json │ │ ├── HotBlocks_TSO_32Bit.json │ │ ├── Primary.json │ │ ├── PrimaryGroup.json │ │ ├── Primary_32Bit.json │ │ ├── Secondary.json │ │ ├── SecondaryGroup.json │ │ ├── SecondaryModRM.json │ │ ├── Secondary_OpSize.json │ │ ├── Secondary_REP.json │ │ ├── Secondary_REP_CSSC.json │ │ ├── VEX_map1.json │ │ ├── VEX_map2.json │ │ ├── VEX_map_group.json │ │ ├── x87-Crysis2Max-fmodel.json │ │ ├── x87-HalfLife.json │ │ ├── x87-Oblivion.json │ │ ├── x87-Psychonauts.json │ │ ├── x87.json │ │ ├── x87_f64-Crysis2Max-fmodel.json │ │ ├── x87_f64-HalfLife.json │ │ ├── x87_f64-Oblivion.json │ │ ├── x87_f64-Psychonauts.json │ │ └── x87_f64.json │ ├── H0F38.json │ ├── H0F3A.json │ ├── H0F3A_SVE128.json │ ├── MOPS/ │ │ └── Primary.json │ ├── Primary.json │ ├── PrimaryGroup.json │ ├── Primary_32Bit.json │ ├── RPRES/ │ │ ├── DDD.json │ │ ├── Secondary.json │ │ ├── Secondary_REP_AFP.json │ │ └── VEX_map1_AFP.json │ ├── Repeat.json │ ├── SSE42_Strings.json │ ├── Secondary.json │ ├── SecondaryGroup.json │ ├── SecondaryModRM.json │ ├── Secondary_32Bit.json │ ├── Secondary_OpSize.json │ ├── Secondary_OpSize_FCMA.json │ ├── Secondary_OpSize_SVE128.json │ ├── Secondary_OpSize_SVE256.json │ ├── 
Secondary_REP.json │ ├── Secondary_REPNE.json │ ├── Secondary_REPNE_FCMA.json │ ├── Secondary_REPNE_SVE128.json │ ├── Secondary_REP_FRINTTS.json │ ├── Secondary_SVE128.json │ ├── VEX_map1.json │ ├── VEX_map1_FCMA.json │ ├── VEX_map1_FRINTTS.json │ ├── VEX_map2.json │ ├── VEX_map2_svebitperm.json │ ├── VEX_map3.json │ ├── VEX_map_group.json │ ├── X87ldst-SVE.json │ ├── x87.json │ ├── x87_32Bit.json │ ├── x87_f64.json │ └── x87_f64_32Bit.json ├── POSIX/ │ ├── CMakeLists.txt │ ├── Disabled_Tests │ ├── Expected_Output │ ├── Flake_Tests │ └── Known_Failures ├── Readme.md ├── ThunkFunctionalTests/ │ └── CMakeLists.txt ├── ThunkLibs/ │ ├── CMakeLists.txt │ ├── abi.cpp │ ├── common.h │ └── generator.cpp ├── Utilities/ │ ├── CMakeLists.txt │ └── DeleteOldSHMRegions.cpp ├── gcc-target-tests-32/ │ ├── CMakeLists.txt │ ├── Disabled_Tests │ ├── Expected_Output │ └── Known_Failures ├── gcc-target-tests-64/ │ ├── CMakeLists.txt │ ├── Disabled_Tests │ ├── Expected_Output │ └── Known_Failures └── gvisor-tests/ ├── CMakeLists.txt ├── Disabled_Tests ├── Expected_Output ├── Flake_Tests └── Known_Failures ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ Language: Cpp BasedOnStyle: WebKit AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignArrayOfStructures: None AlignConsecutiveAssignments: None AlignConsecutiveBitFields: Consecutive AlignConsecutiveDeclarations: None AlignConsecutiveMacros: None AlignEscapedNewlines: Left AlignOperands: Align AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortCaseLabelsOnASingleLine: true AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLambdasOnASingleLine: Inline AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None 
AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true AttributeMacros: - JEMALLOC_NOTHROW - FEX_ALIGNED - FEX_ANNOTATE - FEX_DEFAULT_VISIBILITY - FEX_NAKED - FEX_PACKED - FEXCORE_PRESERVE_ALL_ATTR - GLIBC_ALIAS_FUNCTION BinPackArguments: true BinPackParameters: true BitFieldColonSpacing: Both BreakAfterAttributes: Leave BreakBeforeBraces: Attach BreakBeforeBinaryOperators: None BreakBeforeInlineASMColon: OnlyMultiline # clang 16 required BreakBeforeTernaryOperators: false BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeColon ColumnLimit: 140 CompactNamespaces: false ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true DerivePointerAlignment: false EmptyLineAfterAccessModifier: Leave EmptyLineBeforeAccessModifier: Leave ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: false IndentCaseLabels: false IndentExternBlock: AfterExternBlock IndentGotoLabels: false IndentPPDirectives: None IndentRequires: false IndentWidth: 2 InsertBraces: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature LineEnding: LF # clang 16 required MaxEmptyLinesToKeep: 2 NamespaceIndentation: Inner QualifierAlignment: Left PackConstructorInitializers: Never PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 2 PenaltyBreakOpenParenthesis: 2 PenaltyBreakString: 10 PenaltyBreakTemplateDeclaration: 8 PenaltyExcessCharacter: 2 PenaltyReturnTypeOnItsOwnLine: 16 PointerAlignment: Left RemoveBracesLLVM: false ReferenceAlignment: Left ReflowComments: true RequiresClausePosition: WithPreceding SeparateDefinitionBlocks: Leave SortIncludes: Never SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: true SpaceBeforeInheritanceColon: true 
SpaceBeforeParens: Custom SpaceBeforeParensOptions: AfterControlStatements: true AfterFunctionDeclarationName: false AfterFunctionDefinitionName: false AfterOverloadedOperator: false AfterRequiresInClause: true BeforeNonEmptyParentheses: false SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Leave SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInParentheses: false Standard: c++20 UseTab: Never ================================================ FILE: .clang-format-ignore ================================================ # This file is used to ignore files and directories from clang-format Source/Common/cpp-optparse/* # Files with human-indented tables for readability - don't mess with these FEXCore/Source/Interface/Core/X86Tables/*.cpp # Inline headers with list-like content that can't be processed individually Source/Tools/LinuxEmulation/LinuxSyscalls/x*/SyscallsNames.inl Source/Tools/LinuxEmulation/LinuxSyscalls/x*/Ioctl/*.inl # Include files in unittests unittests/*ASM/Includes/*.inc ================================================ FILE: .git-blame-ignore-revs ================================================ # Since version 2.23 (released in August 2019), git-blame has a feature # to ignore or bypass certain commits. # # This file contains a list of commits that are not likely what you # are looking for in a blame, such as mass reformatting or renaming. # You can set this file as a default ignore file for blame by running # the following command. 
# # $ git config blame.ignoreRevsFile .git-blame-ignore-revs # Whole tree reformat PR#3571 2b4ec88daebd35fefb5bf5c73d7fc2b4155771ed # Second reformat to find fixed point PR#3577 905aa935f5ce344a48ef4d5edab3c31efa8d793e # Reformat of CodeEmitter inl files 8760c593ece92d7e9fa94c40da0368fd367c9cad # Whole-tree reformat with clang-format-19 5267cde60e7642852d18f20ae8568643bb5293d5 # Minor reformat with clang-format-19 9fdd96af61c969cb5732471223f00eda64b7a069 # Reformat of X86Tables.h ba2b0ef809f66f1a6d334f000798fa2ceafab26f ================================================ FILE: .github/ISSUE_TEMPLATE/potential-game-bug.md ================================================ --- name: Potential Game Bug about: A bug in FEX-Emu that causes a problem in a game title: "[Game]: [Short Problem Description]" labels: Game related assignees: '' --- **What Game** The game name. A link to the storefront where to get the game. GOG, Steam, Itch.io, etc **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots and Video** If applicable, add screenshots and video to help explain your problem. **System information:** - OS: [eg: Ubuntu 21.10] - CPU/SoC: [eg: Snapdragon 888, Intel Core i9-12900K] - Video driver version: [eg: OpenGL ES 3.2 Mesa 22.0.0-devel (git-9ff086052a)] - RootFS used: [eg: Ubuntu 21.10 Official Rootfs] - FEX version: (FEXGetConfig --version) [eg: FEX-2112-155-gc691d709] - Thunks Enabled: [Yes/No] **Additional context** - Is this an x86 or x86-64 game: [x86/x86-64/Both] - Does this reproduce on AArch64 with Radeon/Intel/Nvidia: [Yes/No/Untested] - Is this a Vulkan game: [Yes/No/Unknown] - If Yes, What is your Vulkan driver: Add any other context about the problem here. 
================================================ FILE: .github/workflows/ccpp.yml ================================================ name: Build + Test on: push: branches: - main pull_request: branches: - main env: # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) BUILD_TYPE: Release CC: clang CXX: clang++ FEX_PORTABLE: 1 jobs: build_plus_test: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, ARMv8.0], [self-hosted, ARMv8.2], [self-hosted, ARMv8.4]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner info run: | echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV echo "runner_name=$(hostname)" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Configure CMake run: | cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -G Ninja -DENABLE_LTO=False -DENABLE_ASSERTIONS=True \ -DENABLE_X86_HOST_DEBUG=True -DBUILD_FEX_LINUX_TESTS=True -DBUILD_THUNKS=True \ -DCMAKE_INSTALL_PREFIX="$PWD"/build/install # These steps make a lot of noise but rarely fail. 
# Put them in a separate step to make normal build logs easier to parse - name: Noisy Build Targets run: cmake --build build --target asm_files 32bit_asm_files JemallocLibs Catch2 vixl cephes_128bit - name: Build id: build run: cmake --build build - name: Install run: cmake --build build --target install # GCC tests - name: GCC64 Target Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: gcc_target_tests_64 - name: GCC32 Target Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: gcc_target_tests_32 # API tests - name: API Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: api_tests - name: FEXCore API Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: fexcore_apitests # ARM emission tests - name: ARM Emitter Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: emitter_tests # Linux tests - name: FEX Linux Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: fex_linux_tests_all env: FEX_PORTABLE: 0 # Thunking - name: Thunkgen tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: thunkgen_tests - name: Test GL No-Thunks if: ${{ steps.build.outcome == 'success' && matrix.arch[1] == 'x64' }} uses: ./.github/workflows/test with: target: thunk_functional_tests_nothunks env: DISPLAY: ':0' - name: Test GL Thunks if: ${{ steps.build.outcome == 'success' && matrix.arch[1] == 'x64' }} uses: ./.github/workflows/test with: target: thunk_functional_tests_thunks env: DISPLAY: ':0' # ASM tests - name: ASM Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: asm_tests # POSIX tests - name: POSIX Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: posix_tests # GVisor tests - name: GVisor Tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: 
target: gvisor_tests # Struct verifier tests - name: Struct verifier tests if: steps.build.outcome == 'success' uses: ./.github/workflows/test with: target: struct_verifier - name: Remove old SHM regions if: ${{ always() }} run: cmake --build build --target remove_old_shm_regions - name: Upload results if: ${{ always() }} uses: actions/upload-artifact@v6 timeout-minutes: 1 with: name: Results-${{ env.runner_name }}-${{ env.runner_label }} path: results/*.log retention-days: 3 ================================================ FILE: .github/workflows/glibc_fault.yml ================================================ name: GLIBC fault test # This workflow file is the same as the `Build + Test` with some key differences # - Runs on any x86 and ARM64 runner # - Disables the glibc jemalloc compile option # - Enables the glibc allocator fault option # - Disables gvisor tests to reduce stress on CI machines (tmp/shm tests overwhelm them) # - Disables thunk tests since they are incompatible with glibc fault allocator # - Disables ARMEmitter tests (We don't want to fault test vixl's disassembler) on: push: branches: - main pull_request: branches: - main env: # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) BUILD_TYPE: Release CC: clang CXX: clang++ FEX_PORTABLE: 1 jobs: glibc_fault_test: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, ARM64]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner info run: | echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV echo "runner_name=$(hostname)" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Configure CMake run: | cmake -S . 
-B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -G Ninja -DENABLE_LTO=False \ -DENABLE_ASSERTIONS=True -DENABLE_X86_HOST_DEBUG=True -DBUILD_FEX_LINUX_TESTS=True \ -DENABLE_GLIBC_ALLOCATOR_HOOK_FAULT=True -DENABLE_JEMALLOC_GLIBC_ALLOC=False \ -DCMAKE_INSTALL_PREFIX="$PWD"/build/install # These steps make a lot of noise but rarely fail. # Put them in a separate step to make normal build logs easier to parse - name: Noisy Build Targets run: cmake --build build --target asm_files 32bit_asm_files JemallocLibs Catch2 vixl cephes_128bit - name: Build run: cmake --build build - name: Install run: cmake --build build --target install # GCC tests - name: GCC64 Target Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: gcc_target_tests_64 - name: GCC32 Target Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: gcc_target_tests_32 # API Tests - name: API Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: api_tests - name: FEXCore API Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: fexcore_apitests # Linux tests - name: FEX Linux Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: fex_linux_tests_all # ASM Tests - name: ASM Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: asm_tests # POSIX Tests - name: POSIX Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: posix_tests - name: Remove old SHM regions if: ${{ always() }} run: cmake --build build --target remove_old_shm_regions - name: Upload results if: ${{ always() }} uses: actions/upload-artifact@v6 timeout-minutes: 1 with: name: Results-${{ env.runner_name }}-${{ env.runner_label }} path: results/*.log retention-days: 3 ================================================ FILE: .github/workflows/hostrunner.yml ================================================ name: Hostrunner tests on: push: branches: - main pull_request: branches: - main env: # Customize the CMake build type here (Release, Debug, 
RelWithDebInfo, etc.) BUILD_TYPE: Release CC: clang CXX: clang++ FEX_PORTABLE: 1 jobs: hostrunner_tests: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, x64]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner info run: | echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV echo "runner_name=$(hostname)" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Configure CMake run: | cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -G Ninja -DENABLE_LTO=False \ -DENABLE_ASSERTIONS=True -DENABLE_X86_HOST_DEBUG=True # These steps make a lot of noise but rarely fail. # Put them in a separate step to make normal build logs easier to parse - name: Noisy Build Targets run: cmake --build build --target asm_files 32bit_asm_files JemallocLibs Catch2 vixl cephes_128bit - name: Build run: cmake --build build # ASM tests - name: ASM Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: asm_tests - name: Upload results if: ${{ always() }} uses: actions/upload-artifact@v6 timeout-minutes: 1 with: name: Results-${{ env.runner_name }}-${{ env.runner_label }} path: results/*.log retention-days: 3 ================================================ FILE: .github/workflows/instcountci.yml ================================================ name: Instruction Count CI run on: push: branches: - main pull_request: branches: - main env: # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 
BUILD_TYPE: Release CC: clang CXX: clang++ jobs: instcountci_tests: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, x64], [self-hosted, ARM64]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner info run: | echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV echo "runner_name=$(hostname)" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Set VIXL_SIM_ENABLED run: | case '${{ matrix.arch[1] }}' in x64) _sim=True ;; ARM64) _sim=False ;; esac echo "VIXL_SIM_ENABLED=$_sim" >> $GITHUB_ENV - name: Configure CMake run: | cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -G Ninja -DENABLE_VIXL_SIMULATOR=$VIXL_SIM_ENABLED \ -DENABLE_VIXL_DISASSEMBLER=True -DENABLE_LTO=False -DENABLE_ASSERTIONS=True -DENABLE_X86_HOST_DEBUG=True - name: Build env: FEX_DISABLETELEMETRY: 1 run: cmake --build build --target CodeSizeValidation instcountci_test_files - name: Instruction Count Tests if: ${{ always() }} uses: ./.github/workflows/test with: target: instcountci_tests - name: Update local repo instcount if: ${{ always() }} run: cmake --build build --target instcountci_update_tests - name: Check InstCountCI diff if: ${{ always() }} run: git --no-pager diff --exit-code HEAD - name: Upload results if: ${{ always() }} uses: actions/upload-artifact@v6 timeout-minutes: 1 with: name: Results-${{ env.runner_name }}-${{ env.runner_label }} path: results/*.log retention-days: 3 ================================================ FILE: .github/workflows/mingw_build.yml ================================================ name: Mingw build on: push: branches: - main pull_request: branches: - main env: BUILD_TYPE: Debug jobs: mingw_build: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, ARM64, mingw], [self-hosted, ARM64EC, mingw, ARM64]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner label run: 
echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV - name: Add MinGW to PATH run: echo "$HOME/llvm-mingw/build/bin/" >> $GITHUB_PATH - name: Set CC run: | case '${{ matrix.arch[1] }}' in x64) _cpu=x86_64 ;; ARM64) _cpu=aarch64 ;; ARM64EC) _cpu=arm64ec ;; esac echo "MINGW_TRIPLE=${_cpu}-w64-mingw32" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Configure CMake run: | cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/Data/CMake/toolchain_mingw.cmake \ -DMINGW_TRIPLE=$MINGW_TRIPLE -G Ninja -DENABLE_LTO=False -DENABLE_ASSERTIONS=True -DENABLE_X86_HOST_DEBUG=True -DBUILD_TESTING=False \ -DCMAKE_INSTALL_PREFIX="$PWD"/build/install - name: Build run: cmake --build build ================================================ FILE: .github/workflows/pr-code-format.yml ================================================ # Inspired by LLVM's pr-code-format.yml at # https://github.com/llvm/llvm-project/blob/main/.github/workflows/pr-code-format.yml name: Check code formatting on: pull_request: branches: - main jobs: code_formatter: runs-on: [self-hosted, X64] if: github.repository == 'FEX-Emu/FEX' steps: - name: Checkout uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: Checkout through merge base uses: rmacklin/fetch-through-merge-base@v0 timeout-minutes: 3 with: base_ref: ${{ github.event.pull_request.base.ref }} head_ref: ${{ github.event.pull_request.head.sha }} deepen_length: 500 - name: Get changed files run: | BASE=$(git merge-base main HEAD) FILES=$(git diff --name-only "$BASE" | tr '\n' ',' | sed 's/,$//') echo "CHANGED_FILES=$FILES" >> $GITHUB_ENV echo "Changed files:" echo "$FILES" - name: Check git-clang-format-19 exists run: which git-clang-format-19 - name: Setup Python env uses: actions/setup-python@v4 with: python-version: 3.11 cache: pip cache-dependency-path: ./External/code-format-helper/requirements_formatting.txt - name: Install python 
dependencies run: pip install -r ./External/code-format-helper/requirements_formatting.txt - name: Run code formatter env: CLANG_FORMAT_PATH: git-clang-format-19 GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} START_REV: ${{ github.event.pull_request.base.sha }} END_REV: ${{ github.event.pull_request.head.sha }} run: | python ./External/code-format-helper/code-format-helper.py \ --repo "FEX-Emu/FEX" \ --issue-number "$GITHUB_PR_NUMBER" \ --start-rev "$START_REV" \ --end-rev "$END_REV" \ --changed-files "$CHANGED_FILES" ================================================ FILE: .github/workflows/setup-env/action.yml ================================================ name: Setup Build Environment description: Setup RootFS and build environment inputs: setup-rootfs: description: 'Whether or not to set up the rootfs' default: true runs: using: composite steps: - name: Set rootfs paths if: ${{ inputs.setup-rootfs == 'true' }} shell: bash run: | echo "FEX_ROOTFS_MOUNT=/mnt/AutoNFS/rootfs/" >> $GITHUB_ENV echo "FEX_ROOTFS_PATH=$HOME/Rootfs/" >> $GITHUB_ENV echo "FEX_ROOTFS=$HOME/Rootfs/" >> $GITHUB_ENV - name: Update RootFS cache if: ${{ inputs.setup-rootfs == 'true' }} shell: bash run: python3 Scripts/CI_FetchRootFS.py - name: Checkout Submodules shell: bash run: | git submodule sync --recursive git submodule update --init --depth 1 - name: Clean Build Environment shell: bash run: rm -Rf build ================================================ FILE: .github/workflows/steamrt4.yml ================================================ name: steamrt4 build on: push: branches: - main pull_request: branches: - main env: DEBIAN_FRONTEND: noninteractive BUILD_TYPE: Release CC: clang CXX: clang++ jobs: steamrt4_build: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, ARM64, distrobox]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner label run: echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV - 
name: Setup Build Environment uses: ./.github/workflows/setup-env with: setup-rootfs: false # Setup everything required. - name : distrobox setup run: | distrobox create -Y -i registry.gitlab.steamos.cloud/steamrt/steamrt4/sdk/arm64:4.0.20251117.183306 steamrt4 || true distrobox upgrade steamrt4 distrobox enter --name steamrt4 -- sudo apt-get install -y \ git cmake ninja-build ccache \ lld clang \ libclang-dev llvm-dev \ libstdc++-14-dev-i386-cross libgcc-14-dev-i386-cross \ libstdc++-14-dev-amd64-cross libgcc-14-dev-amd64-cross - name: Configure CMake run: | distrobox enter --name steamrt4 -- cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -G Ninja -DBUILD_STEAM_SUPPORT=True -DENABLE_LTO=True -DENABLE_ASSERTIONS=False -DBUILD_THUNKS=True \ -DBUILD_FEXCONFIG=False -DBUILD_TESTING=False -DENABLE_CLANG_THUNKS=True -DUSE_LINKER=lld \ -DCMAKE_INSTALL_PREFIX=/usr - name: Build run: distrobox enter --name steamrt4 -- cmake --build build - name: install run: DESTDIR="$PWD"/install distrobox enter --name steamrt4 -- cmake --build build -t install - name: Upload libraries uses: actions/upload-artifact@v6 timeout-minutes: 1 with: overwrite: true name: steamrt4_steampipe_depot path: ${{ github.workspace }}/install/* retention-days: 60 compression-level: 9 ================================================ FILE: .github/workflows/test/action.yml ================================================ name: Run Test and Store Logs description: Run a test and store the log. 
inputs: target: description: 'The test target to run' required: true runs: using: composite steps: - name: Run Tests shell: bash run: cmake --build build --target ${{ inputs.target }} - name: Move and Truncate Results if: ${{ always() }} shell: bash run: | mkdir -p results mv build/Testing/Temporary/LastTest.log results/${{ inputs.target }}.log || true truncate --size="<20M" results/${{ inputs.target }}.log || true ================================================ FILE: .github/workflows/vixl_simulator.yml ================================================ name: Vixl Simulator run on: push: branches: - main pull_request: branches: - main env: # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) BUILD_TYPE: Release CC: clang CXX: clang++ FEX_PORTABLE: 1 jobs: vixl_simulator: runs-on: ${{ matrix.arch }} strategy: matrix: # Only the x86-64 runner is fast enough to run this arch: [[self-hosted, x64], [self-hosted, ARMv8.4]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Set runner info run: | echo "runner_label=${{ matrix.arch[1] }}" >> $GITHUB_ENV echo "runner_name=$(hostname)" >> $GITHUB_ENV - name: Setup Build Environment uses: ./.github/workflows/setup-env - name: Configure CMake run: | cmake -S . -B build -DCMAKE_BUILD_TYPE=$BUILD_TYPE -G Ninja -DENABLE_VIXL_SIMULATOR=True -DENABLE_LTO=False \ -DENABLE_VIXL_DISASSEMBLER=True -DENABLE_ASSERTIONS=True -DENABLE_X86_HOST_DEBUG=True # These steps make a lot of noise but rarely fail. 
# Put them in a separate step to make normal build logs easier to parse - name: Noisy Build Targets run: cmake --build build --target asm_files 32bit_asm_files JemallocLibs Catch2 vixl cephes_128bit - name: Build run: cmake --build build - name: ASM Tests - SVE256 if: ${{ always() }} uses: ./.github/workflows/test with: target: asm_tests - name: ASM Tests - SVE128 if: ${{ always() }} uses: ./.github/workflows/test env: FEX_FORCESVEWIDTH: "128" with: target: asm_tests - name: ASM Tests - ASIMD if: ${{ always() }} uses: ./.github/workflows/test env: FEX_HOSTFEATURES: "disablesve" with: target: asm_tests - name: Upload results if: ${{ always() }} uses: actions/upload-artifact@v6 timeout-minutes: 1 with: name: Results-${{ env.runner_name }}-${{ env.runner_label }} path: results/*.log retention-days: 3 ================================================ FILE: .github/workflows/wine_build/action.yml ================================================ name: Wine DLL Build description: Build a wow64 or arm64ec Wine DLL inputs: target: description: 'The target (arm64ec or wow64)' required: true runs: using: composite steps: - name: Clean Build Environment shell: bash run: rm -Rf build_${{ inputs.target }} - name: Configure CMake shell: bash run: | case "${{ inputs.target }}" in wow64) _cc=aarch64 ;; arm64ec) _cc=arm64ec ;; esac cmake -S . 
-B build_${{ inputs.target }} -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_TOOLCHAIN_FILE=Data/CMake/toolchain_mingw.cmake \ -DMINGW_TRIPLE=${_cc}-w64-mingw32 -DCMAKE_INSTALL_LIBDIR=/usr/lib/wine/aarch64-windows -G Ninja \ -DENABLE_LTO=False -DENABLE_ASSERTIONS=False -DENABLE_JEMALLOC_GLIBC_ALLOC=False \ -DBUILD_TESTING=False -DCMAKE_INSTALL_PREFIX=/usr -DTUNE_ARCH=generic -DTUNE_CPU=none - name: Build shell: bash run: cmake --build build_${{ inputs.target }} - name: Install shell: bash run: DESTDIR="$PWD"/install cmake --build build_${{ inputs.target }} -t install ================================================ FILE: .github/workflows/wine_dll_artifacts.yml ================================================ name: Wine DLL artifacts on: push: branches: - main env: BUILD_TYPE: Release jobs: wine_dll_artifacts: runs-on: ${{ matrix.arch }} strategy: matrix: arch: [[self-hosted, ARM64, mingw]] fail-fast: false steps: - uses: actions/checkout@v6 with: fetch-depth: '0' fetch-tags: 'true' - name: Add MingGW to PATH run: echo "$HOME/llvm-mingw/build/bin/" >> $GITHUB_PATH - name: Checkout Submodules # Need to update submodules run: | git submodule sync --recursive git submodule update --init --depth 1 - name: Clean install directory run: rm -Rf install - name: Build (wow64) uses: ./.github/workflows/wine_build with: target: wow64 - name: Build (arm64ec) uses: ./.github/workflows/wine_build with: target: arm64ec - name: Upload libraries uses: actions/upload-artifact@v6 timeout-minutes: 1 with: overwrite: true name: wine_dll_artifacts path: ${{ github.workspace }}/install/usr/lib/wine/aarch64-windows/lib*.dll retention-days: 60 compression-level: 9 ================================================ FILE: .gitignore ================================================ # Existing compile_commands.json vim_rc Config.json [Bb]uild* [Bb]in/ out/ .vscode/ .vs/ *.pyc .cache .idea/ CMakeLists.txt.user ================================================ FILE: .gitlab-ci.yml 
================================================ spec: inputs: PROMOTE_BRANCH: description: "Branch to promote the build to. Empty means no promotion." default: "bleeding-edge" --- workflow: rules: - when: always variables: PROMOTE_BRANCH: $[[ inputs.PROMOTE_BRANCH ]] variables: DEBIAN_FRONTEND: noninteractive GIT_SUBMODULE_STRATEGY: recursive GIT_DEPTH: 0 CC: clang CXX: clang++ build: stage: build image: registry.gitlab.steamos.cloud/steamrt/steamrt4/sdk/arm64:4.0.20251117.183306 tags: - docker - linux - arm64 - aarch64 script: - apt-get -y update - apt-get install -y git cmake ninja-build ccache lld clang libclang-dev llvm-dev libstdc++-14-dev-i386-cross libgcc-14-dev-i386-cross libstdc++-14-dev-amd64-cross libgcc-14-dev-amd64-cross - cmake -E make_directory build/ - cmake -DCMAKE_BUILD_TYPE=Release -G Ninja -DBUILD_STEAM_SUPPORT=True -DENABLE_LTO=True -DENABLE_ASSERTIONS=False -DBUILD_THUNKS=True -DBUILD_FEXCONFIG=False -DBUILD_TESTING=False -DENABLE_CLANG_THUNKS=True -DUSE_LINKER=lld -DCMAKE_INSTALL_PREFIX=/usr -DTUNE_ARCH=armv8.2-a -DTUNE_CPU=none . -B build/ - cmake --build build/ --config Release - DESTDIR=$(pwd)/install/ cmake --build build/ --config Release -t install artifacts: name: "steamrt artifacts" untracked: false paths: - install/ promote: stage: deploy variables: GIT_STRATEGY: none image: registry.gitlab.steamos.cloud/steamrt/steamrt4/sdk/arm64:4.0.20251117.183306 tags: - docker - linux - arm64 - aarch64 rules: - if: '$PROMOTE_BRANCH' before_script: - apt-get -y update - apt-get install -y tmux curl script: # uncomment the following lines to debug: SSH in via GCP, go into the container and attach to the session (with `tmux attach -t debug`) # - tmux new-session -d -s debug # - while tmux has-session -t debug 2>/dev/null; do sleep 1; done # ref controls which fex-depot code runs the pipeline, while VERSION_PARAM controls which fex branch's artifacts that pipeline downloads.
- > curl --fail --location --request POST --form token=${FEX_DEPOT_TRIGGER_TOKEN} --form ref=master --form "variables[PROMOTE_BRANCH]=${PROMOTE_BRANCH}" --form "variables[VERSION_PARAM]=${CI_COMMIT_REF_NAME}" "${CI_API_V4_URL}/projects/fex%2Ffex-depot/trigger/pipeline" ================================================ FILE: .gitmodules ================================================ [submodule "External/vixl"] shallow = true path = External/vixl url = https://github.com/FEX-Emu/vixl.git [submodule "External/cpp-optparse"] path = Source/Common/cpp-optparse url = https://github.com/Sonicadvance1/cpp-optparse [submodule "External/fex-posixtest-bins"] shallow = true path = External/fex-posixtest-bins url = https://github.com/FEX-Emu/fex-posixtest-bins.git [submodule "External/fex-gvisor-tests-bins"] shallow = true path = External/fex-gvisor-tests-bins url = https://github.com/FEX-Emu/fex-gvisor-tests-bins.git [submodule "External/fex-gcc-target-tests-bins"] shallow = true path = External/fex-gcc-target-tests-bins url = https://github.com/FEX-Emu/fex-gcc-target-tests-bins.git [submodule "External/fmt"] path = External/fmt url = https://github.com/fmtlib/fmt.git [submodule "External/drm-headers"] path = External/drm-headers url = https://github.com/FEX-Emu/drm-headers.git [submodule "External/xxhash"] path = External/xxhash url = https://github.com/Cyan4973/xxHash.git [submodule "External/Catch2"] path = External/Catch2 url = https://github.com/catchorg/Catch2.git [submodule "External/Vulkan-Headers"] shallow = true path = External/Vulkan-Headers url = https://github.com/KhronosGroup/Vulkan-Headers.git [submodule "External/jemalloc_glibc"] path = External/jemalloc_glibc url = https://github.com/FEX-Emu/jemalloc.git [submodule "External/tracy"] path = External/tracy url = https://github.com/wolfpld/tracy [submodule "External/range-v3"] path = External/range-v3 url = https://github.com/ericniebler/range-v3.git [submodule "External/zydis"] shallow = true path = 
External/zydis url = https://github.com/zyantific/zydis.git [submodule "External/unordered_dense"] path = External/unordered_dense url = https://github.com/martinus/unordered_dense.git [submodule "External/rpmalloc"] path = External/rpmalloc url = https://github.com/FEX-Emu/rpmalloc.git ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) project(FEX C CXX ASM) include(CheckIncludeFiles) check_include_files("gdb/jit-reader.h" HAVE_GDB_JIT_READER_H) option(BUILD_FEX_LINUX_TESTS "Build FEXLinuxTests (requires x86 compiler)" FALSE) option(BUILD_THUNKS "Build thunks" FALSE) option(BUILD_FEXCONFIG "Build FEXConfig" TRUE) option(ENABLE_CLANG_THUNKS "Build thunks with clang" TRUE) option(ENABLE_IWYU "Enable the Include What You Use sanitizer" FALSE) option(ENABLE_LTO "Enable LTO with compilation" TRUE) option(ENABLE_XRAY "Enable building with LLVM X-Ray" FALSE) set(USE_LINKER "" CACHE STRING "Path to a custom linker program") option(ENABLE_UBSAN "Enable the Clang Undefined Behavior Sanitizer" FALSE) option(ENABLE_ASAN "Enable the Clang Address Sanitizer" FALSE) option(ENABLE_TSAN "Enable the Clang Thread Sanitizer" FALSE) option(ENABLE_COVERAGE "Enable Code Coverage" FALSE) option(ENABLE_ASSERTIONS "Enable debug assertions" FALSE) option(ENABLE_GDB_SYMBOLS "Enable GDBSymbols integration support" ${HAVE_GDB_JIT_READER_H}) option(ENABLE_STRICT_WERROR "Enable stricter -Werror" FALSE) option(ENABLE_WERROR "Enable -Werror" FALSE) option(ENABLE_FEX_ALLOCATOR "Enable allocator for FEX" TRUE) option(ENABLE_JEMALLOC_GLIBC_ALLOC "Enable jemalloc glibc allocator" TRUE) option(ENABLE_OFFLINE_TELEMETRY "Enable FEX offline telemetry" TRUE) option(ENABLE_COMPILE_TIME_TRACE "Enable time trace compile option" FALSE) option(ENABLE_LIBCXX "Use LLVM's libc++ instead of the GNU libstdc++" FALSE) option(ENABLE_CCACHE "Enable ccache for build caching" TRUE) option(ENABLE_VIXL_SIMULATOR "Use 
the VIXL simulator for emulation (only useful for CI testing)" FALSE) option(ENABLE_VIXL_DISASSEMBLER "Enable debug disassembler output with VIXL" FALSE) option(ENABLE_ZYDIS "Enable x86/x86-64 guest disassembler output with Zydis" FALSE) option(USE_LEGACY_BINFMTMISC "Use legacy method of setting up binfmt_misc" FALSE) option(ENABLE_FEXCORE_PROFILER "Enable FEXCore's timeline profiling capabilities" FALSE) set(FEXCORE_PROFILER_BACKEND "gpuvis" CACHE STRING "Set which backend to use for FEXCore's profiler") set_property(CACHE FEXCORE_PROFILER_BACKEND PROPERTY STRINGS gpuvis tracy) option(ENABLE_GLIBC_ALLOCATOR_HOOK_FAULT "Enables glibc memory allocation hooking with fault for CI testing") option(USE_PDB_DEBUGINFO "Build debug info in PDB format" FALSE) option(BUILD_STEAM_SUPPORT "Enable Steam integration" FALSE) set(X86_32_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/Data/CMake/toolchain_x86_32.cmake" CACHE FILEPATH "Toolchain file for the (cross-)compiler targeting i686") set(X86_64_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/Data/CMake/toolchain_x86_64.cmake" CACHE FILEPATH "Toolchain file for the (cross-)compiler targeting x86_64") set(X86_DEV_ROOTFS "/" CACHE FILEPATH "Path to the sysroot used for cross-compiling for i686 and x86_64") set(DATA_DIRECTORY "" CACHE PATH "Global data directory (override)") set(HOSTLIBS_DATA_DIRECTORY "" CACHE PATH "Global data directory (override)") if (NOT DATA_DIRECTORY) set(DATA_DIRECTORY "${CMAKE_INSTALL_PREFIX}/share/fex-emu") endif() include(GNUInstallDirs) if (NOT HOSTLIBS_DATA_DIRECTORY) set(HOSTLIBS_DATA_DIRECTORY "${CMAKE_INSTALL_FULL_LIBDIR}/fex-emu") endif() ## Platform Checks ## # Only 64-bit Linux and Windows are supported # NB: SIZEOF_VOID_P is in bytes, not bits # On 32-bit systems this is set to 4 if (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(FATAL_ERROR "Unsupported pointer size ${CMAKE_SIZEOF_VOID_P}." " FEX only supports 64-bit (8-byte pointer) systems." 
" If you believe this is in error, file an issue.") elseif (NOT (WIN32 OR CMAKE_SYSTEM_NAME STREQUAL "Linux")) message(FATAL_ERROR "Unsupported system type ${CMAKE_SYSTEM_NAME}." " FEX only supports Linux and Windows." " If you believe this is in error, file an issue.") endif() ## Compiler Checks ## # GCC and MSVC are unsupported if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") message(FATAL_ERROR "FEX doesn't support GCC! Use Clang instead.") elseif (MSVC) message(FATAL_ERROR "FEX doesn't support MSVC! Use Clang on MinGW instead.") elseif (MINGW) message(STATUS "Building for MinGW") set(ENABLE_FEX_ALLOCATOR TRUE) set(ENABLE_JEMALLOC_GLIBC_ALLOC FALSE) else () message(STATUS "Clang version ${CMAKE_CXX_COMPILER_VERSION}") set(CLANG_MINIMUM_VERSION 13.0) if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION}) message(FATAL_ERROR "Clang version too old for FEX. Need at least ${CLANG_MINIMUM_VERSION} but has ${CMAKE_CXX_COMPILER_VERSION}") endif() endif() ## Architecture Handling ## string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} processor) if (processor MATCHES "x86|amd64") option(ENABLE_X86_HOST_DEBUG "Enables compiling on x86_64 host" FALSE) if (NOT ENABLE_X86_HOST_DEBUG) message(FATAL_ERROR " FEX doesn't support compiling for x86-64 hosts!" " This is /only/ a supported configuration for FEX CI and nothing else!") else() message(STATUS "x86_64 debug build") endif() set(ARCHITECTURE_x86_64 1) add_compile_definitions(ARCHITECTURE_x86_64=1) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcx16") elseif (processor MATCHES "^aarch64|^arm64|^armv8\.*") set(ARCHITECTURE_arm64 1) add_compile_definitions(ARCHITECTURE_arm64=1) # arm64ec needs to define both arm64 and arm64ec if (processor MATCHES "^arm64ec") set(ARCHITECTURE_arm64ec 1) add_compile_definitions(ARCHITECTURE_arm64ec=1) endif() endif() if (NOT (ARCHITECTURE_arm64 OR ARCHITECTURE_arm64ec OR ARCHITECTURE_x86_64)) message(FATAL_ERROR "Unsupported processor type ${processor}." 
" If you believe this is in error, file an issue.") endif() if (BUILD_STEAM_SUPPORT) add_compile_definitions(FEX_STEAM_SUPPORT=1) endif() if (ENABLE_FEXCORE_PROFILER) add_compile_definitions(ENABLE_FEXCORE_PROFILER=1) string(TOUPPER "${FEXCORE_PROFILER_BACKEND}" FEXCORE_PROFILER_BACKEND) if (FEXCORE_PROFILER_BACKEND STREQUAL "GPUVIS") add_compile_definitions(FEXCORE_PROFILER_BACKEND=1) elseif (FEXCORE_PROFILER_BACKEND STREQUAL "TRACY") add_compile_definitions(FEXCORE_PROFILER_BACKEND=2) add_compile_definitions(TRACY_ENABLE=1) # Required so that Tracy will only start in the selected guest application add_compile_definitions(TRACY_MANUAL_LIFETIME=1) add_compile_definitions(TRACY_DELAYED_INIT=1) # This interferes with FEX's signal handling add_compile_definitions(TRACY_NO_CRASH_HANDLER=1) # Tracy can gather call stack samples in regular intervals, but this # isn't useful for us since it would usually sample opaque JIT code add_compile_definitions(TRACY_NO_SAMPLING=1) # This pulls in libbacktrace which allocators in global constructors (before FEX can set up its allocator hooks) add_compile_definitions(TRACY_NO_CALLSTACK=1) if (MINGW) message(FATAL_ERROR "Tracy profiler not supported on MinGW") endif() else() message(FATAL_ERROR "Unknown FEXCore profiler backend ${FEXCORE_PROFILER_BACKEND}") endif() endif() if (ENABLE_JEMALLOC_GLIBC_ALLOC AND ENABLE_GLIBC_ALLOCATOR_HOOK_FAULT) message(FATAL_ERROR "Can't have both glibc fault allocator and jemalloc glibc allocator enabled at the same time") endif() if (ENABLE_GLIBC_ALLOCATOR_HOOK_FAULT) add_compile_definitions(GLIBC_ALLOCATOR_FAULT=1) endif() # uninstall target if(NOT TARGET uninstall) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/Data/CMake/cmake_uninstall.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cmake_uninstall.cmake" IMMEDIATE @ONLY) add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cmake_uninstall.cmake) endif() # These options are meant for package management 
set(TUNE_CPU "native" CACHE STRING "Override the CPU the build is tuned for")
set(TUNE_ARCH "generic" CACHE STRING "Override the Arch the build is tuned for")
set(OVERRIDE_VERSION "detect" CACHE STRING "Override the FEX version")
set(OVERRIDE_HASH "detect" CACHE STRING "Override the FEX git hash")

# Single-config generators get a Release build when no build type was requested.
get_property(IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if (NOT IS_MULTI_CONFIG AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
  message(STATUS "No build type set, defaulting to a Release build")
endif()

# Normalise the build type to upper case so the DEBUG match below is case-insensitive.
string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE)
if (CMAKE_BUILD_TYPE MATCHES "DEBUG")
  # Debug builds always enable assertions, regardless of the ENABLE_ASSERTIONS option.
  set(ENABLE_ASSERTIONS TRUE)
endif()

if (ENABLE_ASSERTIONS)
  message(STATUS "Assertions enabled")
  add_compile_definitions(ASSERTIONS_ENABLED=1)
endif()

if (ENABLE_GDB_SYMBOLS)
  message(STATUS "GDBSymbols support enabled")
  add_compile_definitions(GDB_SYMBOLS_ENABLED=1)
endif()

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/Bin)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
cmake_policy(SET CMP0083 NEW) # Follow new PIE policy
include(CheckPIESupported)
check_pie_supported()
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ${ENABLE_LTO})

# Probe whether the compiler accepts __attribute__((preserve_all)).
# -Werror=attributes turns an ignored/unknown attribute into a compile failure
# so the probe result reflects real support.
include(CheckCXXSourceCompiles)
set(CMAKE_REQUIRED_FLAGS "-std=c++11 -Wattributes -Werror=attributes")
check_cxx_source_compiles(
  "
  __attribute__((preserve_all))
  int Testy(int a, int b, int c, int d, int e, int f) {
    return a + b + c + d + e + f;
  }
  int main() {
    return Testy(0, 1, 2, 3, 4, 5);
  }"
  HAS_CLANG_PRESERVE_ALL)
unset(CMAKE_REQUIRED_FLAGS)

if (HAS_CLANG_PRESERVE_ALL)
  if (MINGW)
    message(STATUS "Ignoring broken clang::preserve_all support")
    set(HAS_CLANG_PRESERVE_ALL FALSE)
  else()
    message(STATUS "Has clang::preserve_all")
  endif()
endif()

# The attribute is only used on arm64; everywhere else the macro expands to nothing.
if (ARCHITECTURE_arm64 AND HAS_CLANG_PRESERVE_ALL)
  add_compile_definitions("FEX_PRESERVE_ALL_ATTR=__attribute__((preserve_all))"
                          "FEX_HAS_PRESERVE_ALL_ATTR=1")
else()
  add_compile_definitions("FEX_PRESERVE_ALL_ATTR=" "FEX_HAS_PRESERVE_ALL_ATTR=0")
endif()

# Probe for the GNU extension program_invocation_name. It is declared by
# <errno.h> when _GNU_SOURCE is defined, so the probe must include that header;
# the directives need their own lines to be valid preprocessor input.
check_cxx_source_compiles(
  "
  #define _GNU_SOURCE
  #include <errno.h>
  int main() {
    return program_invocation_name == nullptr;
  }"
  HAS_PROGRAM_INVOCATION_NAME)
add_compile_definitions("HAS_PROGRAM_INVOCATION_NAME=${HAS_PROGRAM_INVOCATION_NAME}")

if (ENABLE_VIXL_SIMULATOR)
  # We can run the simulator on both x86-64 or AArch64 hosts
  add_compile_definitions(VIXL_SIMULATOR=1 VIXL_INCLUDE_SIMULATOR_AARCH64=1)
endif()

# Route compiles through ccache when it is installed.
if (ENABLE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
    message(STATUS "CCache enabled")
    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
  endif()
endif()

if (ENABLE_XRAY)
  add_compile_options(-fxray-instrument)
  link_libraries(-fxray-instrument)
endif()

if (ENABLE_COMPILE_TIME_TRACE)
  add_compile_options(-ftime-trace)
  link_libraries(-ftime-trace)
endif()

set(PTHREAD_LIB pthread)

if (USE_LINKER)
  message(STATUS "Overriding linker to: ${USE_LINKER}")
  add_link_options("-fuse-ld=${USE_LINKER}")
endif()

if (ENABLE_LIBCXX)
  message(WARNING "This is an unsupported configuration and should only be used for testing")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -stdlib=libc++")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
endif()

if (NOT ENABLE_OFFLINE_TELEMETRY)
  # Disable FEX offline telemetry entirely if asked
  add_compile_definitions(FEX_DISABLE_TELEMETRY=1)
endif()

if (ENABLE_UBSAN)
  # See https://github.com/FEX-Emu/FEX/pull/4494#issuecomment-2800608944
  # and related discussion for the use of -fno-sanitize=alignment -fno-sanitize=function
  # with UBSAN.
  # alignment: we don't follow a strict alignment policy, for example IR uses packed structs
  #   that are regularly accessed unaligned.
  # function: syscalls cast function pointers to void (*)(unsigned long...), causing warnings
  #   related to this access.
add_compile_definitions(ENABLE_UBSAN=1) add_compile_options(-fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize=function -fno-sanitize-recover=undefined) link_libraries(-fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize=function -fno-sanitize-recover=undefined) endif() if (ENABLE_ASAN) add_compile_definitions(ENABLE_ASAN=1) add_compile_options(-fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope) link_libraries(-fno-omit-frame-pointer -fsanitize=address -fsanitize-address-use-after-scope) endif() if (ENABLE_TSAN) add_compile_options(-fno-omit-frame-pointer -fsanitize=thread) link_libraries(-fno-omit-frame-pointer -fsanitize=thread) endif() if (ENABLE_COVERAGE) add_compile_options(-fprofile-instr-generate -fcoverage-mapping) link_libraries(-fprofile-instr-generate -fcoverage-mapping) endif() if (ENABLE_JEMALLOC_GLIBC_ALLOC) # The glibc jemalloc subproject which hooks the glibc allocator. # Required for thunks to work. # All host native libraries will use this allocator, while *most* other FEX internal allocations will use the other jemalloc allocator. add_subdirectory(External/jemalloc_glibc/) elseif (NOT MINGW) message(STATUS " jemalloc glibc allocator disabled!\n" " This is not a recommended configuration!\n" " This will very explicitly break thunk execution!\n" " Use at your own risk!") endif() if (ENABLE_FEX_ALLOCATOR) # The rpmalloc subproject that all FEXCore fextl objects allocate through. 
add_subdirectory(External/rpmalloc/) elseif (NOT MINGW) message (STATUS " FEX allocator is disabled!\n" " This is not a recommended configuration!\n" " This will very explicitly break 32-bit application execution!\n" " Use at your own risk!") endif() if (USE_PDB_DEBUGINFO) add_compile_options(-g -gcodeview) add_link_options(-g -Wl,--pdb=) endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-omit-frame-pointer") set(CMAKE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_LINKER_FLAGS_RELWITHDEBINFO} -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fomit-frame-pointer") set(CMAKE_LINKER_FLAGS_RELEASE "${CMAKE_LINKER_FLAGS_RELEASE} -fomit-frame-pointer") ## Modules ## list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/Data/CMake/) include(LinkerGC) ## Externals ## find_package(unordered_dense QUIET CONFIG) if (NOT unordered_dense_FOUND) add_subdirectory(External/unordered_dense) endif() include(CTest) if (BUILD_TESTING OR ENABLE_VIXL_DISASSEMBLER OR ENABLE_VIXL_SIMULATOR) add_subdirectory(External/vixl/) endif() if (ENABLE_ZYDIS) find_package(Zycore 1.5 MODULE QUIET) find_package(Zydis 4.0 MODULE QUIET) if (TARGET Zydis::Zydis AND TARGET Zycore::Zycore) message(STATUS "Using system Zydis") else() set(ZYDIS_BUILD_TOOLS OFF CACHE BOOL "" FORCE) set(ZYDIS_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) message(STATUS "Using bundled Zydis") add_subdirectory(External/zydis/) endif() endif() if (ENABLE_FEXCORE_PROFILER AND FEXCORE_PROFILER_BACKEND STREQUAL "TRACY") add_subdirectory(External/tracy) endif() find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) set(BUILD_SHARED_LIBS OFF) if (NOT CMAKE_CROSSCOMPILING) find_package(xxhash MODULE QUIET) endif() if (NOT TARGET xxHash::xxhash) set(XXHASH_BUNDLED_MODE TRUE) set(XXHASH_BUILD_XXHSUM FALSE) add_subdirectory(External/xxhash/cmake_unofficial/) endif() add_compile_options(-Wno-trigraphs) add_compile_definitions(GLOBAL_DATA_DIRECTORY="${DATA_DIRECTORY}/") if (BUILD_TESTING) 
find_package(Catch2 3 QUIET) if (NOT Catch2_FOUND) add_subdirectory(External/Catch2/) # Pull in catch_discover_tests definition list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/External/Catch2/contrib/") endif() include(Catch) else () # Override any previously generated test list to avoid running stale test binaries file(GENERATE OUTPUT CTestTestfile.cmake CONTENT "# No tests since BUILD_TESTING is disabled") endif() find_package(fmt QUIET) if (NOT fmt_FOUND) # Disable fmt install set(FMT_INSTALL OFF) add_subdirectory(External/fmt/) endif() find_package(range-v3 QUIET) if (NOT range-v3_FOUND) add_subdirectory(External/range-v3/) target_compile_definitions(range-v3 INTERFACE RANGES_DISABLE_DEPRECATED_WARNINGS) endif() add_subdirectory(External/tiny-json/) include_directories(Source/) include_directories("${CMAKE_BINARY_DIR}/Source/") include(CheckCXXCompilerFlag) # Add in diagnostic colours if the option is available. # Ninja code generator will kill colours if this isn't here check_cxx_compiler_flag(-fdiagnostics-color=always GCC_COLOR) check_cxx_compiler_flag(-fcolor-diagnostics CLANG_COLOR) check_cxx_compiler_flag(-Wno-deprecated-enum-enum-conversion ENUM_ENUM_WARNING) if (GCC_COLOR) add_compile_options(-fdiagnostics-color=always) endif() if (CLANG_COLOR) add_compile_options(-fcolor-diagnostics) endif() if(ENUM_ENUM_WARNING) add_compile_options(-Wno-deprecated-enum-enum-conversion) endif() # GCC enables -Wchanges-meaning by default and treats some cases as an error if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") add_compile_options(-Wno-error=changes-meaning) endif() if(ENABLE_WERROR OR ENABLE_STRICT_WERROR) add_compile_options(-Werror) if (NOT ENABLE_STRICT_WERROR) # Disable some Werror that can add frustration when developing add_compile_options(-Wno-error=unused-variable) endif() endif() set(FEX_TUNE_COMPILE_FLAGS) if (NOT TUNE_ARCH STREQUAL "generic") check_cxx_compiler_flag("-march=${TUNE_ARCH}" COMPILER_SUPPORTS_ARCH_TYPE) if(COMPILER_SUPPORTS_ARCH_TYPE) 
list(APPEND FEX_TUNE_COMPILE_FLAGS "-march=${TUNE_ARCH}")
  else()
    message(FATAL_ERROR "Trying to compile arch type '${TUNE_ARCH}' but the compiler doesn't support this")
  endif()
endif()

# TUNE_CPU=native: pick an -mcpu/-march value matching the build host.
if (TUNE_CPU STREQUAL "native")
  if(ARCHITECTURE_arm64)
    # The version threshold is deliberately unreachable, so the -mcpu=native
    # path below is currently disabled and the script-based detection is used.
    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 999999.0)
      # Clang 12.0 fixed the -mcpu=native bug with mixed big.little implementers
      # Clang can not currently check for native Apple M1 type in hypervisor. Currently disabled
      check_cxx_compiler_flag("-mcpu=native" COMPILER_SUPPORTS_CPU_TYPE)
      if(COMPILER_SUPPORTS_CPU_TYPE)
        list(APPEND FEX_TUNE_COMPILE_FLAGS "-mcpu=native")
      endif()
    else()
      # Ask the helper script for a CPU name based on /proc/cpuinfo and the compiler version.
      execute_process(COMMAND python3 "${PROJECT_SOURCE_DIR}/Scripts/aarch64_fit_native.py" "/proc/cpuinfo" "${CMAKE_CXX_COMPILER_VERSION}"
        OUTPUT_VARIABLE AARCH64_CPU)
      # Quote the input: if the script produced no output, an unquoted expansion
      # vanishes and string(STRIP) aborts configuration with an argument-count error.
      string(STRIP "${AARCH64_CPU}" AARCH64_CPU)

      # A non-zero result from this script selects the fallback CPU set just below.
      execute_process(COMMAND python3 "${PROJECT_SOURCE_DIR}/Scripts/NeedDisabledSVE.py"
        RESULT_VARIABLE NEEDS_SVE_DISABLED)
      if (NEEDS_SVE_DISABLED)
        message(STATUS "Platform has bugged SVE.
Disabling") set(AARCH64_CPU "cortex-a78") endif() check_cxx_compiler_flag("-mcpu=${AARCH64_CPU}" COMPILER_SUPPORTS_CPU_TYPE) if(COMPILER_SUPPORTS_CPU_TYPE) list(APPEND FEX_TUNE_COMPILE_FLAGS "-mcpu=${AARCH64_CPU}") endif() endif() else() check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) if(COMPILER_SUPPORTS_MARCH_NATIVE) list(APPEND FEX_TUNE_COMPILE_FLAGS "-march=native") endif() endif() elseif (NOT TUNE_CPU STREQUAL "none") check_cxx_compiler_flag("-mcpu=${TUNE_CPU}" COMPILER_SUPPORTS_CPU_TYPE) if(COMPILER_SUPPORTS_CPU_TYPE) list(APPEND FEX_TUNE_COMPILE_FLAGS "-mcpu=${TUNE_CPU}") else() message(FATAL_ERROR "Trying to compile cpu type '${TUNE_CPU}' but the compiler doesn't support this") endif() endif() set(GIT_DESCRIBE_STRING "FEX-Unknown") if (OVERRIDE_VERSION STREQUAL "detect") find_package(Git) if (GIT_FOUND) execute_process( COMMAND ${GIT_EXECUTABLE} describe --abbrev=7 WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DESCRIBE_STRING ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_DESCRIBE_STRING "${OVERRIDE_VERSION}") endif() set(GIT_HASH "Unknown") if (OVERRIDE_HASH STREQUAL "detect") find_package(Git) if (GIT_FOUND) execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse HEAD WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" OUTPUT_VARIABLE GIT_HASH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_HASH "${OVERRIDE_HASH}") endif() message(STATUS "FEX version: ${GIT_DESCRIBE_STRING}") message(STATUS "FEX commit: ${GIT_HASH}") # Prepends 0x to every two-character sequence in the hash, # OR the final character of the hash, to plumb it for C++ usage. 
e.g.: # -DOVERRIDE_HASH=123456aa => 0x12, 0x34, 0x56, 0xaa, # -DOVERRIDE_HASH=12345678a => 0x12, 0x34, 0x56, 0x78, 0xa, string(REGEX REPLACE "(..|.$)" "0x\\1, " GIT_HASH_ARRAY "${GIT_HASH}") if (ENABLE_IWYU) find_program(IWYU_EXE NAMES iwyu include-what-you-use) if (IWYU_EXE) message(STATUS "IWYU enabled") set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE "${IWYU_EXE}") endif() endif() add_compile_options(-Wall) if (BUILD_TESTING) message(STATUS "Unit tests are enabled") set(TEST_JOB_COUNT "" CACHE STRING "Override number of parallel jobs to use while running tests") if (TEST_JOB_COUNT) message(STATUS "Running tests with ${TEST_JOB_COUNT} jobs") elseif(CMAKE_VERSION VERSION_LESS "3.29") execute_process(COMMAND "nproc" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE TEST_JOB_COUNT) endif() set(TEST_JOB_FLAG "-j${TEST_JOB_COUNT}") endif() add_subdirectory(External/SoftFloat-3e/) add_subdirectory(External/cephes/) add_subdirectory(FEXHeaderUtils/) add_subdirectory(CodeEmitter/) add_subdirectory(FEXCore/) if (ARCHITECTURE_arm64 AND NOT MINGW AND NOT BUILD_STEAM_SUPPORT) # Binfmt_misc files must be installed prior to Source/ installs add_subdirectory(Data/binfmts/) endif() add_subdirectory(Source/) if (NOT BUILD_STEAM_SUPPORT) add_subdirectory(Data/AppConfig/) endif() # Install the ThunksDB file file(GLOB CONFIG_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/Data/*.json) # Any application configuration json file gets installed foreach(CONFIG_SRC ${CONFIG_SOURCES}) install(FILES ${CONFIG_SRC} DESTINATION ${DATA_DIRECTORY}/ COMPONENT Runtime) endforeach() if (BUILD_TESTING) add_subdirectory(unittests/) endif() if (BUILD_THUNKS) set(FEX_PROJECT_SOURCE_DIR ${PROJECT_SOURCE_DIR}) add_subdirectory(ThunkLibs/Generator) # Thunk targets for both host libraries and IDE integration add_subdirectory(ThunkLibs/HostLibs) # Thunk targets for IDE integration of guest code, only add_subdirectory(ThunkLibs/GuestLibs) # Thunk targets for guest libraries include(ExternalProject) 
ExternalProject_Add(guest-libs PREFIX guest-libs SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ThunkLibs/GuestLibs" BINARY_DIR "Guest" CMAKE_ARGS "-DBITNESS=64" "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" "-DBUILD_FEX_LINUX_TESTS=${BUILD_FEX_LINUX_TESTS}" "-DENABLE_CLANG_THUNKS=${ENABLE_CLANG_THUNKS}" "-DCMAKE_TOOLCHAIN_FILE:FILEPATH=${X86_64_TOOLCHAIN_FILE}" "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" "-DFEX_PROJECT_SOURCE_DIR=${FEX_PROJECT_SOURCE_DIR}" "-DGENERATOR_EXE=$" "-DX86_DEV_ROOTFS=${X86_DEV_ROOTFS}" INSTALL_COMMAND "" BUILD_ALWAYS ON DEPENDS thunkgen) ExternalProject_Add(guest-libs-32 PREFIX guest-libs-32 SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ThunkLibs/GuestLibs" BINARY_DIR "Guest_32" CMAKE_ARGS "-DBITNESS=32" "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" "-DBUILD_FEX_LINUX_TESTS=${BUILD_FEX_LINUX_TESTS}" "-DENABLE_CLANG_THUNKS=${ENABLE_CLANG_THUNKS}" "-DCMAKE_TOOLCHAIN_FILE:FILEPATH=${X86_32_TOOLCHAIN_FILE}" "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" "-DFEX_PROJECT_SOURCE_DIR=${FEX_PROJECT_SOURCE_DIR}" "-DGENERATOR_EXE=$" "-DX86_DEV_ROOTFS=${X86_DEV_ROOTFS}" INSTALL_COMMAND "" BUILD_ALWAYS ON DEPENDS thunkgen) install( CODE "message(\"-- Installing: guest-libs\")" CODE " execute_process(COMMAND ${CMAKE_COMMAND} --build . --target install WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/Guest)" DEPENDS guest-libs COMPONENT Runtime) install( CODE "message(\"-- Installing: guest-libs-32\")" CODE " execute_process(COMMAND ${CMAKE_COMMAND} --build . --target install WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/Guest_32)" DEPENDS guest-libs-32 COMPONENT Runtime) add_custom_target(uninstall_guest-libs COMMAND ${CMAKE_COMMAND} "--build" "." "--target" "uninstall" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/Guest) add_custom_target(uninstall_guest-libs-32 COMMAND ${CMAKE_COMMAND} "--build" "." 
"--target" "uninstall" WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/Guest_32) add_dependencies(uninstall uninstall_guest-libs) add_dependencies(uninstall uninstall_guest-libs-32) endif() if (NOT MINGW AND BUILD_STEAM_SUPPORT) add_subdirectory(Source/Steam/) endif() ================================================ FILE: CMakeSettings.json ================================================ { "environments": [ { "BuildPath": "${projectDir}\\out\\build\\${name}", "InstallPath": "${projectDir}\\out\\install\\${name}", "clangcl": "clang-cl.exe", "cc": "clang", "cxx": "clang++" } ], "configurations": [ { "name": "WSL-Clang-Debug", "generator": "Ninja", "configurationType": "Debug", "buildRoot": "${env.BuildPath}", "installRoot": "${env.InstallPath}", "cmakeExecutable": "/usr/bin/cmake", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "", "wslPath": "${defaultWSLPath}", "inheritEnvironments": [ "linux_clang_x64" ], "addressSanitizerRuntimeFlags": "detect_leaks=0", "variables": [ { "name": "WSL", "value": "TRUE", "type": "BOOL" } ] }, { "name": "WSL-Clang-Release", "generator": "Ninja", "configurationType": "RelWithDebInfo", "buildRoot": "${env.BuildPath}", "installRoot": "${env.InstallPath}", "cmakeExecutable": "/usr/bin/cmake", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "", "wslPath": "${defaultWSLPath}", "inheritEnvironments": [ "linux_clang_x64" ], "addressSanitizerRuntimeFlags": "detect_leaks=0", "variables": [ { "name": "WSL", "value": "TRUE", "type": "BOOL" } ] }, { "name": "x86-Clang-Cross-Debug", "generator": "Ninja", "configurationType": "Debug", "buildRoot": "${env.BuildPath}", "installRoot": "${env.InstallPath}", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "", "inheritEnvironments": [ "clang_cl_x86" ], "variables": [ { "name": "CMAKE_C_COMPILER", "value": "${env.cc}", "type": "STRING" }, { "name": "CMAKE_CXX_COMPILER", "value": "${env.cxx}", "type": "STRING" }, { "name": "CMAKE_SYSROOT", "value": 
"${env.fexsysroot}", "type": "STRING" } ] }, { "name": "x64-Clang-Cross-Release", "generator": "Ninja", "configurationType": "RelWithDebInfo", "buildRoot": "${env.BuildPath}", "installRoot": "${env.InstallPath}", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "", "inheritEnvironments": [ "clang_cl_x86" ], "variables": [ { "name": "CMAKE_C_COMPILER", "value": "${env.cc}", "type": "STRING" }, { "name": "CMAKE_CXX_COMPILER", "value": "${env.cxx}", "type": "STRING" }, { "name": "CMAKE_SYSROOT", "value": "${env.fexsysroot}", "type": "STRING" } ] }, { "name": "Linux-Clang-Remote-Debug", "generator": "Ninja", "configurationType": "Debug", "cmakeExecutable": "/usr/bin/cmake", "remoteCopySourcesExclusionList": [ ".vs", ".vscode", ".git", ".github", "build", "out", "bin" ], "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "", "inheritEnvironments": [ "linux_clang_x64" ], "remoteMachineName": "${env.fexremote}", "remoteCMakeListsRoot": "$HOME/projects/.vs/${projectDirName}/src", "remoteBuildRoot": "$HOME/projects/.vs/${projectDirName}/build/${name}", "remoteInstallRoot": "$HOME/projects/.vs/${projectDirName}/install/${name}", "remoteCopySources": true, "rsyncCommandArgs": "-t --delete --delete-excluded", "remoteCopyBuildOutput": false, "remoteCopySourcesMethod": "rsync", "addressSanitizerRuntimeFlags": "detect_leaks=0", "variables": [] } ] } ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual 
identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at team@fex-emu.com. 
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CodeEmitter/CMakeLists.txt ================================================ add_library(CodeEmitter INTERFACE) target_include_directories(CodeEmitter INTERFACE .) ================================================ FILE: CodeEmitter/CodeEmitter/ALUOps.inl ================================================ // SPDX-License-Identifier: MIT /* ALU instruction emitters. * * Almost all of these operations have `ARMEmitter::Size` as their first argument. * This allows both 32-bit and 64-bit selection of how that instruction is going to operate. * * Some emitter operations explicitly use `XRegister` or `WRegister`. * This is usually due to the instruction only supporting one operating size. * In some cases, though, it is only a minor convenience without any performance implications. * * FEX-Emu ALU operations usually have a 32-bit or 64-bit operating size encoded in the IR operation. * This allows FEX to use a single helper function which decodes to both handlers.
*/ #pragma once #ifndef INCLUDED_BY_EMITTER #include namespace ARMEmitter { struct EmitterOps : Emitter { #endif private: static bool IsADRRange(int64_t Imm) { return Imm >= -1048576 && Imm <= 1048575; } static bool IsADRPRange(int64_t Imm) { return Imm >= -4294967296 && Imm <= 4294963200; } static bool IsADRPAligned(int64_t Imm) { return (Imm & 0xFFF) == 0; } public: // PC relative void adr(ARMEmitter::Register rd, uint32_t Imm) { constexpr uint32_t Op = 0b0001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, Imm); } [[nodiscard]] BranchEncodeSucceeded adr(ARMEmitter::Register rd, const BackwardLabel* Label) { int32_t Imm = static_cast(Label->Location - GetCursorAddress()); if (IsADRRange(Imm)) { constexpr uint32_t Op = 0b0001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, Imm); return BranchEncodeSucceeded::Success; } // Can't encode. return BranchEncodeSucceeded::Failure; } [[nodiscard]] BranchEncodeSucceeded adr(ARMEmitter::Register rd, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::ADR}); constexpr uint32_t Op = 0b0001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, 0); // Forward label doesn't know if it can encode until Bind. return BranchEncodeSucceeded::Success; } [[nodiscard]] BranchEncodeSucceeded adr(ARMEmitter::Register rd, BiDirectionalLabel* Label) { if (Label->Backward.Location) { return adr(rd, &Label->Backward); } else { return adr(rd, &Label->Forward); } } void adrp(ARMEmitter::Register rd, uint32_t Imm) { constexpr uint32_t Op = 0b1001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, Imm); } [[nodiscard]] BranchEncodeSucceeded adrp(ARMEmitter::Register rd, const BackwardLabel* Label) { int64_t Imm = reinterpret_cast(Label->Location) - (GetCursorAddress() & ~0xFFFLL); if (IsADRPRange(Imm) && IsADRPAligned(Imm)) { constexpr uint32_t Op = 0b1001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, Imm); return BranchEncodeSucceeded::Success; } // Can't encode. 
return BranchEncodeSucceeded::Failure; } [[nodiscard]] BranchEncodeSucceeded adrp(ARMEmitter::Register rd, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::ADRP}); constexpr uint32_t Op = 0b1001'0000 << 24; DataProcessing_PCRel_Imm(Op, rd, 0); // Forward label doesn't know if it can encode until Bind. return BranchEncodeSucceeded::Success; } [[nodiscard]] BranchEncodeSucceeded adrp(ARMEmitter::Register rd, BiDirectionalLabel* Label) { if (Label->Backward.Location) { return adrp(rd, &Label->Backward); } else { return adrp(rd, &Label->Forward); } } [[nodiscard]] BranchEncodeSucceeded LongAddressGen(ARMEmitter::Register rd, const BackwardLabel* Label) { const auto SLocation = reinterpret_cast(Label->Location); const auto ULocation = std::bit_cast(SLocation); const int64_t Imm = SLocation - (GetCursorAddress()); const auto UImm = std::bit_cast(Imm); if (IsADRRange(Imm)) { // If the range is in ADR range then we can just use ADR. return adr(rd, Label); } if (IsADRPRange(Imm)) { const int64_t ADRPImm = (SLocation & ~0xFFFLL) - (GetCursorAddress() & ~0xFFFLL); // If the range is in the ADRP range then we can use ADRP. 
const bool NeedsOffset = !IsADRPAligned(ULocation); const uint64_t AlignedOffset = ULocation & 0xFFFULL; // First emit ADRP adrp(rd, ADRPImm >> 12); if (NeedsOffset) { // Now even an add add(ARMEmitter::Size::i64Bit, rd, rd, AlignedOffset); } return BranchEncodeSucceeded::Success; } // Stinky path, we need to load the address as a sequence of movz+movk+movk movz(ARMEmitter::Size::i64Bit, rd, (UImm >> 32) & 0xFFFF, 32); movk(ARMEmitter::Size::i64Bit, rd, (UImm >> 16) & 0xFFFF, 16); movk(ARMEmitter::Size::i64Bit, rd, UImm & 0xFFFF); return BranchEncodeSucceeded::Success; } [[nodiscard]] BranchEncodeSucceeded LongAddressGen(ARMEmitter::Register rd, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::LONG_ADDRESS_GEN}); // Emit a register index and two nops. These will be backpatched. dc32(rd.Idx()); nop(); nop(); // Forward label doesn't know if it can encode until Bind. return BranchEncodeSucceeded::Success; } [[nodiscard]] BranchEncodeSucceeded LongAddressGen(ARMEmitter::Register rd, BiDirectionalLabel* Label) { if (Label->Backward.Location) { return LongAddressGen(rd, &Label->Backward); } else { return LongAddressGen(rd, &Label->Forward); } } // Add/subtract immediate void add(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { constexpr uint32_t Op = 0b0001'0001'0 << 23; DataProcessing_AddSub_Imm(Op, s, rd, rn, Imm, LSL12); } void adds(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { constexpr uint32_t Op = 0b0011'0001'0 << 23; DataProcessing_AddSub_Imm(Op, s, rd, rn, Imm, LSL12); } void cmn(ARMEmitter::Size s, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { adds(s, ARMEmitter::Reg::zr, rn, Imm, LSL12); } void sub(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { constexpr uint32_t Op = 0b0101'0001'0 << 
23; DataProcessing_AddSub_Imm(Op, s, rd, rn, Imm, LSL12); } void cmp(ARMEmitter::Size s, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { constexpr uint32_t Op = 0b0111'0001'0 << 23; DataProcessing_AddSub_Imm(Op, s, ARMEmitter::Reg::rsp, rn, Imm, LSL12); } void subs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm, bool LSL12 = false) { constexpr uint32_t Op = 0b0111'0001'0 << 23; DataProcessing_AddSub_Imm(Op, s, rd, rn, Imm, LSL12); } // Min/max immediate void smax(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, int64_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -128 && Imm <= 127, "{} Immediate too large", __func__); MinMaxImmediate(0b0000, s, rd, rn, Imm); } void umax(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { LOGMAN_THROW_A_FMT(Imm <= 255, "{} Immediate too large", __func__); MinMaxImmediate(0b0001, s, rd, rn, Imm); } void smin(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, int64_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -128 && Imm <= 127, "{} Immediate too large", __func__); MinMaxImmediate(0b0010, s, rd, rn, Imm); } void umin(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { LOGMAN_THROW_A_FMT(Imm <= 255, "{} Immediate too large", __func__); MinMaxImmediate(0b0011, s, rd, rn, Imm); } // Logical immediate void and_(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { uint32_t n, immr, imms; const auto IsImm = IsImmLogical(Imm, RegSizeInBits(s), &n, &imms, &immr); LOGMAN_THROW_A_FMT(IsImm, "Couldn't encode immediate to logical op"); and_(s, rd, rn, n, immr, imms); } void bic(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { and_(s, rd, rn, ~Imm); } void ands(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { uint32_t n, immr, imms; const auto IsImm = IsImmLogical(Imm, RegSizeInBits(s), &n, &imms, &immr); 
LOGMAN_THROW_A_FMT(IsImm, "Couldn't encode immediate to logical op"); ands(s, rd, rn, n, immr, imms); } void bics(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { ands(s, rd, rn, ~Imm); } void orr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { uint32_t n, immr, imms; const auto IsImm = IsImmLogical(Imm, RegSizeInBits(s), &n, &imms, &immr); LOGMAN_THROW_A_FMT(IsImm, "Couldn't encode immediate to logical op"); orr(s, rd, rn, n, immr, imms); } void eor(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { uint32_t n, immr, imms; const auto IsImm = IsImmLogical(Imm, RegSizeInBits(s), &n, &imms, &immr); LOGMAN_THROW_A_FMT(IsImm, "Couldn't encode immediate to logical op"); eor(s, rd, rn, n, immr, imms); } void tst(ARMEmitter::Size s, Register rn, uint64_t imm) { ands(s, Reg::zr, rn, imm); } // Move wide immediate void movn(ARMEmitter::Size s, ARMEmitter::Register rd, uint32_t Imm, uint32_t Offset = 0) { LOGMAN_THROW_A_FMT((Imm & 0xFFFF0000U) == 0, "Upper bits of move wide not valid"); LOGMAN_THROW_A_FMT((Offset % 16) == 0, "Offset must be 16bit aligned"); constexpr uint32_t Op = 0b001'0010'100 << 21; DataProcessing_MoveWide(Op, s, rd, Imm, Offset >> 4); } void mov(ARMEmitter::Size s, ARMEmitter::Register rd, uint32_t Imm) { movz(s, rd, Imm, 0); } void mov(ARMEmitter::XRegister rd, uint32_t Imm) { movz(ARMEmitter::Size::i64Bit, rd.R(), Imm, 0); } void mov(ARMEmitter::WRegister rd, uint32_t Imm) { movz(ARMEmitter::Size::i32Bit, rd.R(), Imm, 0); } void movz(ARMEmitter::Size s, ARMEmitter::Register rd, uint32_t Imm, uint32_t Offset = 0) { LOGMAN_THROW_A_FMT((Imm & 0xFFFF0000U) == 0, "Upper bits of move wide not valid"); LOGMAN_THROW_A_FMT((Offset % 16) == 0, "Offset must be 16bit aligned"); constexpr uint32_t Op = 0b101'0010'100 << 21; DataProcessing_MoveWide(Op, s, rd, Imm, Offset >> 4); } void movk(ARMEmitter::Size s, ARMEmitter::Register rd, uint32_t Imm, uint32_t 
Offset = 0) { LOGMAN_THROW_A_FMT((Imm & 0xFFFF0000U) == 0, "Upper bits of move wide not valid"); LOGMAN_THROW_A_FMT((Offset % 16) == 0, "Offset must be 16bit aligned"); constexpr uint32_t Op = 0b111'0010'100 << 21; DataProcessing_MoveWide(Op, s, rd, Imm, Offset >> 4); } void movn(ARMEmitter::XRegister rd, uint32_t Imm, uint32_t Offset = 0) { movn(ARMEmitter::Size::i64Bit, rd.R(), Imm, Offset); } void movz(ARMEmitter::XRegister rd, uint32_t Imm, uint32_t Offset = 0) { movz(ARMEmitter::Size::i64Bit, rd.R(), Imm, Offset); } void movk(ARMEmitter::XRegister rd, uint32_t Imm, uint32_t Offset = 0) { movk(ARMEmitter::Size::i64Bit, rd.R(), Imm, Offset); } void movn(ARMEmitter::WRegister rd, uint32_t Imm, uint32_t Offset = 0) { movn(ARMEmitter::Size::i32Bit, rd.R(), Imm, Offset); } void movz(ARMEmitter::WRegister rd, uint32_t Imm, uint32_t Offset = 0) { movz(ARMEmitter::Size::i32Bit, rd.R(), Imm, Offset); } void movk(ARMEmitter::WRegister rd, uint32_t Imm, uint32_t Offset = 0) { movk(ARMEmitter::Size::i32Bit, rd.R(), Imm, Offset); } // Bitfield void sxtb(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { sbfm(s, rd, rn, 0, 7); } void sxth(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { sbfm(s, rd, rn, 0, 15); } void sxtw(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn) { sbfm(ARMEmitter::Size::i64Bit, rd, rn.X(), 0, 31); } void sbfx(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t lsb, uint32_t width) { LOGMAN_THROW_A_FMT(width > 0, "sbfx needs width > 0"); LOGMAN_THROW_A_FMT((lsb + width) <= RegSizeInBits(s), "Tried to sbfx a region larger than the register"); sbfm(s, rd, rn, lsb, lsb + width - 1); } void sbfiz(ARMEmitter::Size s, Register rd, Register rn, uint32_t lsb, uint32_t width) { xbfiz_helper(true, s, rd, rn, lsb, width); } void asr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t shift) { const auto RegSize_m1 = RegSizeInBits(s) - 1; shift &= RegSize_m1; 
sbfm(s, rd, rn, shift, RegSize_m1); } void uxtb(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { ubfm(s, rd, rn, 0, 7); } void uxth(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { ubfm(s, rd, rn, 0, 15); } void uxtw(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { ubfm(s, rd, rn, 0, 31); } void ubfm(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b0101'0011'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, s == ARMEmitter::Size::i64Bit, immr, imms); } void ubfiz(ARMEmitter::Size s, Register rd, Register rn, uint32_t lsb, uint32_t width) { xbfiz_helper(false, s, rd, rn, lsb, width); } void lsl(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t shift) { const auto RegSize_m1 = RegSizeInBits(s) - 1; shift &= RegSize_m1; ubfm(s, rd, rn, (RegSizeInBits(s) - shift) & RegSize_m1, RegSize_m1 - shift); } void lsr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t shift) { const auto RegSize_m1 = RegSizeInBits(s) - 1; shift &= RegSize_m1; ubfm(s, rd, rn, shift, RegSize_m1); } void ubfx(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t lsb, uint32_t width) { LOGMAN_THROW_A_FMT(width > 0, "ubfx needs width > 0"); LOGMAN_THROW_A_FMT((lsb + width) <= RegSizeInBits(s), "Tried to ubfx a region larger than the register"); ubfm(s, rd, rn, lsb, lsb + width - 1); } void bfi(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t lsb, uint32_t width) { const auto RegSize = RegSizeInBits(s); LOGMAN_THROW_A_FMT(width > 0, "bfc/bfi needs width > 0"); LOGMAN_THROW_A_FMT((lsb + width) <= RegSize, "Tried to bfc/bfi a region larger than the register"); bfm(s, rd, rn, (RegSize - lsb) & (RegSize - 1), width - 1); } void bfc(ARMEmitter::Size s, Register rd, uint32_t lsb, uint32_t width) { bfi(s, rd, Reg::zr, lsb, width); } void 
bfxil(ARMEmitter::Size s, Register rd, Register rn, uint32_t lsb, uint32_t width) { const auto reg_size_bits = RegSizeInBits(s); const auto lsb_p_width = lsb + width; LOGMAN_THROW_A_FMT(width >= 1, "bfxil needs width >= 1"); LOGMAN_THROW_A_FMT(lsb_p_width <= reg_size_bits, "bfxil lsb + width ({}) must be <= {}. lsb={}, width={}", lsb_p_width, reg_size_bits, lsb, width); bfm(s, rd, rn, lsb, lsb_p_width - 1); } // Extract void extr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, uint32_t Imm) { constexpr uint32_t Op = 0b001'0011'100 << 21; LOGMAN_THROW_A_FMT(Imm < RegSizeInBits(s), "Tried to extr a region larger than the register"); DataProcessing_Extract(Op, s, rd, rn, rm, Imm); } void ror(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm) { Imm &= RegSizeInBits(s) - 1; extr(s, rd, rn, rn, Imm); } // Data processing - 2 source void udiv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0000'10U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void sdiv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0000'11U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void lslv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0010'00U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void lsrv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0010'01U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void asrv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0010'10U << 10); 
// Emits RORV through the shared two-source data-processing encoder.
void rorv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) {
  // Base bits common to the "data processing - 2 source" group in this file.
  constexpr uint32_t TwoSourceBase = 0b001'1010'110U << 21;
  // Per-instruction selector placed at bit 10.
  constexpr uint32_t RorvSelector = 0b0010'11U << 10;

  DataProcessing_2Source(TwoSourceBase | RorvSelector, s, rd, rn, rm);
}
Op = (0b001'1010'110U << 21) | (0b0110'01U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void smin(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0110'10U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void umin(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0110'11U << 10); DataProcessing_2Source(Op, s, rd, rn, rm); } void subp(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0000'00U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void irg(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0001'00U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void gmi(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0001'01U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void pacga(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0011'00U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void crc32x(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0100'11U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void crc32cx(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = (0b001'1010'110U << 21) | (0b0101'11U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } void subps(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister 
rm) { constexpr uint32_t Op = (0b011'1010'110U << 21) | (0b0000'00U << 10); DataProcessing_2Source(Op, ARMEmitter::Size::i64Bit, rd, rn, rm); } // Data processing - 1 source void rbit(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'00U << 10); DataProcessing_1Source(Op, s, rd, rn); } void rev16(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'01U << 10); DataProcessing_1Source(Op, s, rd, rn); } void rev(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'10U << 10); DataProcessing_1Source(Op, ARMEmitter::Size::i32Bit, rd, rn); } void rev32(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'10U << 10); DataProcessing_1Source(Op, ARMEmitter::Size::i64Bit, rd, rn); } void clz(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0001'00U << 10); DataProcessing_1Source(Op, s, rd, rn); } void cls(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0001'01U << 10); DataProcessing_1Source(Op, s, rd, rn); } void rev(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'11U << 10); DataProcessing_1Source(Op, ARMEmitter::Size::i64Bit, rd, rn); } void rev(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0000'10U << 10) | (s == ARMEmitter::Size::i64Bit ? 
(1U << 10) : 0); DataProcessing_1Source(Op, s, rd, rn); } void ctz(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0001'10U << 10); DataProcessing_1Source(Op, s, rd, rn); } void cnt(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0001'11U << 10); DataProcessing_1Source(Op, s, rd, rn); } void abs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { constexpr uint32_t Op = (0b101'1010'110U << 21) | (0b0'0000U << 16) | (0b0010'00U << 10); DataProcessing_1Source(Op, s, rd, rn); } // TODO: PAUTH // Logical - shifted register void mov(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn) { orr(s, rd, ARMEmitter::Reg::zr, rn, ARMEmitter::ShiftType::LSL, 0); } void mov(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn) { orr(ARMEmitter::Size::i64Bit, rd.R(), ARMEmitter::Reg::zr, rn.R(), ARMEmitter::ShiftType::LSL, 0); } void mov(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn) { orr(ARMEmitter::Size::i32Bit, rd.R(), ARMEmitter::Reg::zr, rn.R(), ARMEmitter::ShiftType::LSL, 0); } void mvn(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { orn(s, rd, ARMEmitter::Reg::zr, rn, Shift, amt); } void and_(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b000'1010'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void ands(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b110'1010'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, 
amt); } void bic(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b000'1010'001U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void bics(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b110'1010'001U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void orr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b010'1010'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void tst(ARMEmitter::Size s, Register rn, Register rm, ShiftType shift = ShiftType::LSL, uint32_t amt = 0) { ands(s, Reg::zr, rn, rm, shift, amt); } void orn(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b010'1010'001U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void eor(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b100'1010'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void eon(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { constexpr uint32_t Op = 0b100'1010'001U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } // AddSub - shifted register void add(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, 
ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { add(ARMEmitter::Size::i64Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void adds(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { adds(ARMEmitter::Size::i64Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void cmn(ARMEmitter::XRegister rn, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { adds(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::zr, rn.R(), rm.R(), Shift, amt); } void sub(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { sub(ARMEmitter::Size::i64Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void neg(ARMEmitter::XRegister rd, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { sub(rd, ARMEmitter::XReg::zr, rm, Shift, amt); } void cmp(ARMEmitter::XRegister rn, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, rn.R(), rm.R(), Shift, amt); } void subs(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(ARMEmitter::Size::i64Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void negs(ARMEmitter::XRegister rd, ARMEmitter::XRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(rd, ARMEmitter::XReg::zr, rm, Shift, amt); } void add(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { add(ARMEmitter::Size::i32Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void adds(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn, 
ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { adds(ARMEmitter::Size::i32Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void cmn(ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { adds(ARMEmitter::Size::i32Bit, ARMEmitter::WReg::zr, rn.R(), rm.R(), Shift, amt); } void sub(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { sub(ARMEmitter::Size::i32Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void neg(ARMEmitter::WRegister rd, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { sub(rd, ARMEmitter::WReg::zr, rm, Shift, amt); } void cmp(ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::rsp, rn.R(), rm.R(), Shift, amt); } void subs(ARMEmitter::WRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(ARMEmitter::Size::i32Bit, rd.R(), rn.R(), rm.R(), Shift, amt); } void negs(ARMEmitter::WRegister rd, ARMEmitter::WRegister rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(rd, ARMEmitter::WReg::zr, rm, Shift, amt); } void add(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { LOGMAN_THROW_A_FMT(Shift != ARMEmitter::ShiftType::ROR, "Doesn't support ROR"); constexpr uint32_t Op = 0b000'1011'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void adds(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = 
ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { LOGMAN_THROW_A_FMT(Shift != ARMEmitter::ShiftType::ROR, "Doesn't support ROR"); constexpr uint32_t Op = 0b010'1011'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void cmn(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { adds(s, ARMEmitter::Reg::zr, rn, rm, Shift, amt); } void sub(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { LOGMAN_THROW_A_FMT(Shift != ARMEmitter::ShiftType::ROR, "Doesn't support ROR"); constexpr uint32_t Op = 0b100'1011'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void neg(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { sub(s, rd, ARMEmitter::Reg::zr, rm, Shift, amt); } void cmp(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(s, ARMEmitter::Reg::zr, rn, rm, Shift, amt); } void subs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { LOGMAN_THROW_A_FMT(Shift != ARMEmitter::ShiftType::ROR, "Doesn't support ROR"); constexpr uint32_t Op = 0b110'1011'000U << 21; DataProcessing_Shifted_Reg(Op, s, rd, rn, rm, Shift, amt); } void negs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rm, ARMEmitter::ShiftType Shift = ARMEmitter::ShiftType::LSL, uint32_t amt = 0) { subs(s, rd, ARMEmitter::Reg::zr, rm, Shift, amt); } // AddSub - extended register void add(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) 
{ LOGMAN_THROW_A_FMT(Shift <= 4, "Shift amount is too large"); constexpr uint32_t Op = 0b000'1011'001U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, Option, Shift); } void adds(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) { constexpr uint32_t Op = 0b010'1011'001U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, Option, Shift); } void cmn(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) { adds(s, ARMEmitter::Reg::zr, rn, rm, Option, Shift); } void sub(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) { constexpr uint32_t Op = 0b100'1011'001U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, Option, Shift); } void subs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) { constexpr uint32_t Op = 0b110'1011'001U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, Option, Shift); } void cmp(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift = 0) { constexpr uint32_t Op = 0b110'1011'001U << 21; DataProcessing_Extended_Reg(Op, s, ARMEmitter::Reg::zr, rn, rm, Option, Shift); } // AddSub - with carry void adc(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = 0b0001'1010'000U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, ARMEmitter::ExtendedType::UXTB, 0); } void adcs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = 0b0011'1010'000U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, ARMEmitter::ExtendedType::UXTB, 0); } void sbc(ARMEmitter::Size s, ARMEmitter::Register rd, 
ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = 0b0101'1010'000U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, ARMEmitter::ExtendedType::UXTB, 0); } void sbcs(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { constexpr uint32_t Op = 0b0111'1010'000U << 21; DataProcessing_Extended_Reg(Op, s, rd, rn, rm, ARMEmitter::ExtendedType::UXTB, 0); } void ngc(ARMEmitter::Size s, Register rd, Register rm) { sbc(s, rd, Reg::zr, rm); } void ngcs(ARMEmitter::Size s, Register rd, Register rm) { sbcs(s, rd, Reg::zr, rm); } // Rotate right into flags void rmif(XRegister rn, uint32_t shift, uint32_t mask) { LOGMAN_THROW_A_FMT(shift <= 63, "Shift must be within 0-63. Shift: {}", shift); LOGMAN_THROW_A_FMT(mask <= 15, "Mask must be within 0-15. Mask: {}", mask); uint32_t Op = 0b1011'1010'0000'0000'0000'0100'0000'0000; Op |= rn.Idx() << 5; Op |= shift << 15; Op |= mask; dc32(Op); } // Evaluate into flags void setf8(WRegister rn) { constexpr uint32_t Op = 0b0011'1010'0000'0000'0000'1000'0000'1101; EvaluateIntoFlags(Op, 0, rn); } void setf16(WRegister rn) { constexpr uint32_t Op = 0b0011'1010'0000'0000'0000'1000'0000'1101; EvaluateIntoFlags(Op, 1, rn); } void cfinv() { constexpr uint32_t Op = 0b1101'0101'0000'0000'0100'0000'0001'1111; dc32(Op); } void axflag() { constexpr uint32_t Op = 0b1101'0101'0000'0000'0100'0000'0101'1111; dc32(Op); } void xaflag() { constexpr uint32_t Op = 0b1101'0101'0000'0000'0100'0000'0011'1111; dc32(Op); } // Conditional compare - register void ccmn(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::StatusFlags flags, ARMEmitter::Condition Cond) { constexpr uint32_t Op = 0b0011'1010'010 << 21; ConditionalCompare(Op, 0, 0b00, 0, s, rn, rm, flags, Cond); } void ccmp(ARMEmitter::Size s, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::StatusFlags flags, ARMEmitter::Condition Cond) { constexpr uint32_t Op = 0b0011'1010'010 << 21; 
ConditionalCompare(Op, 1, 0b00, 0, s, rn, rm, flags, Cond); }

  // Conditional compare - immediate
  // Here `rm` carries the immediate; the assertion limits it to five bits. The o2 argument of
  // 0b10 is what distinguishes these from the register forms, which pass 0b00.
  void ccmn(ARMEmitter::Size s, ARMEmitter::Register rn, uint32_t rm, ARMEmitter::StatusFlags flags, ARMEmitter::Condition Cond) {
    LOGMAN_THROW_A_FMT((rm & ~0b1'1111) == 0, "Comparison imm too large");
    constexpr uint32_t Op = 0b0011'1010'010 << 21;
    ConditionalCompare(Op, 0, 0b10, 0, s, rn, rm, flags, Cond);
  }
  void ccmp(ARMEmitter::Size s, ARMEmitter::Register rn, uint32_t rm, ARMEmitter::StatusFlags flags, ARMEmitter::Condition Cond) {
    LOGMAN_THROW_A_FMT((rm & ~0b1'1111) == 0, "Comparison imm too large");
    constexpr uint32_t Op = 0b0011'1010'010 << 21;
    ConditionalCompare(Op, 1, 0b10, 0, s, rn, rm, flags, Cond);
  }

  // Conditional select
  // All of these share one base opcode and go through the destination-register overload of
  // ConditionalCompare; the (o1, o2) pair picks the variant:
  //   csel = (0, 0b00), csinc = (0, 0b01), csinv = (1, 0b00), csneg = (1, 0b01).
  void csel(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Condition Cond) {
    constexpr uint32_t Op = 0b0001'1010'100 << 21;
    ConditionalCompare(Op, 0, 0b00, s, rd, rn, rm, Cond);
  }
  // CSET uses the csinc variant with both sources set to the zero register and the condition
  // XORed with CC_NE.
  // NOTE(review): the XOR presumably flips the low condition bit to invert the condition - confirm
  // against the Condition enum. The target type of the static_cast is missing in this view.
  void cset(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Condition Cond) {
    constexpr uint32_t Op = 0b0001'1010'100 << 21;
    ConditionalCompare(Op, 0, 0b01, s, rd, ARMEmitter::Reg::zr, ARMEmitter::Reg::zr,
                       static_cast(FEXCore::ToUnderlying(Cond) ^ FEXCore::ToUnderlying(ARMEmitter::Condition::CC_NE)));
  }
  void csinc(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Condition Cond) {
    constexpr uint32_t Op = 0b0001'1010'100 << 21;
    ConditionalCompare(Op, 0, 0b01, s, rd, rn, rm, Cond);
  }
  void csinv(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Condition Cond) {
    constexpr uint32_t Op = 0b0001'1010'100 << 21;
    ConditionalCompare(Op, 1, 0b00, s, rd, rn, rm, Cond);
  }
  void csneg(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Condition Cond) {
    constexpr uint32_t Op = 0b0001'1010'100 << 21;
    ConditionalCompare(Op, 1, 0b01, s, rd, rn, rm,
Cond); } void cneg(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Condition Cond) { csneg(s, rd, rn, rn, InvertCondition(Cond)); } void cinc(ARMEmitter::Size s, Register rd, Register rn, Condition cond) { csinc(s, rd, rn, rn, InvertCondition(cond)); } void cinv(ARMEmitter::Size s, Register rd, Register rn, Condition cond) { csinv(s, rd, rn, rn, InvertCondition(cond)); } void csetm(ARMEmitter::Size s, Register rd, Condition cond) { csinv(s, rd, Reg::zr, Reg::zr, InvertCondition(cond)); } // Data processing - 3 source void madd(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Register ra) { constexpr uint32_t Op = 0b001'1011'000U << 21; DataProcessing_3Source(Op, 0, s, rd, rn, rm, ra); } void mul(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { madd(s, rd, rn, rm, XReg::zr); } void msub(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Register ra) { constexpr uint32_t Op = 0b001'1011'000U << 21; DataProcessing_3Source(Op, 1, s, rd, rn, rm, ra); } void mneg(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) { msub(s, rd, rn, rm, XReg::zr); } void smaddl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::XRegister ra) { constexpr uint32_t Op = 0b001'1011'001U << 21; DataProcessing_3Source(Op, 0, ARMEmitter::Size::i64Bit, rd, rn, rm, ra); } void smull(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm) { smaddl(rd, rn, rm, XReg::zr); } void smsubl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::XRegister ra) { constexpr uint32_t Op = 0b001'1011'001U << 21; DataProcessing_3Source(Op, 1, ARMEmitter::Size::i64Bit, rd, rn, rm, ra); } void smnegl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm) { 
smsubl(rd, rn, rm, XReg::zr); } void smulh(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = 0b001'1011'010U << 21; DataProcessing_3Source(Op, 0, ARMEmitter::Size::i64Bit, rd, rn, rm, ARMEmitter::Reg::zr); } void umaddl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::XRegister ra) { constexpr uint32_t Op = 0b001'1011'101U << 21; DataProcessing_3Source(Op, 0, ARMEmitter::Size::i64Bit, rd, rn, rm, ra); } void umull(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm) { umaddl(rd, rn, rm, XReg::zr); } void umsubl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm, ARMEmitter::XRegister ra) { constexpr uint32_t Op = 0b001'1011'101U << 21; DataProcessing_3Source(Op, 1, ARMEmitter::Size::i64Bit, rd, rn, rm, ra); } void umnegl(ARMEmitter::XRegister rd, ARMEmitter::WRegister rn, ARMEmitter::WRegister rm) { umsubl(rd, rn, rm, XReg::zr); } void umulh(ARMEmitter::XRegister rd, ARMEmitter::XRegister rn, ARMEmitter::XRegister rm) { constexpr uint32_t Op = 0b001'1011'110U << 21; DataProcessing_3Source(Op, 0, ARMEmitter::Size::i64Bit, rd, rn, rm, ARMEmitter::Reg::zr); } private: void and_(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t n, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b001'0010'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, n, immr, imms); } void ands(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t n, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b111'0010'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, n, immr, imms); } void orr(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t n, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b011'0010'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, n, immr, imms); } void eor(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, 
uint32_t n, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b101'0010'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, n, immr, imms); } void sbfm(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b0001'0011'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, s == ARMEmitter::Size::i64Bit, immr, imms); } void bfm(ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t immr, uint32_t imms) { constexpr uint32_t Op = 0b0011'0011'00 << 22; DataProcessing_Logical_Imm(Op, s, rd, rn, s == ARMEmitter::Size::i64Bit, immr, imms); } // 4.1.64 - Data processing - Immediate void DataProcessing_PCRel_Imm(uint32_t Op, ARMEmitter::Register rd, uint32_t Imm) { // Ensure the immediate is masked. Imm &= 0b1'1111'1111'1111'1111'1111U; uint32_t Instr = Op; Instr |= (Imm & 0b11) << 29; Instr |= (Imm >> 2) << 5; Instr |= Encode_rd(rd); dc32(Instr); } void DataProcessing_AddSub_Imm(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t Imm, bool LSL12) { bool TooLarge = (Imm & ~0b1111'1111'1111U) != 0; if (TooLarge && !LSL12 && ((Imm >> 12) & ~0b1111'1111'1111U) == 0) { // We can convert an immediate TooLarge = false; LSL12 = true; Imm >>= 12; } LOGMAN_THROW_A_FMT(TooLarge == false, "Imm amount too large: 0x{:x}", Imm); const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0; uint32_t Instr = Op; Instr |= SF; Instr |= LSL12 << 22; Instr |= Imm << 10; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } // Min/max immediate void MinMaxImmediate(uint32_t opc, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint64_t Imm) { const uint32_t SF = s == ARMEmitter::Size::i64Bit ? 
(1U << 31) : 0; uint32_t Instr = 0b1'0001'11U << 22; Instr |= SF; Instr |= opc << 18; Instr |= (Imm & 0xFF) << 10; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } // Move Wide void DataProcessing_MoveWide(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, uint32_t Imm, uint32_t Offset) { const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0; uint32_t Instr = Op; Instr |= SF; Instr |= Imm << 5; Instr |= Offset << 21; Instr |= Encode_rd(rd); dc32(Instr); } // Logical immediate void DataProcessing_Logical_Imm(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, uint32_t n, uint32_t immr, uint32_t imms) { const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0; uint32_t Instr = Op; Instr |= SF; Instr |= n << 22; Instr |= immr << 16; Instr |= imms << 10; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } void xbfiz_helper(bool is_signed, ARMEmitter::Size s, Register rd, Register rn, uint32_t lsb, uint32_t width) { const auto lsb_p_width = lsb + width; const auto reg_size_bits = RegSizeInBits(s); LOGMAN_THROW_A_FMT(lsb_p_width <= reg_size_bits, "lsb + width ({}) must be <= {}. lsb={}, width={}", lsb_p_width, reg_size_bits, lsb, width); LOGMAN_THROW_A_FMT(width >= 1, "xbfiz width must be >= 1"); const auto immr = (reg_size_bits - lsb) & (reg_size_bits - 1); const auto imms = width - 1; if (is_signed) { sbfm(s, rd, rn, immr, imms); } else { ubfm(s, rd, rn, immr, imms); } } void DataProcessing_Extract(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, uint32_t Imm) { const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0; // Current ARMv8 spec hardcodes SF == N for this class of instructions. // Anythign else is undefined behaviour. const uint32_t N = s == ARMEmitter::Size::i64Bit ? 
(1U << 22) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= N;
    Instr |= Encode_rm(rm);
    Instr |= Imm << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Data-processing - 2 source
  // Combines the caller's opcode with the size bit (bit 31 for 64-bit) and the three register fields.
  void DataProcessing_2Source(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= Encode_rm(rm);
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Data processing - 1 source
  // Same as the 2-source helper without an rm field; rd and rn share the register type T.
  // NOTE(review): the template parameter list appears to be missing in this view - confirm against the real file.
  template void DataProcessing_1Source(uint32_t Op, ARMEmitter::Size s, T rd, T rn) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // AddSub - shifted register
  // Shared by the logical and add/sub shifted-register emitters.
  // amt must fit in six bits, and must be below 32 when the operand size is 32-bit.
  // The shift type is placed at bit 22 and the amount at bit 10.
  // NOTE(review): the target type of the static_cast on amt appears to be missing in this view.
  void DataProcessing_Shifted_Reg(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm,
                                  ARMEmitter::ShiftType Shift, uint32_t amt) {
    LOGMAN_THROW_A_FMT((amt & ~0b11'1111U) == 0, "Shift amount too large");
    if (s == ARMEmitter::Size::i32Bit) {
      LOGMAN_THROW_A_FMT(amt < 32, "Shift amount for 32-bit must be below 32");
    }

    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= FEXCore::ToUnderlying(Shift) << 22;
    Instr |= Encode_rm(rm);
    Instr |= static_cast(amt) << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // AddSub - extended register
  void DataProcessing_Extended_Reg(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ?
(1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= Encode_rm(rm);
    // The extend option sits at bit 13, directly above the shift amount at bit 10.
    // This helper does not range-check Shift itself.
    // NOTE(review): the target type of the static_cast on Shift appears to be missing in this view.
    Instr |= FEXCore::ToUnderlying(Option) << 13;
    Instr |= static_cast(Shift) << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Conditional compare - register
  // Flag-setting form: o1 goes to bit 30, the condition to bit 12, o2 to bit 10, o3 to bit 4,
  // and the status-flag value occupies the lowest bits. `rm` is of type T and is passed to Encode_rm.
  // NOTE(review): the template parameter list appears to be missing in this view.
  template void ConditionalCompare(uint32_t Op, uint32_t o1, uint32_t o2, uint32_t o3, ARMEmitter::Size s, ARMEmitter::Register rn, T rm,
                                   ARMEmitter::StatusFlags flags, ARMEmitter::Condition Cond) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= o1 << 30;
    Instr |= Encode_rm(rm);
    Instr |= FEXCore::ToUnderlying(Cond) << 12;
    Instr |= o2 << 10;
    Instr |= Encode_rn(rn);
    Instr |= o3 << 4;
    Instr |= FEXCore::ToUnderlying(flags);
    dc32(Instr);
  }

  // Destination-register form used by the conditional-select emitters: same layout as above,
  // but with an rd field in place of the o3 and flags fields.
  // NOTE(review): the template parameter list appears to be missing in this view.
  template void ConditionalCompare(uint32_t Op, uint32_t o1, uint32_t o2, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, T rm,
                                   ARMEmitter::Condition Cond) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= o1 << 30;
    Instr |= Encode_rm(rm);
    Instr |= FEXCore::ToUnderlying(Cond) << 12;
    Instr |= o2 << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Data-processing - 3 source
  void DataProcessing_3Source(uint32_t Op, uint32_t Op0, ARMEmitter::Size s, ARMEmitter::Register rd, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::Register ra) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ?
(1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= Encode_rm(rm);
    // Op0 is the add/subtract selector the 3-source emitters pass as 0 or 1.
    Instr |= Op0 << 15;
    Instr |= Encode_ra(ra);
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Shared body for setf8/setf16: `size` is placed at bit 14 and rn at bit 5 on top of `op`.
  void EvaluateIntoFlags(uint32_t op, uint32_t size, WRegister rn) {
    uint32_t Instr = op;
    Instr |= size << 14;
    Instr |= rn.Idx() << 5;
    dc32(Instr);
  }

#ifndef INCLUDED_BY_EMITTER
}; // struct LoadstoreEmitterOps
} // namespace ARMEmitter
#endif

================================================
FILE: CodeEmitter/CodeEmitter/ASIMDOps.inl
================================================
// SPDX-License-Identifier: MIT
/* ASIMD instruction emitters.
 *
 * This contains emitters for vector operations explicitly.
 * Most instructions have a `SubRegSize` as their first argument to select element size while operating.
 * Additionally most emitters accept templated vector register arguments of both `QRegister` and `DRegister` types.
 * Based on the combination of those two arguments, it will emit an instruction operating on a 64-bit or 128-bit wide register
 * with the selected element size.
 *
 * Some vector operations are unsized and only operate at one width. In these cases the width
 * depends on the instruction.
 * The arguments for these instructions are usually `VRegister` but might be one of the other sized types as well.
 *
 * Only two instructions support the `i128Bit` ElementSize.
*/
#pragma once
#ifndef INCLUDED_BY_EMITTER
// NOTE(review): the header name after #include is missing in this view - confirm against the real file.
#include
namespace ARMEmitter {
struct EmitterOps : Emitter {
#endif

public:
  // Data Processing -- Scalar Floating-Point and Advanced SIMD
  // Cryptographic AES
  // Each emitter forwards a 5-bit opcode selector to CryptoAES.
  void aese(VRegister rd, VRegister rn) {
    CryptoAES(0b00100, rd, rn);
  }
  void aesd(VRegister rd, VRegister rn) {
    CryptoAES(0b00101, rd, rn);
  }
  void aesmc(VRegister rd, VRegister rn) {
    CryptoAES(0b00110, rd, rn);
  }
  void aesimc(VRegister rd, VRegister rn) {
    CryptoAES(0b00111, rd, rn);
  }

  // Cryptographic three-register SHA
  // Each emitter forwards a 3-bit opcode selector to Crypto3RegSHA. The first three take rn as
  // an SRegister and pass its vector view.
  void sha1c(VRegister rd, SRegister rn, VRegister rm) {
    Crypto3RegSHA(0b000, rd, rn.V(), rm);
  }
  void sha1p(VRegister rd, SRegister rn, VRegister rm) {
    Crypto3RegSHA(0b001, rd, rn.V(), rm);
  }
  void sha1m(VRegister rd, SRegister rn, VRegister rm) {
    Crypto3RegSHA(0b010, rd, rn.V(), rm);
  }
  void sha1su0(VRegister rd, VRegister rn, VRegister rm) {
    Crypto3RegSHA(0b011, rd, rn, rm);
  }
  void sha256h(VRegister rd, VRegister rn, VRegister rm) {
    Crypto3RegSHA(0b100, rd, rn, rm);
  }
  void sha256h2(VRegister rd, VRegister rn, VRegister rm) {
    Crypto3RegSHA(0b101, rd, rn, rm);
  }
  void sha256su1(VRegister rd, VRegister rn, VRegister rm) {
    Crypto3RegSHA(0b110, rd, rn, rm);
  }

  // Cryptographic two-register SHA
  void sha1h(SRegister rd, SRegister rn) {
    Crypto2RegSHA(0b00000, rd.V(), rn.V());
  }
  void sha1su1(VRegister rd, VRegister rn) {
    Crypto2RegSHA(0b00001, rd, rn);
  }
  void sha256su0(VRegister rd, VRegister rn) {
    Crypto2RegSHA(0b00010, rd, rn);
  }

  // Advanced SIMD table lookup
  // Single-table forms. The first ASIMDTable argument is 1 for the QRegister destination and 0
  // for the DRegister destination; the fourth is 0 for tbl and 1 for tbx.
  // NOTE(review): the second and third arguments are presumably the op2 and table-length fields - confirm in ASIMDTable.
  void tbl(QRegister rd, QRegister rn, QRegister rm) {
    ASIMDTable(1, 0b00, 0b00, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbl(DRegister rd, QRegister rn, DRegister rm) {
    ASIMDTable(0, 0b00, 0b00, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbx(QRegister rd, QRegister rn, QRegister rm) {
    ASIMDTable(1, 0b00, 0b00, 0b1, rd.V(), rn.V(), rm.V());
  }
  void tbx(DRegister rd, QRegister rn, DRegister rm) {
    ASIMDTable(0, 0b00, 0b00, 0b1, rd.V(), rn.V(), rm.V());
  }

  void tbl(QRegister rd, QRegister rn, QRegister rn2, QRegister rm) {
LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2), "rn and rn2 must be sequential");
    ASIMDTable(1, 0b00, 0b01, 0b0, rd.V(), rn.V(), rm.V());
  }

  // Two-table forms. Only the first table register (rn) is passed to ASIMDTable; the assertion
  // checks that the remaining table registers follow it sequentially.
  void tbl(DRegister rd, QRegister rn, QRegister rn2, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2), "rn and rn2 must be sequential");
    ASIMDTable(0, 0b00, 0b01, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbx(QRegister rd, QRegister rn, QRegister rn2, QRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2), "rn and rn2 must be sequential");
    ASIMDTable(1, 0b00, 0b01, 0b1, rd.V(), rn.V(), rm.V());
  }
  void tbx(DRegister rd, QRegister rn, QRegister rn2, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2), "rn and rn2 must be sequential");
    ASIMDTable(0, 0b00, 0b01, 0b1, rd.V(), rn.V(), rm.V());
  }

  // Three-table forms (third ASIMDTable argument 0b10).
  void tbl(QRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3), "rn, rn2, and rn3 must be sequential");
    ASIMDTable(1, 0b00, 0b10, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbl(DRegister rd, QRegister rn, QRegister rn2, QRegister rn3, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3), "rn, rn2, and rn3 must be sequential");
    ASIMDTable(0, 0b00, 0b10, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbx(QRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3), "rn, rn2, and rn3 must be sequential");
    ASIMDTable(1, 0b00, 0b10, 0b1, rd.V(), rn.V(), rm.V());
  }
  void tbx(DRegister rd, QRegister rn, QRegister rn2, QRegister rn3, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3), "rn, rn2, and rn3 must be sequential");
    ASIMDTable(0, 0b00, 0b10, 0b1, rd.V(), rn.V(), rm.V());
  }

  // Four-table forms (third ASIMDTable argument 0b11).
  void tbl(QRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rn4, QRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3, rn4), "rn, rn2, rn3, and rn4 must be sequential");
    ASIMDTable(1, 0b00, 0b11, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbl(DRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rn4, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3, rn4), "rn, rn2, rn3, and rn4 must be sequential");
    ASIMDTable(0, 0b00, 0b11, 0b0, rd.V(), rn.V(), rm.V());
  }
  void tbx(QRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rn4, QRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3, rn4), "rn, rn2, rn3, and rn4 must be sequential");
    ASIMDTable(1, 0b00, 0b11, 0b1, rd.V(), rn.V(), rm.V());
  }
  void tbx(DRegister rd, QRegister rn, QRegister rn2, QRegister rn3, QRegister rn4, DRegister rm) {
    LOGMAN_THROW_A_FMT(AreVectorsSequential(rn, rn2, rn3, rn4), "rn, rn2, rn3, and rn4 must be sequential");
    ASIMDTable(0, 0b00, 0b11, 0b1, rd.V(), rn.V(), rm.V());
  }

  // Advanced SIMD permute
  // Each permute has a QRegister and a DRegister overload. The first ASIMDPermute argument is 1
  // for the Q form and 0 for the D form; the third is a 3-bit selector for the operation.
  // The D forms reject 64-bit elements.
  void uzp1(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
    ASIMDPermute(1, size, 0b001, rd.V(), rn.V(), rm.V());
  }
  void uzp1(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute");
    ASIMDPermute(0, size, 0b001, rd.V(), rn.V(), rm.V());
  }
  void trn1(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
    ASIMDPermute(1, size, 0b010, rd.V(), rn.V(), rm.V());
  }
  void trn1(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute");
    ASIMDPermute(0, size, 0b010, rd.V(), rn.V(), rm.V());
  }
  void zip1(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
    ASIMDPermute(1, size, 0b011, rd.V(), rn.V(), rm.V());
  }
  void zip1(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute");
    ASIMDPermute(0, size, 0b011, rd.V(), rn.V(), rm.V());
  }
  void uzp2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
    ASIMDPermute(1, size, 0b101, rd.V(), rn.V(), rm.V());
  }
  void uzp2(SubRegSize size, DRegister rd, DRegister rn,
DRegister rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute"); ASIMDPermute(0, size, 0b101, rd.V(), rn.V(), rm.V()); } void trn2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) { ASIMDPermute(1, size, 0b110, rd.V(), rn.V(), rm.V()); } void trn2(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute"); ASIMDPermute(0, size, 0b110, rd.V(), rn.V(), rm.V()); } void zip2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) { ASIMDPermute(1, size, 0b111, rd.V(), rn.V(), rm.V()); } void zip2(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid 64-bit size on 64-bit permute"); ASIMDPermute(0, size, 0b111, rd.V(), rn.V(), rm.V()); } // Advanced SIMD extract void ext(QRegister rd, QRegister rn, QRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 16, "Index can't be more than 15"); ASIMDExtract(1, 0b00, Index, rd.V(), rn.V(), rm.V()); } void ext(DRegister rd, DRegister rn, DRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 8, "Index can't be more than 7"); ASIMDExtract(0, 0b00, Index, rd.V(), rn.V(), rm.V()); } // Advanced SIMD copy template void dup(SubRegSize size, T rd, T rn, uint32_t Index) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit dup"); } constexpr uint32_t Q = std::is_same_v ? 1 : 0; const uint32_t SizeImm = FEXCore::ToUnderlying(size); const uint32_t IndexShift = SizeImm + 1; const uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] const uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. 
Index={}, Max Index: {}", Index, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; ASIMDScalarCopy(Q, 0, imm5, 0b0000, rd.V(), rn.V()); } template void dup(SubRegSize size, T rd, Register rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit dup"); } constexpr uint32_t Q = std::is_same_v ? 1 : 0; // Upper bits of imm5 are ignored for GPR dup const uint32_t imm5 = 1U << FEXCore::ToUnderlying(size); ASIMDScalarCopy(Q, 0, imm5, 0b0001, rd, ToVReg(rn)); } template requires (size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit) void smov(XRegister rd, VRegister rn, uint32_t Index) { static_assert(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit, "Unsupported smov size"); constexpr uint32_t SizeImm = FEXCore::ToUnderlying(size); constexpr uint32_t IndexShift = SizeImm + 1; constexpr uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] constexpr uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. Index={}, Max Index: {}", Index, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; ASIMDScalarCopy(1, 0, imm5, 0b0101, ToVReg(rd), rn); } template requires (size == SubRegSize::i8Bit || size == SubRegSize::i16Bit) void smov(WRegister rd, VRegister rn, uint32_t Index) { static_assert(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit, "Unsupported smov size"); constexpr uint32_t SizeImm = FEXCore::ToUnderlying(size); constexpr uint32_t IndexShift = SizeImm + 1; constexpr uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] constexpr uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. 
Index={}, Max Index: {}", Index, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; ASIMDScalarCopy(0, 0, imm5, 0b0101, ToVReg(rd), rn); } template void umov(Register rd, VRegister rn, uint32_t Index) { static_assert(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Unsupported umov size"); constexpr uint32_t Q = size == SubRegSize::i64Bit ? 1 : 0; constexpr uint32_t SizeImm = FEXCore::ToUnderlying(size); constexpr uint32_t IndexShift = SizeImm + 1; constexpr uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] constexpr uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. Index={}, Max Index: {}", Index, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; ASIMDScalarCopy(Q, 0, imm5, 0b0111, ToVReg(rd), rn); } void ins(SubRegSize size, VRegister rd, uint32_t Index, Register rn) { const uint32_t SizeImm = FEXCore::ToUnderlying(size); const uint32_t IndexShift = SizeImm + 1; const uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] const uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. Index={}, Max Index: {}", Index, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; ASIMDScalarCopy(1, 0, imm5, 0b0011, rd, ToVReg(rn)); } void ins(SubRegSize size, VRegister rd, uint32_t Index, VRegister rn, uint32_t Index2) { const uint32_t SizeImm = FEXCore::ToUnderlying(size); const uint32_t IndexShift = SizeImm + 1; const uint32_t ElementSize = 1U << SizeImm; [[maybe_unused]] const uint32_t MaxIndex = 128U / (ElementSize * 8); LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. Index={}, Max Index: {}", Index, MaxIndex); LOGMAN_THROW_A_FMT(Index2 < MaxIndex, "Index2 too large. 
Index2={}, Max Index: {}", Index2, MaxIndex); const uint32_t imm5 = (Index << IndexShift) | ElementSize; const uint32_t imm4 = Index2 << SizeImm; ASIMDScalarCopy(1, 0b10, imm5, imm4, rd, rn); } // Advanced SIMD three-register extension template void sdot(ARMEmitter::SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMDThreeRegisterExt(0, 0b0010, size, rm, rn, rd); } template void usdot(T rd, T rn, T rm) { ASIMDThreeRegisterExt(0, 0b0011, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void sqrdmlah(ARMEmitter::SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMDThreeRegisterExt(1, 0b0000, size, rm, rn, rd); } template void sqrdmlsh(ARMEmitter::SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMDThreeRegisterExt(1, 0b0001, size, rm, rn, rd); } template void udot(ARMEmitter::SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMDThreeRegisterExt(1, 0b0010, size, rm, rn, rd); } template void fcmla(ARMEmitter::SubRegSize size, T rd, T rn, T rm, ARMEmitter::Rotation Rot) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i8Bit, "8-bit subregsize not supported"); if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMDThreeRegisterExt(1, 0b1000 | FEXCore::ToUnderlying(Rot), size, rm, rn, rd); } template void fcadd(ARMEmitter::SubRegSize size, T rd, T rn, T rm, ARMEmitter::Rotation Rot) { LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i8Bit, "8-bit subregsize not supported"); if constexpr (std::is_same_v) { 
LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(Rot == ARMEmitter::Rotation::ROTATE_90 || Rot == ARMEmitter::Rotation::ROTATE_270, "Invalid rotation"); const uint32_t ConvertedRotation = Rot == ARMEmitter::Rotation::ROTATE_90 ? 0b00 : 0b10; ASIMDThreeRegisterExt(1, 0b1100 | ConvertedRotation, size, rm, rn, rd); } template void bfdot(T rd, T rn, T rm) { ASIMDThreeRegisterExt(1, 0b1111, ARMEmitter::SubRegSize::i16Bit, rm, rn, rd); } void bfmlalb(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(1, 0b1111, ARMEmitter::SubRegSize::i64Bit, rm.D(), rn.D(), rd.D()); } void bfmlalt(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(1, 0b1111, ARMEmitter::SubRegSize::i64Bit, rm.Q(), rn.Q(), rd.Q()); } void smmla(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(0, 0b0100, ARMEmitter::SubRegSize::i32Bit, rm.Q(), rn.Q(), rd.Q()); } void usmmla(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(0, 0b0101, ARMEmitter::SubRegSize::i32Bit, rm.Q(), rn.Q(), rd.Q()); } void bfmmla(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(1, 0b1101, ARMEmitter::SubRegSize::i16Bit, rm.Q(), rn.Q(), rd.Q()); } void ummla(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm) { ASIMDThreeRegisterExt(1, 0b0100, ARMEmitter::SubRegSize::i32Bit, rm.Q(), rn.Q(), rd.Q()); } // Advanced SIMD two-register miscellaneous template void rev64(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD2RegMisc(0, size, 0b00000, rd, rn); } template void rev16(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit, "Only 8-bit subregsize supported"); ASIMD2RegMisc(0, size, 0b00001, rd, 
rn); } ///< size is the destination size. ///< source size is the next size up. template void saddlp(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported"); const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1}; ASIMD2RegMisc(0, ConvertedSize, 0b00010, rd, rn); } template void suqadd(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b00011, rd, rn); } template void cls(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD2RegMisc(0, size, 0b00100, rd, rn); } template void cnt(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit, "Only 8-bit subregsize supported"); ASIMD2RegMisc(0, size, 0b00101, rd, rn); } ///< size is the destination size. ///< source size is the next size up. template void sadalp(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported"); const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1}; ASIMD2RegMisc(0, ConvertedSize, 0b00110, rd, rn); } template void sqabs(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b00111, rd, rn); } // Comparison against zero template void cmgt(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b01000, rd, rn); } // Comparison against zero template void cmeq(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b01001, rd, rn); } // Comparison against zero template void cmlt(SubRegSize size, T 
rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b01010, rd, rn); } template void abs(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } ASIMD2RegMisc(0, size, 0b01011, rd, rn); } ///< size is the destination size. ///< source size is the next size up. void xtn(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(0, size, 0b10010, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. void xtn2(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(0, size, 0b10010, rd.Q(), rn.Q()); } ///< size is the destination size. ///< source size is the next size up. void sqxtn(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(0, size, 0b10100, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. void sqxtn2(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(0, size, 0b10100, rd.Q(), rn.Q()); } ///< size is the destination size. ///< source size is the next size up. void fcvtn(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i16Bit, "Only 16-bit & 32-bit subregsize supported"); const auto ConvertedSize = size == SubRegSize::i32Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(0, ConvertedSize, 0b10110, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. 
void fcvtn2(SubRegSize size, VRegister rd, VRegister rn) {
  // Emits opcode 0b10110 through ASIMD2RegMisc on the Q views of rd/rn.
  // Only 16-bit and 32-bit destination sizes are accepted.
  LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i16Bit, "Only 16-bit & 32-bit subregsize supported");
  // The encoder is not given `size` directly: 32-bit maps to i16Bit, otherwise i8Bit.
  const auto ConvertedSize = size == SubRegSize::i32Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  ASIMD2RegMisc(0, ConvertedSize, 0b10110, rd.Q(), rn.Q());
}

///< size is the destination size.
///< source size is the next size down (destination must be 32-bit or 64-bit).
void fcvtl(SubRegSize size, VRegister rd, VRegister rn) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported");
  // Remap for the encoder: 64-bit destination -> i16Bit, 32-bit destination -> i8Bit.
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  // D views of the registers are used for this (non-"2") form.
  ASIMD2RegMisc(0, ConvertedSize, 0b10111, rd.D(), rn.D());
}

///< size is the destination size.
///< source size is the next size down (destination must be 32-bit or 64-bit).
void fcvtl2(SubRegSize size, VRegister rd, VRegister rn) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported");
  // Same remap as fcvtl; the only difference is that the Q views are passed.
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  ASIMD2RegMisc(0, ConvertedSize, 0b10111, rd.Q(), rn.Q());
}

// frintn, frintm, fcvtns, fcvtms, fcvtas and scvtf share one shape and differ
// only in the 5-bit opcode:
//  - a compile-time branch rejects 64-bit elements for the register type it
//    selects (presumably the D-register instantiation; confirm against the
//    template header),
//  - 16-bit elements are emitted through ASIMDTwoRegMiscFP16 (note that rn is
//    passed before rd to that helper),
//  - 32/64-bit elements are emitted through ASIMD2RegMisc with the size
//    remapped: 64-bit -> i16Bit, 32-bit -> i8Bit.
template void frintn(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11000, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11000, rd, rn);
  }
}

// Same shape as frintn; opcode 0b11001.
template void frintm(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11001, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11001, rd, rn);
  }
}

// Same shape as frintn; opcode 0b11010.
template void fcvtns(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11010, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11010, rd, rn);
  }
}

// Same shape as frintn; opcode 0b11011.
template void fcvtms(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11011, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11011, rd, rn);
  }
}

// Same shape as frintn; opcode 0b11100.
template void fcvtas(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11100, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11100, rd, rn);
  }
}

// Same shape as frintn; opcode 0b11101.
template void scvtf(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 0, 0b11101, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD2RegMisc(0, ConvertedSize, 0b11101, rd, rn);
  }
}

// frint32z / frint64z: no 16-bit path exists here, only 32-bit and 64-bit
// elements are accepted. The size is remapped the same way as in frintn
// (64-bit -> i16Bit, 32-bit -> i8Bit) before being handed to ASIMD2RegMisc.
template void frint32z(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  ASIMD2RegMisc(0, ConvertedSize, 0b11110, rd, rn);
}

// Same shape as frint32z; opcode 0b11111.
template void frint64z(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  ASIMD2RegMisc(0, ConvertedSize, 0b11111, rd, rn);
}

// fcmgt, fcmeq, fcmlt, fabs, frintp, frintz, fcvtps and fcvtzs share a second
// shape. It differs from the frintn family in two ways: the FP16 helper is
// called with (0, 1, opcode, ...) instead of (0, 0, opcode, ...), and for
// 32/64-bit elements `size` is passed to ASIMD2RegMisc unchanged.
template void fcmgt(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b01100, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b01100, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b01101.
template void fcmeq(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b01101, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b01101, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b01110.
template void fcmlt(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b01110, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b01110, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b01111.
template void fabs(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b01111, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b01111, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b11000.
template void frintp(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b11000, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b11000, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b11001.
template void frintz(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b11001, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b11001, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b11010.
template void fcvtps(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b11010, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b11010, rd, rn);
  }
}

// Same shape as fcmgt; opcode 0b11011.
template void fcvtzs(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b11011, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b11011, rd, rn);
  }
}

// Only 32-bit elements are accepted; `size` is forwarded unchanged with opcode 0b11100.
template void urecpe(SubRegSize size, T rd, T rn) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit, "Only 32-bit subregsize supported");
  ASIMD2RegMisc(0, size, 0b11100, rd, rn);
}

// Same emission paths as fcmgt (opcode 0b11101).
// NOTE(review): unlike the neighbouring float emitters, there is no
// compile-time 64-bit rejection here; confirm whether that is intentional.
template void frecpe(SubRegSize size, T rd, T rn) {
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDTwoRegMiscFP16(0, 1, 0b11101, rn, rd);
  } else {
    ASIMD2RegMisc(0, size, 0b11101, rd, rn);
  }
}

// Accepts 8-bit and 16-bit elements only. This is the first emitter in this
// run that passes 1 (rather than 0) as the leading ASIMD2RegMisc argument.
template void rev32(SubRegSize size, T rd, T rn) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit, "Only 8-bit & 16-bit subregsize supported");
  ASIMD2RegMisc(1, size, 0b00000, rd, rn);
}

///< size is the destination size.
///< source size is the next size down.
template void uaddlp(SubRegSize size, T rd, T rn) {
  // `size` names the destination element size, so 8-bit is rejected: the value
  // handed to the encoder is one SubRegSize step smaller than `size`.
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD2RegMisc(1, ConvertedSize, 0b00010, rd, rn);
}

// Forwards `size` unchanged with opcode 0b00011. The compile-time branch
// rejects 64-bit elements for the register type it selects (presumably the
// D-register instantiation; confirm against the template header).
template void usqadd(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  ASIMD2RegMisc(1, size, 0b00011, rd, rn);
}

// 64-bit elements are rejected for every register type; opcode 0b00100.
template void clz(SubRegSize size, T rd, T rn) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD2RegMisc(1, size, 0b00100, rd, rn);
}

///< size is the destination size.
///< source size is the next size down.
template void uadalp(SubRegSize size, T rd, T rn) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported");
  // Encoder receives the source element size, one step below the destination size.
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD2RegMisc(1, ConvertedSize, 0b00110, rd, rn);
}

// Forwards `size` unchanged with opcode 0b00111; 64-bit elements are rejected
// only inside the compile-time branch.
template void sqneg(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  ASIMD2RegMisc(1, size, 0b00111, rd, rn);
}

// Comparison against zero
// Two-operand form (rd, rn only); opcode 0b01000.
template void cmge(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  ASIMD2RegMisc(1, size, 0b01000, rd, rn);
}

// Comparison against zero
// Two-operand form (rd, rn only); opcode 0b01001.
template void cmle(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  ASIMD2RegMisc(1, size, 0b01001, rd, rn);
}

// Forwards `size` unchanged with opcode 0b01011; 64-bit elements are rejected
// only inside the compile-time branch.
template void neg(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  ASIMD2RegMisc(1, size, 0b01011, rd, rn);
}

///< size is the destination size.
///< source size is the next size up.
void sqxtun(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(1, size, 0b10010, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. void sqxtun2(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit destination subregsize not supported"); ASIMD2RegMisc(1, size, 0b10010, rd.Q(), rn.Q()); } ///< size is the destination size. ///< source size is the next size up. void shll(SubRegSize size, DRegister rd, DRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported"); const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1}; ASIMD2RegMisc(1, ConvertedSize, 0b10011, rd, rn); } ///< size is the destination size. ///< source size is the next size up. void shll2(SubRegSize size, QRegister rd, QRegister rn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported"); const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1}; ASIMD2RegMisc(1, ConvertedSize, 0b10011, rd, rn); } ///< size is the destination size. ///< source size is the next size up. void uqxtn(SubRegSize size, VRegister rd, VRegister rn) { ASIMD2RegMisc(1, size, 0b10100, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. void uqxtn2(SubRegSize size, VRegister rd, VRegister rn) { ASIMD2RegMisc(1, size, 0b10100, rd.Q(), rn.Q()); } ///< size is the destination size. ///< source size is the next size up. void fcvtxn(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit, "Only 32-bit subregsize supported"); const auto ConvertedSize = size == SubRegSize::i32Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b10110, rd.D(), rn.D()); } ///< size is the destination size. ///< source size is the next size up. 
void fcvtxn2(SubRegSize size, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit, "Only 32-bit subregsize supported"); const auto ConvertedSize = size == SubRegSize::i32Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b10110, rd.Q(), rn.Q()); } template void frinta(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11000, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11000, rd, rn); } } template void frintx(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11001, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11001, rd, rn); } } template void fcvtnu(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11010, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11010, rd, rn); } } template void fcvtmu(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11011, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? 
SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11011, rd, rn); } } template void fcvtau(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11100, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11100, rd, rn); } } template void ucvtf(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 0, 0b11101, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11101, rd, rn); } } template void frint32x(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported"); const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11110, rd, rn); } template void frint64x(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported"); const auto ConvertedSize = size == SubRegSize::i64Bit ? 
SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD2RegMisc(1, ConvertedSize, 0b11111, rd, rn); } template void not_(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit, "Only 8-bit subregsize supported"); ASIMD2RegMisc(1, SubRegSize::i8Bit, 0b00101, rd, rn); } template void mvn(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit, "Only 8-bit subregsize supported"); ASIMD2RegMisc(1, SubRegSize::i8Bit, 0b00101, rd, rn); } template void rbit(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit, "Only 8-bit subregsize supported"); ASIMD2RegMisc(1, SubRegSize::i16Bit, 0b00101, rd, rn); } template void fcmge(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b01100, rn, rd); } else { ASIMD2RegMisc(1, size, 0b01100, rd, rn); } } template void fcmle(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b01101, rn, rd); } else { ASIMD2RegMisc(1, size, 0b01101, rd, rn); } } template void fneg(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b01111, rn, rd); } else { ASIMD2RegMisc(1, size, 0b01111, rd, rn); } } template void frinti(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } 
LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b11001, rn, rd); } else { ASIMD2RegMisc(1, size, 0b11001, rd, rn); } } template void fcvtpu(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b11010, rn, rd); } else { ASIMD2RegMisc(1, size, 0b11010, rd, rn); } } template void fcvtzu(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b11011, rn, rd); } else { ASIMD2RegMisc(1, size, 0b11011, rd, rn); } } template void ursqrte(SubRegSize size, T rd, T rn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit, "Only 32-bit & 64-bit subregsize supported"); ASIMD2RegMisc(1, size, 0b11100, rd, rn); } template void frsqrte(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b11101, rn, rd); } else { ASIMD2RegMisc(1, size, 0b11101, rd, rn); } } template void fsqrt(SubRegSize size, T rd, T rn) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDTwoRegMiscFP16(1, 1, 0b11111, rn, rd); } else { ASIMD2RegMisc(1, size, 0b11111, rd, rn); } } // Advanced SIMD across lanes 
///< size is the destination size.
///< The size handed to ASIMDAcrossLanes is one step below it (underlying value - 1).
template void saddlv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMDAcrossLanes(0, ConvertedSize, 0b00011, rd, rn);
}

///< smaxv: size is forwarded unchanged; 64-bit is always rejected (U = 0, opcode 0b01010).
template void smaxv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit && size != SubRegSize::i64Bit, "32/64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Destination 64-bit subregsize unsupported");
  ASIMDAcrossLanes(0, size, 0b01010, rd, rn);
}

///< sminv: same checks as smaxv (U = 0, opcode 0b11010).
template void sminv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit && size != SubRegSize::i64Bit, "32/64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Destination 64-bit subregsize unsupported");
  ASIMDAcrossLanes(0, size, 0b11010, rd, rn);
}

///< addv: same checks as smaxv (U = 0, opcode 0b11011).
template void addv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit && size != SubRegSize::i64Bit, "32/64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Destination 64-bit subregsize unsupported");
  ASIMDAcrossLanes(0, size, 0b11011, rd, rn);
}

///< uaddlv: size is the destination size; same conversion as saddlv with U = 1 (opcode 0b00011).
template void uaddlv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Destination 8-bit subregsize unsupported");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMDAcrossLanes(1, ConvertedSize, 0b00011, rd, rn);
}

///< umaxv: same checks as smaxv with U = 1 (opcode 0b01010).
template void umaxv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit && size != SubRegSize::i64Bit, "32/64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Destination 64-bit subregsize unsupported");
  ASIMDAcrossLanes(1, size, 0b01010, rd, rn);
}

///< uminv: same checks as smaxv with U = 1 (opcode 0b11010).
template void uminv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit && size != SubRegSize::i64Bit, "32/64-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Destination 64-bit subregsize unsupported");
  ASIMDAcrossLanes(1, size, 0b11010, rd, rn);
}

///< fmaxnmv: 8-bit and 64-bit sizes are rejected. U is 0 for 16-bit and 1 otherwise (opcode 0b01100).
///< NOTE(review): the i64Bit arm of ConvertedSize looks unreachable while the check above
///< rejects i64Bit, so the value passed on is effectively i8Bit — confirm.
template void fmaxnmv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit, "32-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i64Bit, "Destination 8/64-bit subregsize unsupported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  const auto U = size == SubRegSize::i16Bit ? 0 : 1;
  ASIMDAcrossLanes(U, ConvertedSize, 0b01100, rd, rn);
}

///< fmaxv: same shape as fmaxnmv with opcode 0b01111.
template void fmaxv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit, "32-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i64Bit, "Destination 8/64-bit subregsize unsupported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
  const auto U = size == ARMEmitter::SubRegSize::i16Bit ? 0 : 1;
  ASIMDAcrossLanes(U, ConvertedSize, 0b01111, rd, rn);
}

///< fminnmv: same checks as fmaxnmv and the same opcode (0b01100); the difference is the
///< size passed on, which is i32Bit here (i64Bit only in the arm the check above excludes).
template void fminnmv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit, "32-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i64Bit, "Destination 8/64-bit subregsize unsupported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i64Bit : SubRegSize::i32Bit;
  const auto U = size == SubRegSize::i16Bit ? 0 : 1;
  ASIMDAcrossLanes(U, ConvertedSize, 0b01100, rd, rn);
}

///< fminv: same shape as fminnmv with opcode 0b01111.
template void fminv(SubRegSize size, T rd, T rn) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i32Bit, "32-bit subregsize not supported");
  }
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i64Bit, "Destination 8/64-bit subregsize unsupported");
  const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i64Bit : SubRegSize::i32Bit;
  const auto U = size == SubRegSize::i16Bit ? 0 : 1;
  ASIMDAcrossLanes(U, ConvertedSize, 0b01111, rd, rn);
}

// Advanced SIMD three different
// TODO: Double check narrowing op size limits.
// TODO: Don't enforce DRegister/QRegister for Q check
// In this section every function comes as a DRegister overload and a QRegister "2" overload
// that pass identical arguments to ASIMD3Different(U, opcode, size, rd, rn, rm).
// Functions marked "Size is dest size" pass on a size one step below the one given.

///< Size is dest size
void saddl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void saddl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void saddw(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void saddw2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void ssubl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void ssubl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void ssubw(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0011, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void ssubw2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0011, ConvertedSize, rd, rn, rm);
}
///< addhn: size is forwarded unchanged; 64-bit is rejected.
void addhn(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(0, 0b0100, size, rd, rn, rm);
}
///< addhn2: size is forwarded unchanged; 64-bit is rejected.
void addhn2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(0, 0b0100, size, rd, rn, rm);
}
///< Size is dest size
void sabal(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0101, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void sabal2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0101, ConvertedSize, rd, rn, rm);
}
///< subhn: size is forwarded unchanged; 64-bit is rejected.
void subhn(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(0, 0b0110, size, rd, rn, rm);
}
///< subhn2: size is forwarded unchanged; 64-bit is rejected.
void subhn2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(0, 0b0110, size, rd, rn, rm);
}
///< Size is dest size
void sabdl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0111, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void sabdl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b0111, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smlal(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smlal2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmlal(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmlal2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smlsl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smlsl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmlsl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1011, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmlsl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1011, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smull(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1100, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void smull2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1100, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmull(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1101, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (both 8-bit and 16-bit destinations are rejected)
void sqdmull2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "No 8/16-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1101, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (only i16Bit and i128Bit destinations are accepted)
void pmull(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i128Bit, "Only 16-bit and 128-bit destination supported");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1110, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (only i16Bit and i128Bit destinations are accepted)
void pmull2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i128Bit, "Only 16-bit and 128-bit destination supported");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(0, 0b1110, ConvertedSize, rd, rn, rm);
}
// From here on the three-different helpers pass 1 as the first ASIMD3Different argument.
///< Size is dest size
void uaddl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void uaddl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void uaddw(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void uaddw2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0001, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void usubl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void usubl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void usubw(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0011, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void usubw2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0011, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (forwarded unchanged; 64-bit is rejected)
void raddhn(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(1, 0b0100, size, rd, rn, rm);
}
///< Size is dest size (forwarded unchanged; 64-bit is rejected)
void raddhn2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(1, 0b0100, size, rd, rn, rm);
}
///< Size is dest size
void uabal(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0101, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void uabal2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0101, ConvertedSize, rd, rn, rm);
}
///< Size is dest size (forwarded unchanged; 64-bit is rejected)
void rsubhn(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(1, 0b0110, size, rd, rn, rm);
}
///< Size is dest size (forwarded unchanged; 64-bit is rejected)
void rsubhn2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "No 64-bit dest support.");
  ASIMD3Different(1, 0b0110, size, rd, rn, rm);
}
///< Size is dest size
void uabdl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0111, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void uabdl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b0111, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umlal(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umlal2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1000, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umlsl(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umlsl2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1010, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umull(SubRegSize size, DRegister rd, DRegister rn, DRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1100, ConvertedSize, rd, rn, rm);
}
///< Size is dest size
void umull2(SubRegSize size, QRegister rd, QRegister rn, QRegister rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support.");
  const auto ConvertedSize = SubRegSize {FEXCore::ToUnderlying(size) - 1};
  ASIMD3Different(1, 0b1100, ConvertedSize, rd, rn, rm);
}

// Advanced SIMD three same
// Each function below forwards to ASIMD3Same(U, size, opcode, rd, rn, rm) with U = 0.
// Two validation shapes appear: an unconditional rejection of i64Bit, and a rejection that
// only applies inside an if-constexpr branch.

///< shadd: 64-bit elements are rejected for every T (opcode 0b00000).
template void shadd(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b00000, rd, rn, rm);
}
///< sqadd: 64-bit elements are rejected only inside the if-constexpr branch (opcode 0b00001).
template void sqadd(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit sqadd");
  }
  ASIMD3Same(0, size, 0b00001, rd, rn, rm);
}
///< srhadd: 64-bit elements are rejected for every T (opcode 0b00010).
template void srhadd(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b00010, rd, rn, rm);
}
///< shsub: 64-bit elements are rejected for every T (opcode 0b00100).
template void shsub(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b00100, rd, rn, rm);
}
///< sqsub: conditional 64-bit rejection (opcode 0b00101).
template void sqsub(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit sqsub");
  }
  ASIMD3Same(0, size, 0b00101, rd, rn, rm);
}
///< cmgt: conditional 64-bit rejection (opcode 0b00110).
template void cmgt(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit cmgt");
  }
  ASIMD3Same(0, size, 0b00110, rd, rn, rm);
}
///< cmge: conditional 64-bit rejection (opcode 0b00111).
template void cmge(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit cmge");
  }
  ASIMD3Same(0, size, 0b00111, rd, rn, rm);
}
///< sshl: conditional 64-bit rejection (opcode 0b01000).
template void sshl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit sshl");
  }
  ASIMD3Same(0, size, 0b01000, rd, rn, rm);
}
///< sqshl: conditional 64-bit rejection (opcode 0b01001).
template void sqshl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit sqshl");
  }
  ASIMD3Same(0, size, 0b01001, rd, rn, rm);
}
///< srshl: conditional 64-bit rejection (opcode 0b01010).
template void srshl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit srshl");
  }
  ASIMD3Same(0, size, 0b01010, rd, rn, rm);
}
///< sqrshl: conditional 64-bit rejection (opcode 0b01011).
template void sqrshl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit sqrshl");
  }
  ASIMD3Same(0, size, 0b01011, rd, rn, rm);
}
///< smax: 64-bit elements are rejected for every T (opcode 0b01100).
template void smax(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b01100, rd, rn, rm);
}
///< smin: 64-bit elements are rejected for every T (opcode 0b01101).
template void smin(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b01101, rd, rn, rm);
}
///< sabd: 64-bit elements are rejected for every T (opcode 0b01110).
template void sabd(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b01110, rd, rn, rm);
}
///< saba: 64-bit elements are rejected for every T (opcode 0b01111).
template void saba(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b01111, rd, rn, rm);
}
///< add: conditional 64-bit rejection (opcode 0b10000).
template void add(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit add");
  }
  ASIMD3Same(0, size, 0b10000, rd, rn, rm);
}
///< cmtst: conditional 64-bit rejection (opcode 0b10001).
template void cmtst(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit cmtst");
  }
  ASIMD3Same(0, size, 0b10001, rd, rn, rm);
}
///< mla: 64-bit elements are rejected for every T (opcode 0b10010).
template void mla(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b10010, rd, rn, rm);
}
///< mul: 64-bit elements are rejected for every T (opcode 0b10011).
template void mul(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b10011, rd, rn, rm);
}
///< smaxp: 64-bit elements are rejected for every T (opcode 0b10100).
template void smaxp(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(0, size, 0b10100, rd, rn, rm);
}
template void sminp(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(0, size, 0b10101, rd, rn, rm); } template void sqdmulh(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "No 8-bit dest support."); LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(0, size, 0b10110, rd, rn, rm); } template void addp(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } ASIMD3Same(0, size, 0b10111, rd, rn, rm); } template void fmaxnm(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b000, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11000, rd, rn, rm); } } template void fmla(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b001, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? 
SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11001, rd, rn, rm); } } template void fadd(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b010, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11010, rd, rn, rm); } } template void fmulx(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b011, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11011, rd, rn, rm); } } template void fcmeq(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b100, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? 
SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11100, rd, rn, rm); } } template void fmax(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b110, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11110, rd, rn, rm); } } template void frecps(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 0, 0b111, rm, rn, rd); } else { const auto ConvertedSize = size == SubRegSize::i64Bit ? 
SubRegSize::i16Bit : SubRegSize::i8Bit; ASIMD3Same(0, ConvertedSize, 0b11111, rd, rn, rm); } } template void and_(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i8Bit, 0b00011, rd, rn, rm); } template void fmlal(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i8Bit, 0b11101, rd, rn, rm); } template void fmlal2(T rd, T rn, T rm) { ASIMD3Same(1, SubRegSize::i8Bit, 0b11001, rd, rn, rm); } template void bic(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i16Bit, 0b00011, rd, rn, rm); } template void fminnm(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 1, 0b000, rm, rn, rd); } else { ASIMD3Same(0, size, 0b11000, rd, rn, rm); } } template void fmls(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 1, 0b001, rm, rn, rd); } else { ASIMD3Same(0, size, 0b11001, rd, rn, rm); } } template void fsub(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 1, 0b010, rm, rn, rd); } else { ASIMD3Same(0, size, 0b11010, rd, rn, rm); } } template void fmin(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size 
== SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 1, 0b110, rm, rn, rd); } else { ASIMD3Same(0, size, 0b11110, rd, rn, rm); } } template void frsqrts(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(0, 1, 0b111, rm, rn, rd); } else { ASIMD3Same(0, size, 0b11111, rd, rn, rm); } } template void orr(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i32Bit, 0b00011, rd, rn, rm); } template void mov(T rd, T rn) { orr(rd, rn, rn); } template void fmlsl(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i32Bit, 0b11101, rd, rn, rm); } template void fmlsl2(T rd, T rn, T rm) { ASIMD3Same(1, SubRegSize::i32Bit, 0b11001, rd, rn, rm); } template void orn(T rd, T rn, T rm) { ASIMD3Same(0, SubRegSize::i64Bit, 0b00011, rd, rn, rm); } template void uhadd(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(1, size, 0b00000, rd, rn, rm); } template void uqadd(SubRegSize size, T rd, T rn, T rm) { ASIMD3Same(1, size, 0b00001, rd, rn, rm); } template void urhadd(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(1, size, 0b00010, rd, rn, rm); } template void uhsub(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(1, size, 0b00100, rd, rn, rm); } template void uqsub(SubRegSize size, T rd, T rn, T rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported"); ASIMD3Same(1, size, 0b00101, rd, rn, rm); } template void cmhi(SubRegSize size, T rd, T rn, T rm) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", 
// Remainder of cmhi; the assert it finishes starts on the previous line.
__func__);
  }
  ASIMD3Same(1, size, 0b00110, rd, rn, rm);
}

// NOTE(review): the `template` keywords below carry no parameter list and `std::is_same_v` carries no
// arguments -- this looks like extraction damage; confirm against the upstream file.

// Integer three-same emitters with leading bit 1. Two validation styles appear:
//   * inside an `if constexpr` guard: 64-bit elements are rejected only when the guard holds,
//   * unconditionally: 64-bit elements are always rejected.
template void cmhs(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  ASIMD3Same(1, size, 0b00111, rd, rn, rm);
}
template void ushl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  ASIMD3Same(1, size, 0b01000, rd, rn, rm);
}
// NOTE(review): uqshl and uqrshl reject 64-bit elements unconditionally, whereas ushl and urshl only
// do so under the `if constexpr` guard -- confirm this difference is intended.
template void uqshl(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01001, rd, rn, rm);
}
template void urshl(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  ASIMD3Same(1, size, 0b01010, rd, rn, rm);
}
template void uqrshl(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01011, rd, rn, rm);
}
template void umax(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01100, rd, rn, rm);
}
template void umin(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01101, rd, rn, rm);
}
template void uabd(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01110, rd, rn, rm);
}
template void uaba(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b01111, rd, rn, rm);
}
// sub continues on the next line of the chunk (the chunk boundary falls inside its assert message).
template void sub(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid
element size with 64-bit {}", __func__);
  }
  // End of sub, whose head is on the previous line of the chunk.
  ASIMD3Same(1, size, 0b10000, rd, rn, rm);
}

// NOTE(review): the `template` keywords below carry no parameter list and `std::is_same_v` carries no
// arguments -- this looks like extraction damage; confirm against the upstream file.

template void cmeq(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  ASIMD3Same(1, size, 0b10001, rd, rn, rm);
}
template void mls(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b10010, rd, rn, rm);
}
// pmul takes no element size; the size field is fixed to i8Bit.
template void pmul(T rd, T rn, T rm) {
  ASIMD3Same(1, SubRegSize::i8Bit, 0b10011, rd, rn, rm);
}
template void umaxp(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b10100, rd, rn, rm);
}
template void uminp(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b10101, rd, rn, rm);
}
// sqrdmulh accepts only 16-bit and 32-bit elements: both 8-bit and 64-bit are rejected.
template void sqrdmulh(SubRegSize size, T rd, T rn, T rm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit && size != SubRegSize::i8Bit, "8/64-bit subregsize not supported");
  ASIMD3Same(1, size, 0b10110, rd, rn, rm);
}

// fmaxnmp: 16-bit elements go through ASIMDThreeSameFP16 (operands passed as rm, rn, rd). For the
// other sizes the size field is remapped rather than forwarded: 64-bit -> i16Bit, otherwise i8Bit.
// The ternary that performs the remap is completed on the next line of the chunk.
template void fmaxnmp(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b000, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ?
// Remainder of fmaxnmp; the ternary it completes starts on the previous line.
SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11000, rd, rn, rm);
  }
}

// NOTE(review): the `template` keywords below carry no parameter list and `std::is_same_v` carries no
// arguments -- this looks like extraction damage; confirm against the upstream file.

// Floating-point three-same emitters with leading bit 1. 16-bit elements go through
// ASIMDThreeSameFP16 (operands passed as rm, rn, rd). For other sizes the size field is remapped
// before calling ASIMD3Same: 64-bit -> i16Bit, otherwise i8Bit.
template void faddp(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b010, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11010, rd, rn, rm);
  }
}
template void fmul(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b011, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11011, rd, rn, rm);
  }
}
// fcmge continues on the next line of the chunk.
template void fcmge(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b100, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ?
// Remainder of fcmge; the ternary it completes starts on the previous line.
SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11100, rd, rn, rm);
  }
}

// NOTE(review): the `template` keywords below carry no parameter list and `std::is_same_v` carries no
// arguments -- this looks like extraction damage; confirm against the upstream file.

// Same pattern as fcmge: 16-bit elements go through ASIMDThreeSameFP16 (operands passed as
// rm, rn, rd); otherwise the size field is remapped (64-bit -> i16Bit, else i8Bit) for ASIMD3Same.
template void facge(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b101, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11101, rd, rn, rm);
  }
}
template void fmaxp(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b110, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ? SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11110, rd, rn, rm);
  }
}
// fdiv continues on the next line of the chunk.
template void fdiv(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 0, 0b111, rm, rn, rd);
  } else {
    const auto ConvertedSize = size == SubRegSize::i64Bit ?
// Remainder of fdiv; the ternary it completes starts on the previous line.
SubRegSize::i16Bit : SubRegSize::i8Bit;
    ASIMD3Same(1, ConvertedSize, 0b11111, rd, rn, rm);
  }
}

// NOTE(review): the `template` keywords below carry no parameter list and `std::is_same_v` carries no
// arguments -- this looks like extraction damage; confirm against the upstream file.

// eor and bsl share opcode 0b00011 with leading bit 1; only the fixed size-field value differs.
template void eor(T rd, T rn, T rm) {
  ASIMD3Same(1, SubRegSize::i8Bit, 0b00011, rd, rn, rm);
}
template void bsl(T rd, T rn, T rm) {
  ASIMD3Same(1, SubRegSize::i16Bit, 0b00011, rd, rn, rm);
}

// Floating-point three-same emitters with leading bit 1. 16-bit elements go through
// ASIMDThreeSameFP16 (operands passed as rm, rn, rd); other sizes forward `size` unchanged to
// ASIMD3Same -- no size remapping happens in this group.
template void fminnmp(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 1, 0b000, rm, rn, rd);
  } else {
    ASIMD3Same(1, size, 0b11000, rd, rn, rm);
  }
}
template void fabd(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 1, 0b010, rm, rn, rd);
  } else {
    ASIMD3Same(1, size, 0b11010, rd, rn, rm);
  }
}
template void fcmgt(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 1, 0b100, rm, rn, rd);
  } else {
    ASIMD3Same(1, size, 0b11100, rd, rn, rm);
  }
}
template void facgt(SubRegSize size, T rd, T rn, T rm) {
  if constexpr (std::is_same_v) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__);
  }
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported");
  if (size == SubRegSize::i16Bit) {
    ASIMDThreeSameFP16(1, 1, 0b101, rm, rn, rd);
  } else {
    ASIMD3Same(1, size, 0b11101, rd, rn, rm);
  }
}
// fminp continues on the next line of the chunk.
template void fminp(SubRegSize size, T rd, T rn, T rm) {
  if
constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Only 16/32/64-bit subregsize supported"); if (size == SubRegSize::i16Bit) { ASIMDThreeSameFP16(1, 1, 0b110, rm, rn, rd); } else { ASIMD3Same(1, size, 0b11110, rd, rn, rm); } } template void bit(T rd, T rn, T rm) { ASIMD3Same(1, SubRegSize::i32Bit, 0b00011, rd, rn, rm); } template void bif(T rd, T rn, T rm) { ASIMD3Same(1, SubRegSize::i64Bit, 0b00011, rd, rn, rm); } // Advanced SIMD modified immediate // XXX: ORR - 32-bit/16-bit // XXX: MOVI - Shifting ones template void fmov(SubRegSize size, T rd, float Value) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Unsupported fmov size"); uint32_t op; uint32_t cmode = 0b1111; uint32_t o2; uint32_t Imm; if (size == SubRegSize::i16Bit) { LOGMAN_MSG_A_FMT("Unsupported"); FEX_UNREACHABLE; } else if (size == SubRegSize::i32Bit) { op = 0; o2 = 0; Imm = FP32ToImm8(Value); } else if (size == SubRegSize::i64Bit) { op = 1; o2 = 0; Imm = FP64ToImm8(Value); } else { LOGMAN_MSG_A_FMT("Invalid subregsize"); FEX_UNREACHABLE; } ASIMDModifiedImm(op, cmode, o2, Imm, rd); } // XXX: MVNI - Shifted immediate // XXX: BIC // void ASIMDModifiedImm(uint32_t Op, uint32_t op, uint32_t cmode, uint32_t o2, uint32_t imm, T rd) { template void movi(SubRegSize size, T rd, uint64_t Imm, uint16_t Shift = 0) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Unsupported movi size"); uint32_t cmode; uint32_t op; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Shift == 0, "8-bit can't have shift"); LOGMAN_THROW_A_FMT((Imm & ~0xFF) == 0, "Larger than 8-bit Imm not supported"); cmode = 0b1110; op = 0; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Shift == 
0 || Shift == 8, "Shift by invalid amount"); LOGMAN_THROW_A_FMT((Imm & ~0xFF) == 0, "Larger than 8-bit Imm not supported"); cmode = 0b1000 | (Shift ? 0b10 : 0b00); op = 0; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24, "Shift by invalid amount"); LOGMAN_THROW_A_FMT((Imm & ~0xFF) == 0, "Larger than 8-bit Imm not supported"); cmode = 0b0000 | ((Shift >> 3) << 1); op = 0; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Shift == 0, "64-bit can't have shift"); cmode = 0b1110; op = 1; // 64-bit movi doesn't behave like the smaller types // Each bit of the 8-bit imm encoding is expanded to a full 8-bits. // This gives us a full 64-bits for the final result but needs special handling. uint8_t NewImm {}; for (size_t i = 0; i < 8; ++i) { const size_t BitOffset = i * 8; uint8_t Section = (Imm >> BitOffset) & 0xFF; LOGMAN_THROW_A_FMT(Section == 0 || Section == 0xFF, "Invalid 64-bit constant encoding"); if (Section == 0xFF) { NewImm |= (1 << i); } } Imm = NewImm; } else { LOGMAN_MSG_A_FMT("Invalid subregsize"); FEX_UNREACHABLE; } ASIMDModifiedImm(op, cmode, 0, Imm, rd); } // Advanced SIMD shift by immediate template void sshr(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b00000, rn, rd); } template void ssra(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b00010, rn, rd); } template void srshr(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b00100, rn, rd); } template void srsra(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b00110, rn, rd); } template void shl(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b01010, rn, rd); } template void sqshl(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b01110, rn, rd); } ///< size is destination size void shrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10000, rn, rd); } ///< size is destination size void shrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10000, rn, rd); } ///< size is destination size void rshrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10001, rn, rd); } ///< size is destination size void rshrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10001, rn, rd); } ///< size is destination size void sqshrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10010, rn, rd); } ///< size is destination size void sqshrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10010, rn, rd); } ///< size is destination size void sqrshrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10011, rn, rd); } ///< size is destination size void sqrshrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - (Shift); const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10011, rn, rd); } ///< size is destination size void sshll(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); size = SubRegSize(FEXCore::ToUnderlying(size) - 1); const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(Shift < SubregSizeInBits, "Shift must not be larger than incoming element size"); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10100, rn, rd); } ///< size is destination size void sshll2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); size = SubRegSize(FEXCore::ToUnderlying(size) - 1); const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(Shift < SubregSizeInBits, "Shift must not be larger than incoming element size"); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. 
const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(0, immh, immb, 0b10100, rn, rd); } ///< size is destination size void sxtl(SubRegSize size, VRegister rd, VRegister rn) { sshll(size, rd.D(), rn.D(), 0); } ///< size is destination size void sxtl2(SubRegSize size, VRegister rd, VRegister rn) { sshll2(size, rd.Q(), rn.Q(), 0); } template void scvtf(SubRegSize size, T rd, T rn, uint32_t FractionalBits) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(FractionalBits < SubregSizeInBits, "FractionalBits must not be larger than incoming element size"); // fbits encoded a bit weirdly. // fbits = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedFractionalBits = (SubregSizeInBits * 2) - FractionalBits; const uint32_t immh = InvertedFractionalBits >> 3; const uint32_t immb = InvertedFractionalBits & 0b111; ASIMDShiftByImm(0, immh, immb, 0b11100, rn, rd); } template void fcvtzs(SubRegSize size, T rd, T rn, uint32_t FractionalBits) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(FractionalBits < SubregSizeInBits, "FractionalBits must not be larger than incoming element size"); // fbits encoded a bit weirdly. // fbits = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedFractionalBits = (SubregSizeInBits * 2) - FractionalBits; const uint32_t immh = InvertedFractionalBits >> 3; const uint32_t immb = InvertedFractionalBits & 0b111; ASIMDShiftByImm(0, immh, immb, 0b11111, rn, rd); } template void ushr(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b00000, rn, rd); } template void usra(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b00010, rn, rd); } template void urshr(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b00100, rn, rd); } template void ursra(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b00110, rn, rd); } template void sri(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b01000, rn, rd); } template void sli(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. 
const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b01010, rn, rd); } template void sqshlu(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b01100, rn, rd); } ///< size is destination size template void uqshl(SubRegSize size, T rd, T rn, uint32_t Shift) { if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b01110, rn, rd); } ///< size is destination size void sqshrun(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10000, rn, rd); } ///< size is destination size void sqshrun2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10000, rn, rd); } ///< size is destination size void sqrshrun(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10001, rn, rd); } ///< size is destination size void sqrshrun2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10001, rn, rd); } ///< size is destination size void uqshrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10010, rn, rd); } ///< size is destination size void uqshrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10010, rn, rd); } ///< size is destination size void uqrshrn(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10011, rn, rd); } ///< size is destination size void uqrshrn2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10011, rn, rd); } ///< size is destination size void ushll(SubRegSize size, DRegister rd, DRegister rn, uint32_t Shift) { size = SubRegSize(FEXCore::ToUnderlying(size) - 1); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10100, rn, rd); } ///< size is destination size void ushll2(SubRegSize size, QRegister rd, QRegister rn, uint32_t Shift) { size = SubRegSize(FEXCore::ToUnderlying(size) - 1); const size_t SubregSizeInBits = SubRegSizeInBits(size); // Shift encoded a bit weirdly. // shift = immh:immb - esize but immh is /also/ used for element size. const uint32_t InvertedShift = SubregSizeInBits + Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDShiftByImm(1, immh, immb, 0b10100, rn, rd); } void uxtl(SubRegSize size, DRegister rd, DRegister rn) { ushll(size, rd, rn, 0); } void uxtl2(SubRegSize size, QRegister rd, QRegister rn) { ushll2(size, rd, rn, 0); } template void ucvtf(SubRegSize size, T rd, T rn, uint32_t FractionalBits) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(FractionalBits < SubregSizeInBits, "FractionalBits must not be larger than incoming element size"); // fbits encoded a bit weirdly. // fbits = (esize * 2) - immh:immb but immh is /also/ used for element size. 
const uint32_t InvertedFractionalBits = (SubregSizeInBits * 2) - FractionalBits; const uint32_t immh = InvertedFractionalBits >> 3; const uint32_t immb = InvertedFractionalBits & 0b111; ASIMDShiftByImm(1, immh, immb, 0b11100, rn, rd); } template void fcvtzu(SubRegSize size, T rd, T rn, uint32_t FractionalBits) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size"); if constexpr (std::is_same_v) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Invalid element size with 64-bit {}", __func__); } const size_t SubregSizeInBits = SubRegSizeInBits(size); LOGMAN_THROW_A_FMT(FractionalBits < SubregSizeInBits, "FractionalBits must not be larger than incoming element size"); // fbits encoded a bit weirdly. // fbits = (esize * 2) - immh:immb but immh is /also/ used for element size. const uint32_t InvertedFractionalBits = (SubregSizeInBits * 2) - FractionalBits; const uint32_t immh = InvertedFractionalBits >> 3; const uint32_t immb = InvertedFractionalBits & 0b111; ASIMDShiftByImm(1, immh, immb, 0b11111, rn, rd); } // Advanced SIMD vector x indexed element ///< size is destination size void smlal(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0010, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void smlal2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0010, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } ///< size is destination size void sqdmlal(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0011, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void sqdmlal2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0011, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } ///< size is destination size void smlsl(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0110, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void smlsl2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0110, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } ///< size is destination size void sqdmlsl(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0111, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void sqdmlsl2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0111, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } template void mul(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1000, H, size, rm, rn, rd); } ///< size is destination size void smull(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1010, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void smull2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1010, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } ///< size is destination size void sqdmull(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1011, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void sqdmull2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1011, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } template void sqdmulh(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1100, H, size, rm, rn, rd); } template void sqrdmulh(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1101, H, size, rm, rn, rd); } template void sdot(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 4, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; ASIMDVectorXIndexedElement(0b0, L, M, 0b1110, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void sudot(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 4, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; ASIMDVectorXIndexedElement(0b0, L, M, 0b1111, H, ARMEmitter::SubRegSize::i8Bit, rm, rn, rd); } template void bfdot(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 4, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; ASIMDVectorXIndexedElement(0b0, L, M, 0b1111, H, ARMEmitter::SubRegSize::i16Bit, rm, rn, rd); } template void fmla(SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid destination size"); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; auto EncodedSubRegSize = size; if (size == SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; // ARM in their infinite wisdom decided to encode 16-bit as an 8-bit operation even though 16-bit was unallocated. 
EncodedSubRegSize = SubRegSize::i8Bit; } else if (size == SubRegSize::i32Bit) { // Index encoded in H:L H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } else { LOGMAN_THROW_A_FMT((std::is_same_v), "Can't encode DRegister with i64Bit"); // Index encoded in H H = Index; L = 0; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0001, H, EncodedSubRegSize, rm, rn, rd); } template void fmls(SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid destination size"); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; auto EncodedSubRegSize = size; if (size == SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; // ARM in their infinite wisdom decided to encode 16-bit as an 8-bit operation even though 16-bit was unallocated. EncodedSubRegSize = SubRegSize::i8Bit; } else if (size == SubRegSize::i32Bit) { // Index encoded in H:L H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } else { LOGMAN_THROW_A_FMT((std::is_same_v), "Can't encode DRegister with i64Bit"); // Index encoded in H H = Index; L = 0; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b0101, H, EncodedSubRegSize, rm, rn, rd); } template void fmul(SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid destination size"); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; auto EncodedSubRegSize = size; if (size == SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; // ARM in their infinite wisdom decided to encode 16-bit as an 8-bit operation even though 16-bit was unallocated. 
EncodedSubRegSize = SubRegSize::i8Bit; } else if (size == SubRegSize::i32Bit) { // Index encoded in H:L H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } else { LOGMAN_THROW_A_FMT((std::is_same_v), "Can't encode DRegister with i64Bit"); // Index encoded in H H = Index; L = 0; M = 0; } ASIMDVectorXIndexedElement(0b0, L, M, 0b1001, H, EncodedSubRegSize, rm, rn, rd); } template void fmlal(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b0, L, M, 0b0000, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void fmlal2(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b1, L, M, 0b1000, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void fmlsl(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b0, L, M, 0b0100, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void fmlsl2(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b1, L, M, 0b1100, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void usdot(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 4, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; ASIMDVectorXIndexedElement(0b0, L, M, 0b1111, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } void bfmlalb(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b0, L, M, 0b1111, H, ARMEmitter::SubRegSize::i64Bit, rm.D(), rn.D(), rd.D()); } void bfmlalt(ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); LOGMAN_THROW_A_FMT(Index < 8, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; ASIMDVectorXIndexedElement(0b0, L, M, 0b1111, H, ARMEmitter::SubRegSize::i64Bit, rm.Q(), rn.Q(), rd.Q()); } template void mla(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0000, H, size, rm, rn, rd); } ///< size is destination size void umlal(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0010, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void umlal2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0010, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } template void mls(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0100, H, size, rm, rn, rd); } ///< size is destination size void umlsl(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0110, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void umlsl2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b0110, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } ///< size is destination size void umull(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b1010, H, EncodedSubRegSize, rm.D(), rn.D(), rd.D()); } ///< size is destination size void umull2(ARMEmitter::SubRegSize size, ARMEmitter::VRegister rd, ARMEmitter::VRegister rn, ARMEmitter::VRegister rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i32Bit || size == ARMEmitter::SubRegSize::i64Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } const auto EncodedSubRegSize = ARMEmitter::SubRegSize(FEXCore::ToUnderlying(size) - 1); LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(EncodedSubRegSize), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i32Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b1010, H, EncodedSubRegSize, rm.Q(), rn.Q(), rd.Q()); } template void sqrdmlah(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; } ASIMDVectorXIndexedElement(0b1, L, M, 0b1101, H, size, rm, rn, rd); } template void udot(T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(Index < 4, "Index must be less than the source register size"); uint32_t H, L, M; // Index encoded in H:L // M overlaps rm register. H = (Index >> 1) & 1; L = (Index >> 0) & 1; M = 0; ASIMDVectorXIndexedElement(0b1, L, M, 0b1110, H, ARMEmitter::SubRegSize::i32Bit, rm, rn, rd); } template void sqrdmlsh(ARMEmitter::SubRegSize size, T rd, T rn, T rm, uint32_t Index) { LOGMAN_THROW_A_FMT(size == ARMEmitter::SubRegSize::i16Bit || size == ARMEmitter::SubRegSize::i32Bit, "Invalid destination size"); if (size == ARMEmitter::SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm.Idx() < 16, "Rm can't be v16-v31 with half source size"); } LOGMAN_THROW_A_FMT(Index < SubRegSizeInBits(size), "Index must be less than the source register size"); uint32_t H, L, M; if (size == ARMEmitter::SubRegSize::i16Bit) { // Index encoded in H:L:M H = (Index >> 2) & 1; L = (Index >> 1) & 1; M = (Index >> 0) & 1; } else { // Index encoded in H:L // M overlaps rm register. 
H = (Index >> 1) & 1;
    L = (Index >> 0) & 1;
    M = 0;
  }

  ASIMDVectorXIndexedElement(0b1, L, M, 0b1111, H, size, rm, rn, rd);
}

// Cryptographic three-register, imm2
// Each wrapper forwards `index` plus a fixed 2-bit selector to Crypto3RegImm.
// NOTE(review): `index` is not range-checked here; presumably the imm2 field only holds 0-3, confirm in Crypto3RegImm.
void sm3tt1a(VRegister rd, VRegister rn, VRegister rm, uint32_t index) {
  Crypto3RegImm(index, 0b00, rm, rn, rd);
}
void sm3tt1b(VRegister rd, VRegister rn, VRegister rm, uint32_t index) {
  Crypto3RegImm(index, 0b01, rm, rn, rd);
}
void sm3tt2a(VRegister rd, VRegister rn, VRegister rm, uint32_t index) {
  Crypto3RegImm(index, 0b10, rm, rn, rd);
}
void sm3tt2b(VRegister rd, VRegister rn, VRegister rm, uint32_t index) {
  Crypto3RegImm(index, 0b11, rm, rn, rd);
}

// Cryptographic three-register SHA 512
// Each wrapper forwards a fixed (1-bit, 2-bit) selector pair to Crypto3RegSHA512.
void sha512h(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(0, 0b00, rm, rn, rd);
}
void sha512h2(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(0, 0b01, rm, rn, rd);
}
void sha512su1(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(0, 0b10, rm, rn, rd);
}
void rax1(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(0, 0b11, rm, rn, rd);
}
void sm3partw1(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(1, 0b00, rm, rn, rd);
}
void sm3partw2(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(1, 0b01, rm, rn, rd);
}
void sm4ekey(VRegister rd, VRegister rn, VRegister rm) {
  Crypto3RegSHA512(1, 0b10, rm, rn, rd);
}

// Cryptographic four-register
// Note the argument order handed to Crypto4Register: rm, ra, rn, rd.
void eor3(VRegister rd, VRegister rn, VRegister rm, VRegister ra) {
  Crypto4Register(0b00, rm, ra, rn, rd);
}
void bcax(VRegister rd, VRegister rn, VRegister rm, VRegister ra) {
  Crypto4Register(0b01, rm, ra, rn, rd);
}
void sm3ss1(VRegister rd, VRegister rn, VRegister rm, VRegister ra) {
  Crypto4Register(0b10, rm, ra, rn, rd);
}

// Cryptographic two-register SHA 512
void sha512su0(VRegister rd, VRegister rn) {
  Crypto2RegSHA512(0b00, rn, rd);
}
void sm4e(VRegister rd, VRegister rn) {
  Crypto2RegSHA512(0b01, rn, rd);
}

// Conversion between floating-point and fixed-point
void scvtf(ARMEmitter::ScalarRegSize
ScalarSize, ARMEmitter::VRegister rd, ARMEmitter::Size GPRSize, ARMEmitter::Register rn, uint32_t FractionalBits) {
    // Signed fixed-point GPR -> FP conversion (rmode=0b00, opcode=0b010).
    // FractionalBits must be in [1, width of the GPR in bits].
    LOGMAN_THROW_A_FMT(FractionalBits >= 1 && FractionalBits <= ARMEmitter::RegSizeInBits(GPRSize), "Fractional bits out of range");

    // The scale field carries 64 minus the number of fractional bits.
    uint32_t Scale = 64 - FractionalBits;
    // FP type field: i64Bit -> 0b01, i32Bit -> 0b00, i16Bit -> 0b11. Any other size falls through to 0.
    const auto ConvertedSize = ScalarSize == ARMEmitter::ScalarRegSize::i64Bit ? 0b01 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i32Bit ? 0b00 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i16Bit ? 0b11 :
                                                                                 0;

    ScalarConvertBetweenFPAndFixed(0, 0b00, 0b010, Scale, GPRSize, ConvertedSize, rn, rd);
  }

  // Unsigned fixed-point GPR -> FP conversion (rmode=0b00, opcode=0b011). Same operand rules as scvtf.
  void ucvtf(ARMEmitter::ScalarRegSize ScalarSize, ARMEmitter::VRegister rd, ARMEmitter::Size GPRSize, ARMEmitter::Register rn, uint32_t FractionalBits) {
    LOGMAN_THROW_A_FMT(FractionalBits >= 1 && FractionalBits <= ARMEmitter::RegSizeInBits(GPRSize), "Fractional bits out of range");

    uint32_t Scale = 64 - FractionalBits;
    const auto ConvertedSize = ScalarSize == ARMEmitter::ScalarRegSize::i64Bit ? 0b01 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i32Bit ? 0b00 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i16Bit ? 0b11 :
                                                                                 0;

    ScalarConvertBetweenFPAndFixed(0, 0b00, 0b011, Scale, GPRSize, ConvertedSize, rn, rd);
  }

  // FP -> signed fixed-point GPR conversion (rmode=0b11, opcode=0b000).
  // Here rd is the GPR destination and rn the FP source; FractionalBits is validated against GPRSize.
  void fcvtzs(ARMEmitter::Size GPRSize, ARMEmitter::Register rd, ARMEmitter::ScalarRegSize ScalarSize, ARMEmitter::VRegister rn, uint32_t FractionalBits) {
    LOGMAN_THROW_A_FMT(FractionalBits >= 1 && FractionalBits <= ARMEmitter::RegSizeInBits(GPRSize), "Fractional bits out of range");

    uint32_t Scale = 64 - FractionalBits;
    const auto ConvertedSize = ScalarSize == ARMEmitter::ScalarRegSize::i64Bit ? 0b01 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i32Bit ? 0b00 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i16Bit ? 0b11 :
                                                                                 0;

    ScalarConvertBetweenFPAndFixed(0, 0b11, 0b000, Scale, GPRSize, ConvertedSize, rn, rd);
  }

  // FP -> unsigned fixed-point GPR conversion (rmode=0b11, opcode=0b001).
  void fcvtzu(ARMEmitter::Size GPRSize, ARMEmitter::Register rd, ARMEmitter::ScalarRegSize ScalarSize, ARMEmitter::VRegister rn, uint32_t FractionalBits) {
    LOGMAN_THROW_A_FMT(FractionalBits >= 1 && FractionalBits <= ARMEmitter::RegSizeInBits(GPRSize), "Fractional bits out of range");

    uint32_t Scale = 64 - FractionalBits;
    const auto ConvertedSize = ScalarSize == ARMEmitter::ScalarRegSize::i64Bit ? 0b01 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i32Bit ? 0b00 :
                               ScalarSize == ARMEmitter::ScalarRegSize::i16Bit ? 0b11 :
                                                                                 0;

    ScalarConvertBetweenFPAndFixed(0, 0b11, 0b001, Scale, GPRSize, ConvertedSize, rn, rd);
  }

  // Conversion between floating-point and integer
  //
  // Every overload below forwards to ASIMDFloatConvBetweenInt(size, S, ptype, rmode, opcode, rd, rn).
  // `size` selects the GPR width. The FP register type picks ptype: HRegister=0b11, SRegister=0b00,
  // DRegister=0b01. rmode and opcode together identify the instruction.

  // fcvtns: rmode=0b00, opcode=0b000.
  void fcvtns(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b000, rd, ToReg(rn));
  }
  void fcvtns(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b000, rd, ToReg(rn));
  }
  void fcvtns(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b000, rd, ToReg(rn));
  }

  // fcvtnu: rmode=0b00, opcode=0b001.
  void fcvtnu(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b001, rd, ToReg(rn));
  }
  void fcvtnu(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b001, rd, ToReg(rn));
  }
  void fcvtnu(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b001, rd, ToReg(rn));
  }

  // scvtf (integer): rmode=0b00, opcode=0b010. The FP register is the destination here.
  void scvtf(ARMEmitter::Size size, HRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b010, ToReg(rd), rn);
  }
  void scvtf(ARMEmitter::Size size, SRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b010, ToReg(rd), rn);
  }
  void scvtf(ARMEmitter::Size size, DRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b010, ToReg(rd), rn);
  }

  // ucvtf (integer): rmode=0b00, opcode=0b011.
  void ucvtf(ARMEmitter::Size
size, HRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b011, ToReg(rd), rn);
  }
  void ucvtf(ARMEmitter::Size size, SRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b011, ToReg(rd), rn);
  }
  void ucvtf(ARMEmitter::Size size, DRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b011, ToReg(rd), rn);
  }

  // In all overloads of this group the FP register type selects ptype
  // (HRegister=0b11, SRegister=0b00, DRegister=0b01) and `size` selects the GPR width.

  // fcvtas: rmode=0b00, opcode=0b100.
  void fcvtas(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b100, rd, ToReg(rn));
  }
  void fcvtas(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b100, rd, ToReg(rn));
  }
  void fcvtas(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b100, rd, ToReg(rn));
  }

  // fcvtau: rmode=0b00, opcode=0b101.
  void fcvtau(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b101, rd, ToReg(rn));
  }
  void fcvtau(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b101, rd, ToReg(rn));
  }
  void fcvtau(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b101, rd, ToReg(rn));
  }

  // fmov FP -> GPR: opcode=0b110.
  void fmov(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b110, rd, ToReg(rn));
  }
  // A single-precision source may only be moved with a 32-bit GPR size.
  void fmov(ARMEmitter::Size size, Register rd, SRegister rn) {
    LOGMAN_THROW_A_FMT(size != ARMEmitter::Size::i64Bit, "Can't move SReg to 64-bit");
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b110, rd, ToReg(rn));
  }
  // A double-precision source may only be moved with a 64-bit GPR size.
  void fmov(ARMEmitter::Size size, Register rd, DRegister rn) {
    LOGMAN_THROW_A_FMT(size != ARMEmitter::Size::i32Bit, "Can't move DReg to 32-bit");
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b110, rd, ToReg(rn));
  }
  // Vector source. With Upper set, ptype=0b10 and rmode=0b01 are used (64-bit size required);
  // otherwise the encoding matches the DRegister form (ptype=0b01, rmode=0b00).
  void fmov(ARMEmitter::Size size, Register rd, VRegister rn, bool Upper) {
    if (Upper) {
      LOGMAN_THROW_A_FMT(size == ARMEmitter::Size::i64Bit, "Can only move upper with 64-bit elements");
    }
    ASIMDFloatConvBetweenInt(size, 0, Upper ? 0b10 : 0b01, Upper ? 0b01 : 0b00, 0b110, rd, ToReg(rn));
  }

  // fmov GPR -> FP: opcode=0b111. Same size restrictions as the FP -> GPR forms above.
  void fmov(ARMEmitter::Size size, HRegister rd, Register rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b00, 0b111, ToReg(rd), rn);
  }
  void fmov(ARMEmitter::Size size, SRegister rd, Register rn) {
    LOGMAN_THROW_A_FMT(size != ARMEmitter::Size::i64Bit, "Can't move SReg to 64-bit");
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b00, 0b111, ToReg(rd), rn);
  }
  void fmov(ARMEmitter::Size size, DRegister rd, Register rn) {
    LOGMAN_THROW_A_FMT(size != ARMEmitter::Size::i32Bit, "Can't move DReg to 32-bit");
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b00, 0b111, ToReg(rd), rn);
  }
  void fmov(ARMEmitter::Size size, VRegister rd, Register rn, bool Upper) {
    if (Upper) {
      LOGMAN_THROW_A_FMT(size == ARMEmitter::Size::i64Bit, "Can only move upper with 64-bit elements");
    }
    ASIMDFloatConvBetweenInt(size, 0, Upper ? 0b10 : 0b01, Upper ? 0b01 : 0b00, 0b111, ToReg(rd), rn);
  }

  // fcvtps: rmode=0b01, opcode=0b000.
  void fcvtps(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b01, 0b000, rd, ToReg(rn));
  }
  void fcvtps(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b01, 0b000, rd, ToReg(rn));
  }
  void fcvtps(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b01, 0b000, rd, ToReg(rn));
  }

  // fcvtpu: rmode=0b01, opcode=0b001.
  void fcvtpu(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b01, 0b001, rd, ToReg(rn));
  }
  void fcvtpu(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b01, 0b001, rd, ToReg(rn));
  }
  void fcvtpu(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b01, 0b001, rd, ToReg(rn));
  }

  // fcvtms: rmode=0b10, opcode=0b000.
  void fcvtms(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b10, 0b000, rd, ToReg(rn));
  }
  void fcvtms(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b10, 0b000, rd, ToReg(rn));
  }
  void fcvtms(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b10, 0b000, rd, ToReg(rn));
  }

  // fcvtmu: rmode=0b10, opcode=0b001.
  void fcvtmu(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b10, 0b001, rd, ToReg(rn));
  }
  void fcvtmu(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b10, 0b001, rd, ToReg(rn));
  }
  void fcvtmu(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b10, 0b001, rd, ToReg(rn));
  }

  // fcvtzs (integer): rmode=0b11, opcode=0b000.
  void fcvtzs(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b11, 0b000, rd, ToReg(rn));
  }
  void fcvtzs(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b11, 0b000, rd, ToReg(rn));
  }
  void fcvtzs(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b11, 0b000, rd, ToReg(rn));
  }
  // The VRegister form uses the same ptype (0b01) as the DRegister form.
  void fcvtzs(ARMEmitter::Size size, Register rd, VRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b11, 0b000, rd, ToReg(rn));
  }

  // fcvtzu (integer): rmode=0b11, opcode=0b001.
  void fcvtzu(ARMEmitter::Size size, Register rd, HRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b11, 0b11, 0b001, rd, ToReg(rn));
  }
  void fcvtzu(ARMEmitter::Size size, Register rd, SRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b00, 0b11, 0b001, rd, ToReg(rn));
  }
  void fcvtzu(ARMEmitter::Size size, Register rd, DRegister rn) {
    ASIMDFloatConvBetweenInt(size, 0, 0b01, 0b11, 0b001, rd, ToReg(rn));
  }

private:
  // NOTE(review): `template` and `std::is_same_v` below appear without their angle-bracket arguments;
  // they look stripped by extraction and are kept exactly as found - confirm against the real file.

  // Advanced SIMD three same (FP16)
  // Field layout: Q bit 30, U bit 29, a bit 23, Rm bits 20:16, opcode from bit 11, Rn bits 9:5, Rd bits 4:0.
  template void ASIMDThreeSameFP16(uint32_t U, uint32_t a, uint32_t opcode, T rm, T rn, T rd) {
    constexpr uint32_t Q = std::is_same_v ?
1U << 30 : 0;
    constexpr uint32_t Op = 0b0000'1110'0100'0000'0000'01 << 10;
    uint32_t Instr = Op;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= a << 23;
    Instr |= rm.Idx() << 16;
    Instr |= opcode << 11;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // NOTE(review): `template` and `std::is_same_v` in this section appear without their angle-bracket
  // arguments; they look stripped by extraction and are kept exactly as found - confirm.
  // In every templated encoder here, Q is either bit 30 set or zero depending on that type test.

  // Advanced SIMD two-register miscellaneous (FP16)
  // U bit 29, a bit 23, opcode from bit 12, Rn bits 9:5, Rd bits 4:0.
  template void ASIMDTwoRegMiscFP16(uint32_t U, uint32_t a, uint32_t opcode, T rn, T rd) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;
    constexpr uint32_t Op = 0b0000'1110'0111'1000'0000'10 << 10;
    uint32_t Instr = Op;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= a << 23;
    Instr |= opcode << 12;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Advanced SIMD three-register extension
  // U bit 29, size from bit 22, Rm bits 20:16, opcode from bit 11, Rn bits 9:5, Rd bits 4:0.
  template void ASIMDThreeRegisterExt(uint32_t U, uint32_t opcode, ARMEmitter::SubRegSize size, T rm, T rn, T rd) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;
    constexpr uint32_t Op = 0b0000'1110'0000'0000'1000'01 << 10;
    uint32_t Instr = Op;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= rm.Idx() << 16;
    Instr |= opcode << 11;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Cryptographic AES
  // opcode is placed from bit 12; register fields come from Encode_rn / Encode_rd.
  void CryptoAES(uint32_t opcode, VRegister rd, VRegister rn) {
    uint32_t Instr = 0b0100'1110'0010'1000'0000'10U << 10;

    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Cryptographic three-register SHA
  void Crypto3RegSHA(uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) {
    uint32_t Instr = 0b0101'1110'0000'0000'0000'00U << 10;

    Instr |= Encode_rm(rm);
    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Cryptographic two-register SHA
  void Crypto2RegSHA(uint32_t opcode, VRegister rd, VRegister rn) {
    uint32_t Instr = 0b0101'1110'0010'1000'0000'10U << 10;

    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD table lookup
  // Q is passed as a plain 0/1 value and shifted to bit 30 here; op2 from bit 22, len from bit 13, op bit 12.
  void ASIMDTable(uint32_t Q, uint32_t op2, uint32_t len, uint32_t op, VRegister rd, VRegister rn, VRegister rm) {
    uint32_t Instr = 0b0000'1110'000U << 21;

    Instr |= Q << 30;
    Instr |= op2 << 22;
    Instr |= Encode_rm(rm);
    Instr |= len << 13;
    Instr |= op << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD permute
  // Q is a plain 0/1 value; size from bit 22, opcode from bit 12.
  void ASIMDPermute(uint32_t Q, SubRegSize size, uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) {
    uint32_t Instr = 0b0000'1110'0000'0000'0000'10U << 10;

    Instr |= Q << 30;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= Encode_rm(rm);
    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD extract
  // Q is a plain 0/1 value; op2 from bit 22, imm4 from bit 11.
  void ASIMDExtract(uint32_t Q, uint32_t op2, uint32_t imm4, VRegister rd, VRegister rn, VRegister rm) {
    uint32_t Instr = 0b0010'1110'000U << 21;

    Instr |= Q << 30;
    Instr |= op2 << 22;
    Instr |= Encode_rm(rm);
    Instr |= imm4 << 11;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD two-register miscellaneous
  // U bit 29, size from bit 22, opcode from bit 12.
  template void ASIMD2RegMisc(uint32_t U, SubRegSize size, uint32_t opcode, T rd, T rn) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = 0b0000'1110'0010'0000'0000'10U << 10;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD across lanes
  // Same field layout as ASIMD2RegMisc with a different base opcode.
  template void ASIMDAcrossLanes(uint32_t U, SubRegSize size, uint32_t opcode, T rd, T rn) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = 0b0000'1110'0011'0000'0000'10U << 10;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD three different
  // U bit 29, size from bit 22, Rm via Encode_rm, opcode from bit 12.
  template void ASIMD3Different(uint32_t U, uint32_t opcode, SubRegSize size, T rd, T rn, T rm) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = 0b0000'1110'0010'0000'0000'00U << 10;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= Encode_rm(rm);
    Instr |= opcode << 12;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD three same
  // Note the opcode is placed from bit 11 here, unlike the "three different" encoder (bit 12).
  template void ASIMD3Same(uint32_t U, SubRegSize size, uint32_t opcode, T rd, T rn, T rm) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = 0b0000'1110'0010'0000'0000'01U << 10;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= Encode_rm(rm);
    Instr |= opcode << 11;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD modified immediate
  // The 8-bit immediate is split: imm bits 7:5 go to instruction bits 18:16, imm bits 4:0 to bits 9:5.
  template void ASIMDModifiedImm(uint32_t op, uint32_t cmode, uint32_t o2, uint32_t imm, T rd) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = 0b0000'1111'0000'0000'0000'01U << 10;

    Instr |= Q;
    Instr |= op << 29;
    Instr |= ((imm >> 7) & 1) << 18;
    Instr |= ((imm >> 6) & 1) << 17;
    Instr |= ((imm >> 5) & 1) << 16;
    Instr |= cmode << 12;
    Instr |= o2 << 11;
    Instr |= ((imm >> 4) & 1) << 9;
    Instr |= ((imm >> 3) & 1) << 8;
    Instr |= ((imm >> 2) & 1) << 7;
    Instr |= ((imm >> 1) & 1) << 6;
    Instr |= ((imm >> 0) & 1) << 5;
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Advanced SIMD shift by immediate
  template void ASIMDShiftByImm(uint32_t U, uint32_t immh, uint32_t immb, uint32_t opcode, T rn, T rd) {
    constexpr uint32_t Q = std::is_same_v ?
1U << 30 : 0;
    // immh == 0 is rejected; the remaining fields are immh from bit 19, immb from bit 16, opcode from bit 11.
    LOGMAN_THROW_A_FMT(immh != 0, "ImmH needs to not be zero");

    uint32_t Instr = 0b0000'1111'0000'0000'0000'01U << 10;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= immh << 19;
    Instr |= immb << 16;
    Instr |= opcode << 11;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // NOTE(review): `template` and `std::is_same_v` in this section appear without their angle-bracket
  // arguments; they look stripped by extraction and are kept exactly as found - confirm.

  // Advanced SIMD vector x indexed element
  // The element index arrives pre-split into H (bit 11), L (bit 21) and M (bit 20).
  // Rm is OR-ed in from bit 16, so a register index >= 16 lands on the same bit as M.
  template void ASIMDVectorXIndexedElement(uint32_t U, uint32_t L, uint32_t M, uint32_t opcode, uint32_t H, ARMEmitter::SubRegSize size, T rm, T rn, T rd) {
    constexpr uint32_t Op = 0b0000'1111'0000'0000'0000'00 << 10;
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = Op;

    Instr |= Q;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= L << 21;
    // M and Rm might overlap. It's up to the instruction emitter itself to ensure there is no conflict.
    Instr |= M << 20;
    Instr |= rm.Idx() << 16;
    Instr |= opcode << 12;
    Instr |= H << 11;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Cryptographic three-register, imm2: Rm from bit 16, 2-bit index from bit 12, opcode from bit 10.
  void Crypto3RegImm(uint32_t index, uint32_t opcode, VRegister rm, VRegister rn, VRegister rd) {
    LOGMAN_THROW_A_FMT(index <= 3, "index ({}) must be within [0-3]", index);

    uint32_t Instr = 0b1100'1110'0100'0000'1000'0000'0000'0000;
    Instr |= rm.Idx() << 16;
    Instr |= index << 12;
    Instr |= opcode << 10;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Cryptographic three-register SHA 512: Rm from bit 16, `o` at bit 14, opcode from bit 10.
  void Crypto3RegSHA512(uint32_t o, uint32_t opcode, VRegister rm, VRegister rn, VRegister rd) {
    uint32_t Instr = 0b1100'1110'0110'0000'1000'0000'0000'0000;
    Instr |= rm.Idx() << 16;
    Instr |= o << 14;
    Instr |= opcode << 10;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Cryptographic four-register: opcode from bit 21, Rm from bit 16, Ra from bit 10.
  void Crypto4Register(uint32_t opcode, VRegister rm, VRegister ra, VRegister rn, VRegister rd) {
    uint32_t Instr = 0b1100'1110'0000'0000'0000'0000'0000'0000;
    Instr |= opcode << 21;
    Instr |= rm.Idx() << 16;
    Instr |= ra.Idx() << 10;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Cryptographic two-register SHA 512: opcode from bit 10.
  void Crypto2RegSHA512(uint32_t opcode, VRegister rn, VRegister rd) {
    uint32_t Instr = 0b1100'1110'1100'0000'1000'0000'0000'0000;
    Instr |= opcode << 10;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Conversion between floating-point and fixed-point
  // SF (bit 31) is set for a 64-bit GPR. S bit 29, FP type (ScalarSize) from bit 22, rmode from bit 19,
  // opcode from bit 16, scale from bit 10. Callers pass scale already computed as 64 - fractional bits.
  template void ScalarConvertBetweenFPAndFixed(uint32_t S, uint32_t rmode, uint32_t opcode, uint32_t scale, ARMEmitter::Size GPRSize, uint32_t ScalarSize, T rn, T2 rd) {
    constexpr uint32_t Op = 0b0001'1110'000 << 21;

    const uint32_t SF = GPRSize == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= S << 29;
    Instr |= ScalarSize << 22;
    Instr |= rmode << 19;
    Instr |= opcode << 16;
    Instr |= scale << 10;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Conversion between floating-point and integer
  // SF (bit 31) is set for a 64-bit GPR. S bit 29, ptype from bit 22, rmode from bit 19, opcode from bit 16.
  // Both operands are passed as `Register`; callers convert FP registers with ToReg().
  void ASIMDFloatConvBetweenInt(ARMEmitter::Size s, uint32_t S, uint32_t ptype, uint32_t rmode, uint32_t opcode, Register rd, Register rn) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = 0b0001'1110'001U << 21;

    Instr |= SF;
    Instr |= S << 29;
    Instr |= ptype << 22;
    Instr |= rmode << 19;
    Instr |= opcode << 16;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);
    dc32(Instr);
  }

  // Load/store multiple structures. `Load` and `size` are not declared in the visible signature;
  // presumably they are template parameters lost together with the template parameter list - confirm.
  template void ASIMDLoadStoreMultipleStructure(uint32_t Op, uint32_t opcode, T rt, ARMEmitter::Register rn, ARMEmitter::Register rm) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;

    uint32_t Instr = Op;

    Instr |= Q;
    Instr |= Load ?
1 << 22 : 0;
    Instr |= Encode_rm(rm);
    // opcode is OR-ed in unshifted, so callers pass it already positioned.
    Instr |= opcode;
    Instr |= FEXCore::ToUnderlying(size) << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rt(rt);
    dc32(Instr);
  }

  // NOTE(review): `size`, `Count` and `Load` are used below but not declared in the visible signatures,
  // and `template` / `std::is_same_v` have no angle-bracket arguments; the template parameter lists
  // look stripped by extraction and are kept exactly as found - confirm against the real file.

  // Load/store single structure to/from one lane. `Index` selects the lane and is spread over the
  // Q, S and size fields depending on the element size.
  template void ASIMDSTLD(uint32_t Op, uint32_t Opcode, ARMEmitter::VRegister rt, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) {
    LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && Index < 16) || (size == SubRegSize::i16Bit && Index < 8) ||
                         (size == SubRegSize::i32Bit && Index < 4) || (size == SubRegSize::i64Bit && Index < 2),
                       "Invalid Index selected");

    uint32_t Q {};
    uint32_t S {};
    uint32_t Size {};

    // selem is the structure element count minus one (Count - 1), i.e. it distinguishes the
    // 1- to 4-register single structure operations
    // eg: ST1/2/3/4 or LD1/2/3/4
    // Its low bit is passed on as R, its high bit is merged into the opcode.
    constexpr uint32_t selem = Count - 1;
    const uint32_t opcode = Opcode | (selem >> 1);

    // Index is encoded as:
    // 8-bit: Q:S:size
    // 16-bit: Q:S:size<1>
    // 32-bit: Q:S
    // 64-bit: Q
    if constexpr (size == SubRegSize::i8Bit) {
      Q = ((Index & 0b1000) >> 3) << 30;
      S = ((Index & 0b0100) >> 2);
      Size = Index & 0b11;
    } else if constexpr (size == SubRegSize::i16Bit) {
      Q = ((Index & 0b0100) >> 2) << 30;
      S = ((Index & 0b0010) >> 1);
      Size = (Index & 0b1) << 1;
    } else if constexpr (size == SubRegSize::i32Bit) {
      Q = ((Index & 0b0010) >> 1) << 30;
      S = Index & 0b0001;
    } else if constexpr (size == SubRegSize::i64Bit) {
      Q = (Index & 0b0001) << 30;
      Size = 1;
    }

    // scale = opcode<2:1>
    // selem = opcode<0>:R + 1
    //
    // scale:
    // - 0
    //   - Index = Q:S:size - aka B[0-15]
    // - 1
    //   - Index = Q:S:size<1> - aka H[0-7]
    // - 2
    //   if (size == i32)
    //   - Index = Q:S - aka S[0-3]
    //   if (size == i64)
    //   - Index = Q - aka D[0-1]
    //   if (size == i128) undefined
    // - 3
    //   Load+Replicate
    //   scale = size
    ASIMDLoadStore(Op | Q, Load, selem & 1, opcode, S, Size, rt, rn, rm);
  }

  // Load/store single structure without a lane index: S is fixed to 0 and the size field is the
  // raw element size.
  template void ASIMDSTLD(uint32_t Op, uint32_t Opcode, T rt, ARMEmitter::Register rn, ARMEmitter::Register rm) {
    constexpr uint32_t Q = std::is_same_v ? 1U << 30 : 0;
    constexpr uint32_t S = 0;

    // selem is the structure element count minus one (Count - 1), i.e. it distinguishes the
    // 1- to 4-register single structure operations
    // eg: ST1/2/3/4 or LD1/2/3/4
    constexpr uint32_t selem = Count - 1;
    const uint32_t opcode = Opcode | (selem >> 1);

    // scale = opcode<2:1>
    // selem = opcode<0>:R + 1
    //
    // scale:
    // - 0
    //   - Index = Q:S:size - aka B[0-15]
    // - 1
    //   - Index = Q:S:size<1> - aka H[0-7]
    // - 2
    //   if (size == i32)
    //   - Index = Q:S - aka S[0-3]
    //   if (size == i64)
    //   - Index = Q - aka D[0-1]
    //   if (size == i128) undefined
    // - 3
    //   Load+Replicate
    //   scale = size
    ASIMDLoadStore(Op | Q, Load, selem & 1, opcode, S, FEXCore::ToUnderlying(size), rt, rn, rm);
  }

  // Final encoder for the single structure load/store forms above.
  // L bit 22, R bit 21, Rm via Encode_rm, opcode from bit 13, S bit 12, size from bit 10.
  void ASIMDLoadStore(uint32_t Op, uint32_t L, uint32_t R, uint32_t opcode, uint32_t S, uint32_t size, ARMEmitter::VRegister rt,
                      ARMEmitter::Register rn, ARMEmitter::Register rm) {
    uint32_t Instr = Op;

    Instr |= L << 22;
    Instr |= R << 21;
    Instr |= Encode_rm(rm);
    Instr |= opcode << 13;
    Instr |= S << 12;
    Instr |= size << 10;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rt(rt);
    dc32(Instr);
  }

#ifndef INCLUDED_BY_EMITTER
}; // struct LoadstoreEmitterOps
} // namespace ARMEmitter
#endif

================================================
FILE: CodeEmitter/CodeEmitter/BranchOps.inl
================================================
// SPDX-License-Identifier: MIT
/* Branch instruction emitters.
 *
 * Most of these instructions will use `BackwardLabel`, `ForwardLabel`, or `BiDirectionalLabel` to determine where a branch targets.
*/

#pragma once
#ifndef INCLUDED_BY_EMITTER
// NOTE(review): the include target is missing here; it looks stripped by extraction - confirm.
#include
namespace ARMEmitter {
struct EmitterOps : Emitter {
#endif

  // NOTE(review): `static_cast(...)` and `GetCursorAddress()` below appear without their
  // angle-bracket arguments; they look stripped by extraction and are kept exactly as found.
  //
  // Common pattern for every branch family in this file:
  //  - uint32_t Imm overload: emits the instruction with the given, already scaled, immediate.
  //  - BackwardLabel overload: computes the byte distance from the cursor to the label, emits only
  //    if it is in range and 4-byte aligned, otherwise emits nothing and returns Failure.
  //  - ForwardLabel overload: records a reference at the cursor, emits the instruction with a zero
  //    offset and always returns Success.
  //  - BiDirectionalLabel overload: uses the backward half if its Location is set, else the forward half.
  //
  // NOTE(review): the distance is stored in an int32_t before the range check, so a distance that
  // does not fit in 32 bits would wrap first - presumably buffers never get that large; confirm.

public:
  // Branches, Exception Generating and System instructions
public:
  // Conditional branch immediate
  ///< Branch conditional
  void b(ARMEmitter::Condition Cond, uint32_t Imm) {
    constexpr uint32_t Op = 0b0101'010 << 25;
    Branch_Conditional(Op, 0, 0, Cond, Imm);
  }

  // Accepted byte distances: [-1048576, 1048575], multiple of 4.
  [[nodiscard]] BranchEncodeSucceeded b(ARMEmitter::Condition Cond, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0101'010 << 25;
      Branch_Conditional(Op, 0, 0, Cond, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded b(ARMEmitter::Condition Cond, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::BC});
    constexpr uint32_t Op = 0b0101'010 << 25;
    Branch_Conditional(Op, 0, 0, Cond, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded b(ARMEmitter::Condition Cond, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return b(Cond, &Label->Backward);
    } else {
      return b(Cond, &Label->Forward);
    }
  }

  ///< Branch consistent conditional
  // Identical to b(Cond, ...) except that the Op0 argument (instruction bit 4) is 1.
  void bc(ARMEmitter::Condition Cond, uint32_t Imm) {
    constexpr uint32_t Op = 0b0101'010 << 25;
    Branch_Conditional(Op, 0, 1, Cond, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded bc(ARMEmitter::Condition Cond, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0101'010 << 25;
      Branch_Conditional(Op, 0, 1, Cond, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded bc(ARMEmitter::Condition Cond, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::BC});
    constexpr uint32_t Op = 0b0101'010 << 25;
    Branch_Conditional(Op, 0, 1, Cond, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded bc(ARMEmitter::Condition Cond, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return bc(Cond, &Label->Backward);
    } else {
      return bc(Cond, &Label->Forward);
    }
  }

  // Unconditional branch register
  // br / blr / ret differ only in the opc field (0b0'000, 0b0'001, 0b0'010).
  void br(ARMEmitter::Register rn) {
    constexpr uint32_t Op = 0b1101011 << 25 | 0b0'000 << 21 | // opc
                            0b1'1111 << 16 |                  // op2
                            0b0000'00 << 10 |                 // op3
                            0b0'0000;                         // op4

    UnconditionalBranch(Op, rn);
  }
  void blr(ARMEmitter::Register rn) {
    constexpr uint32_t Op = 0b1101011 << 25 | 0b0'001 << 21 | // opc
                            0b1'1111 << 16 |                  // op2
                            0b0000'00 << 10 |                 // op3
                            0b0'0000;                         // op4

    UnconditionalBranch(Op, rn);
  }
  // Defaults to returning through r30.
  void ret(ARMEmitter::Register rn = ARMEmitter::Reg::r30) {
    constexpr uint32_t Op = 0b1101011 << 25 | 0b0'010 << 21 | // opc
                            0b1'1111 << 16 |                  // op2
                            0b0000'00 << 10 |                 // op3
                            0b0'0000;                         // op4

    UnconditionalBranch(Op, rn);
  }

  // Unconditional branch immediate
  void b(uint32_t Imm) {
    constexpr uint32_t Op = 0b0001'01 << 26;

    UnconditionalBranch(Op, Imm);
  }

  // Accepted byte distances: [-134217728, 134217724], multiple of 4.
  [[nodiscard]] BranchEncodeSucceeded b(const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -134217728 && Imm <= 134217724 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0001'01 << 26;
      UnconditionalBranch(Op, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded b(ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::B});
    constexpr uint32_t Op = 0b0001'01 << 26;
    UnconditionalBranch(Op, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded b(BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return b(&Label->Backward);
    } else {
      return b(&Label->Forward);
    }
  }

  // Branch with link; same immediate layout and range as the unconditional b.
  void bl(uint32_t Imm) {
    constexpr uint32_t Op = 0b1001'01 << 26;

    UnconditionalBranch(Op, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded bl(const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -134217728 && Imm <= 134217724 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b1001'01 << 26;
      UnconditionalBranch(Op, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded bl(ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::B});
    constexpr uint32_t Op = 0b1001'01 << 26;
    UnconditionalBranch(Op, 0);
    // Forward label doesn't know if it can encode until Bind.
return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded bl(BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return bl(&Label->Backward);
    } else {
      return bl(&Label->Forward);
    }
  }

  // NOTE(review): `static_cast(...)` and `GetCursorAddress()` below appear without their
  // angle-bracket arguments; they look stripped by extraction and are kept exactly as found.
  //
  // Overload pattern used by each family below:
  //  - uint32_t Imm: emits with the given, already scaled, immediate.
  //  - BackwardLabel: emits only if the byte distance is in range and 4-byte aligned, else returns
  //    Failure without emitting.
  //  - ForwardLabel: records a reference at the cursor, emits with a zero offset, returns Success.
  //  - BiDirectionalLabel: uses the backward half if its Location is set, else the forward half.

  // Compare and branch
  // `s` selects the register width. Accepted byte distances: [-1048576, 1048575], multiple of 4.
  void cbz(ARMEmitter::Size s, ARMEmitter::Register rt, uint32_t Imm) {
    constexpr uint32_t Op = 0b0011'0100 << 24;

    CompareAndBranch(Op, s, rt, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded cbz(ARMEmitter::Size s, ARMEmitter::Register rt, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0011'0100 << 24;
      CompareAndBranch(Op, s, rt, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  // The forward reference is recorded with the same InstType (BC) as the conditional branches.
  [[nodiscard]] BranchEncodeSucceeded cbz(ARMEmitter::Size s, ARMEmitter::Register rt, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::BC});

    constexpr uint32_t Op = 0b0011'0100 << 24;
    CompareAndBranch(Op, s, rt, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded cbz(ARMEmitter::Size s, ARMEmitter::Register rt, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return cbz(s, rt, &Label->Backward);
    } else {
      return cbz(s, rt, &Label->Forward);
    }
  }

  void cbnz(ARMEmitter::Size s, ARMEmitter::Register rt, uint32_t Imm) {
    constexpr uint32_t Op = 0b0011'0101 << 24;

    CompareAndBranch(Op, s, rt, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded cbnz(ARMEmitter::Size s, ARMEmitter::Register rt, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0011'0101 << 24;
      CompareAndBranch(Op, s, rt, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded cbnz(ARMEmitter::Size s, ARMEmitter::Register rt, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::BC});

    constexpr uint32_t Op = 0b0011'0101 << 24;
    CompareAndBranch(Op, s, rt, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded cbnz(ARMEmitter::Size s, ARMEmitter::Register rt, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return cbnz(s, rt, &Label->Backward);
    } else {
      return cbnz(s, rt, &Label->Forward);
    }
  }

  // Test and branch immediate
  // `Bit` is the bit number to test. Accepted byte distances: [-32768, 32764], multiple of 4.
  // NOTE(review): Bit is not range-checked here; the encoder only keeps bit 5 and bits 4:0 of it,
  // so presumably callers guarantee Bit < 64 - confirm.
  void tbz(ARMEmitter::Register rt, uint32_t Bit, uint32_t Imm) {
    constexpr uint32_t Op = 0b0011'0110 << 24;

    TestAndBranch(Op, rt, Bit, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded tbz(ARMEmitter::Register rt, uint32_t Bit, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -32768 && Imm <= 32764 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0011'0110 << 24;
      TestAndBranch(Op, rt, Bit, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded tbz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::TEST_BRANCH});

    constexpr uint32_t Op = 0b0011'0110 << 24;
    TestAndBranch(Op, rt, Bit, 0);
    // Forward label doesn't know if it can encode until Bind.
    return BranchEncodeSucceeded::Success;
  }

  [[nodiscard]] BranchEncodeSucceeded tbz(ARMEmitter::Register rt, uint32_t Bit, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return tbz(rt, Bit, &Label->Backward);
    } else {
      return tbz(rt, Bit, &Label->Forward);
    }
  }

  void tbnz(ARMEmitter::Register rt, uint32_t Bit, uint32_t Imm) {
    constexpr uint32_t Op = 0b0011'0111 << 24;

    TestAndBranch(Op, rt, Bit, Imm);
  }

  [[nodiscard]] BranchEncodeSucceeded tbnz(ARMEmitter::Register rt, uint32_t Bit, const BackwardLabel* Label) {
    int32_t Imm = static_cast(Label->Location - GetCursorAddress());
    if (Imm >= -32768 && Imm <= 32764 && ((Imm & 0b11) == 0)) {
      constexpr uint32_t Op = 0b0011'0111 << 24;
      TestAndBranch(Op, rt, Bit, Imm >> 2);
      return BranchEncodeSucceeded::Success;
    }

    // Can't encode.
    return BranchEncodeSucceeded::Failure;
  }

  [[nodiscard]] BranchEncodeSucceeded tbnz(ARMEmitter::Register rt, uint32_t Bit, ForwardLabel* Label) {
    AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::TEST_BRANCH});
    constexpr uint32_t Op = 0b0011'0111 << 24;
    TestAndBranch(Op, rt, Bit, 0);
    // Forward label doesn't know if it can encode until Bind.
return BranchEncodeSucceeded::Success;
  }

  // Uses the backward half of the label if its Location is set, otherwise the forward half.
  [[nodiscard]] BranchEncodeSucceeded tbnz(ARMEmitter::Register rt, uint32_t Bit, BiDirectionalLabel* Label) {
    if (Label->Backward.Location) {
      return tbnz(rt, Bit, &Label->Backward);
    } else {
      return tbnz(rt, Bit, &Label->Forward);
    }
  }

private:
  // Conditional branch immediate
  // Op1 bit 24, 19-bit immediate from bit 5 (masked with 0x7'FFFF), Op0 bit 4, condition in the low bits.
  void Branch_Conditional(uint32_t Op, uint32_t Op1, uint32_t Op0, ARMEmitter::Condition Cond, uint32_t Imm) {
    uint32_t Instr = Op;

    Instr |= Op1 << 24;
    Instr |= (Imm & 0x7'FFFF) << 5;
    Instr |= Op0 << 4;
    Instr |= FEXCore::ToUnderlying(Cond);

    dc32(Instr);
  }

  // Unconditional branch register
  // Only the Rn field is added to the prebuilt opcode.
  void UnconditionalBranch(uint32_t Op, ARMEmitter::Register rn) {
    uint32_t Instr = Op;
    Instr |= Encode_rn(rn);
    dc32(Instr);
  }

  // Unconditional branch - immediate
  // 26-bit immediate in the low bits (masked with 0x3FF'FFFF).
  void UnconditionalBranch(uint32_t Op, uint32_t Imm) {
    uint32_t Instr = Op;
    Instr |= Imm & 0x3FF'FFFF;
    dc32(Instr);
  }

  // Compare and branch
  // SF (bit 31) is set for a 64-bit register; 19-bit immediate from bit 5.
  void CompareAndBranch(uint32_t Op, ARMEmitter::Size s, ARMEmitter::Register rt, uint32_t Imm) {
    const uint32_t SF = s == ARMEmitter::Size::i64Bit ? (1U << 31) : 0;

    uint32_t Instr = Op;

    Instr |= SF;
    Instr |= (Imm & 0x7'FFFF) << 5;
    Instr |= Encode_rt(rt);
    dc32(Instr);
  }

  // Test and branch - immediate
  // Bit number is split: bit 5 of `Bit` goes to instruction bit 31, bits 4:0 go to bits 23:19.
  // 14-bit immediate from bit 5 (masked with 0x3FFF).
  void TestAndBranch(uint32_t Op, ARMEmitter::Register rt, uint32_t Bit, uint32_t Imm) {
    uint32_t Instr = Op;

    Instr |= (Bit >> 5) << 31;
    Instr |= (Bit & 0b1'1111) << 19;
    Instr |= (Imm & 0x3FFF) << 5;
    Instr |= Encode_rt(rt);
    dc32(Instr);
  }

#ifndef INCLUDED_BY_EMITTER
}; // struct EmitterOps
} // namespace ARMEmitter
#endif

================================================
FILE: CodeEmitter/CodeEmitter/Buffer.h
================================================
// SPDX-License-Identifier: MIT
#pragma once

// NOTE(review): the four include targets are missing; they look stripped by extraction - confirm.
#include
#include
#include
#include

namespace ARMEmitter {
// Write cursor over a caller-provided byte range. The class stores the base pointer, the current
// write position and the size it was given; none of the emit functions below check the size.
class Buffer {
public:
  // Starts with no backing storage (null base, size 0).
  Buffer() {
    SetBuffer(nullptr, 0);
  }

  Buffer(uint8_t* Base, uint64_t BaseSize) {
    SetBuffer(Base, BaseSize);
  }

  // Rebinds the buffer and resets the cursor to the start of the new range.
  void SetBuffer(uint8_t* Base, uint64_t BaseSize) {
    BufferBase = Base;
    CurrentOffset = BufferBase;
    Size = BaseSize;
  }

  // Copies the object representation of Data to the cursor and advances by sizeof(Data).
  // NOTE(review): the template parameter list and the trait argument look stripped by extraction.
  template requires (std::is_trivially_copyable_v)
  void dcn(const T& Data) {
    std::memcpy(CurrentOffset, &Data, sizeof(Data));
    CurrentOffset += sizeof(Data);
  }

  // Fixed-width conveniences over dcn.
  void dc8(uint8_t Data) {
    dcn(Data);
  }

  void dc16(uint16_t Data) {
    dcn(Data);
  }

  void dc32(uint32_t Data) {
    dcn(Data);
  }

  void dc64(uint64_t Data) {
    dcn(Data);
  }

  // Writes the characters of String without its terminating NUL and advances by strlen(String).
  void EmitString(const char* String) {
    const auto StringLength = strlen(String);
    memcpy(CurrentOffset, String, StringLength);
    CurrentOffset += StringLength;
  }

  // Pads with zero bytes until the cursor address is a multiple of Size.
  // The parameter shadows the `Size` member inside this function. The mask `Size - 1` is only a
  // valid alignment test when Size is a power of two.
  void Align(size_t Size = 4) {
    // Align the buffer to provided size.
auto CurrentAlignment = reinterpret_cast(CurrentOffset) & (Size - 1); if (!CurrentAlignment) { return; } std::memset(CurrentOffset, 0, Size - CurrentAlignment); CurrentOffset += Size - CurrentAlignment; } template T GetCursorAddress() const { return reinterpret_cast(CurrentOffset); } static void ClearICache(void* Begin, std::size_t Length) { __builtin___clear_cache(static_cast(Begin), static_cast(Begin) + Length); } size_t GetCursorOffset() const { return static_cast(CurrentOffset - BufferBase); } uint8_t* GetBufferBase() const { return BufferBase; } void CursorIncrement(size_t Size) { CurrentOffset += Size; } void SetCursorOffset(size_t Offset) { CurrentOffset = BufferBase + Offset; } uint64_t GetBufferSize() const { return Size; } template size_t GetCursorOffsetFromAddress(const T* Address) const { return static_cast(reinterpret_cast(Address) - BufferBase); } protected: uint8_t* BufferBase; uint8_t* CurrentOffset; uint64_t Size; }; } // namespace ARMEmitter ================================================ FILE: CodeEmitter/CodeEmitter/Emitter.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Welcome to FEX-Emu's custom AArch64 emitter. * This was written specifically to avoid the performance cost of the vixl emitter. * * There are some specific design constraints in this design to target a couple features: * - High performance * - Low CPU cache performance hit * - Significantly reduced code footprint * - Low number of branches * * These requirements are mostly achieved by removing a bunch of developer conveniences * that vixl provides. The developer needs to take a lot of care to not shoot themselves in the foot. * * Misc design decisions: * - Registers are encoded as basic uint32_t enums. * - Converting between different registers is zero-cost. 
* - Passing around as arguments are as cheap as registers * - Contrast to vixl where every register requires living on the stack. * - Registers can get encoded in to instructions with a simple `BFM` instruction. * * - Instructions are very simply emitted, allowing direct inlining most of the time. * - These are simple enough that multiple back-to-back instructions get optimized to 128-bit load-store operations. * - Contrast to vixl where pretty much no instruction emitter gets inlined. * * - Instruction emitters are /mostly/ unsized. Most instructions take a size argument first, which gets encoded * directly in to the instruction. * - Contrast to vixl where the register arguments are how the instructions determine operating size. * - Size argument allows FEX to use `CSEL` to select a size at runtime, instead of branching. * - Some instructions are explicitly sized based on register type. Read comments in the respective `inl` files to * see why. * Some scalar/vector operations are an example of this. * * - Almost zero helper functions. * - Primary exception to this rule is load-store operations. These will use a helper to make * it easier to select the correct load-store instruction. Mostly because these are a nightmare selecting * the right instruction. */ namespace ARMEmitter { /* * This `Size` enum is used for most ALU operations. * These follow the AArch64 encoding style in most cases. */ enum class Size : uint32_t { i32Bit = 0, i64Bit, }; // This allows us to get the `Size` enum in bits. [[nodiscard]] constexpr size_t RegSizeInBits(Size size) { return size_t {32} << FEXCore::ToUnderlying(size); } /* This `SubRegSize` enum is used for most ASIMD operations. * These follow the AArch64 encoding style in most cases. */ enum class SubRegSize : uint32_t { i8Bit = 0b00, i16Bit = 0b01, i32Bit = 0b10, i64Bit = 0b11, i128Bit = 0b100, }; // This allows us to get the `SubRegSize` in bits. 
[[nodiscard]] constexpr size_t SubRegSizeInBits(SubRegSize size) { return size_t {8} << FEXCore::ToUnderlying(size); } // Many floating point operations constrain their element sizes to the // main three float sizes half, single, and double precision. This just // combines all the checks together for brevity. [[nodiscard]] constexpr bool IsStandardFloatSize(SubRegSize size) { return size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit; } /* This `ScalarRegSize` enum is used for most scalar float * operations. * * This is specifically duplicated from `SubRegSize` to have strongly * typed functions. * * `ScalarRegSize` specifically doesn't have `i128Bit` because scalar operations * can't operate at 128-bit. */ enum class ScalarRegSize : uint32_t { i8Bit = 0b00, i16Bit = 0b01, i32Bit = 0b10, i64Bit = 0b11, }; // This allows us to get the `ScalarRegSize` in bits. [[nodiscard]] constexpr size_t ScalarRegSizeInBits(ScalarRegSize size) { return size_t {8} << FEXCore::ToUnderlying(size); } /* This `VectorRegSizePair` union allows us to have an overlapping type * to select a scalar operation or a vector depending on which operation * we pass in. * Useful in FEX's vector operations that behave as scalar or vector * depending on various factors. But since the operation will have the sa,e * element size, we want to choose the operation more easily */ union VectorRegSizePair { ScalarRegSize Scalar; SubRegSize Vector; }; // This allows us to create a `VectorRegSizePair` union. [[nodiscard]] constexpr VectorRegSizePair ToVectorSizePair(SubRegSize size) { return VectorRegSizePair {.Vector = size}; } [[nodiscard]] constexpr VectorRegSizePair ToVectorSizePair(ScalarRegSize size) { return VectorRegSizePair {.Scalar = size}; } // This `ShiftType` enum is used for ALU shift-register encoded instructions. enum class ShiftType : uint32_t { LSL = 0, LSR, ASR, ROR, }; // This `ExtendedType` enum is used for ALU extended-register encoded instructions. 
enum class ExtendedType : uint32_t {
  UXTB = 0b000,
  UXTH = 0b001,
  UXTW = 0b010,
  UXTX = 0b011,
  SXTB = 0b100,
  SXTH = 0b101,
  SXTW = 0b110,
  SXTX = 0b111,
  // Aliases: LSL_32 and LSL_64 share the values of UXTW and UXTX respectively.
  LSL_32 = UXTW,
  LSL_64 = UXTX,
};

// This `Condition` enum is used for various conditional instructions.
// Only CC_EQ has an explicit value; the rest count up from it, so CC_NV ends up as 15.
enum class Condition : uint32_t {
  // Meaning: Int - Float
  CC_EQ = 0, // Equal - Equal
  CC_NE,     // Not Eq - Not Eq or unordered
  CC_CS,     // Carry set - Greater than, equal, or unordered
  CC_CC,     // Carry clear - Less than
  CC_MI,     // Minus/Negative - Less than
  CC_PL,     // Plus, positive or zero - GT, equal, or unordered
  CC_VS,     // Overflow - Unordered
  CC_VC,     // No Overflow - Ordered
  CC_HI,     // Unsigned higher - GT, or unordered
  CC_LS,     // Unsigned lower or same - LT or EQ
  CC_GE,     // Signed GT or EQ - GT or EQ
  CC_LT,     // Signed LT - LT or Unordered
  CC_GT,     // Signed GT - GT
  CC_LE,     // Signed LT or EQ - LT, EQ, or Unordered
  CC_AL,     // Always - Always
  CC_NV,     // Always - Always

  // Aliases
  CC_HS = CC_CS,
  CC_LO = CC_CC,
};

/*
 * This `StatusFlags` enum is used for conditional compare encoded instructions.
 * These directly encode to the `nzcv` flags: one bit per flag, with N as the
 * highest of the four bits and V as the lowest.
 */
enum class StatusFlags : uint32_t {
  None = 0,
  Flag_V = 0b0001,
  Flag_C = 0b0010,
  Flag_Z = 0b0100,
  Flag_N = 0b1000,
  // All four flags set.
  Flag_NZCV = Flag_N | Flag_Z | Flag_C | Flag_V,
};

/*
 * This `IndexType` enum is used for load-store instructions.
 * Not all load-store instructions use this, so the user needs to be careful.
 */
enum class IndexType {
  POST,
  OFFSET,
  PRE,
  UNPRIVILEGED,
};

// Used with adr and scalar + vector load/store variants to denote
// a modifier operation.
enum class SVEModType : uint8_t {
  MOD_UXTW,
  MOD_SXTW,
  MOD_LSL,
  MOD_NONE,
};

/* This `SVEMemOperand` class is used for the helper SVE load-store instructions.
 * Load-store instructions are quite expressive, so having a helper that handles these differences is worth it.
 *
 * It is a tagged union: `MemType` records which addressing form was constructed
 * and therefore which member of `MetaType` was initialized.
 *
 * NOTE(review): the first two constructors both default their second parameter,
 * so constructing from a single XRegister looks ambiguous between them; callers
 * presumably always pass the second argument - confirm.
 */
class SVEMemOperand final {
public:
  // Which addressing form this operand describes.
  enum class Type {
    ScalarPlusScalar,
    ScalarPlusImm,
    ScalarPlusVector,
    VectorPlusImm,
  };

  // Scalar base register plus scalar offset register.
  SVEMemOperand(XRegister rn, XRegister rm = XReg::zr)
    : rn {rn}
    , MemType {Type::ScalarPlusScalar}
    , MetaType {.ScalarScalarType {
        .rm = rm,
      }} {}
  // Scalar base register plus signed immediate.
  SVEMemOperand(XRegister rn, int32_t imm = 0)
    : rn {rn}
    , MemType {Type::ScalarPlusImm}
    , MetaType {.ScalarImmType {
        .Imm = imm,
      }} {}
  // Scalar base register plus vector of offsets, with an optional modifier and scale.
  SVEMemOperand(XRegister rn, ZRegister zm, SVEModType mod = SVEModType::MOD_NONE, uint8_t scale = 0)
    : rn {rn}
    , MemType {Type::ScalarPlusVector}
    , MetaType {.ScalarVectorType {
        .zm = zm,
        .mod = mod,
        .scale = scale,
      }} {}
  // Vector base plus unsigned immediate. The Z register's index is stored in `rn`.
  SVEMemOperand(ZRegister zn, uint32_t imm)
    : rn {Register {zn.Idx()}}
    , MemType {Type::VectorPlusImm}
    , MetaType {.VectorImmType {
        .Imm = imm,
      }} {}

  // Queries for the addressing form selected at construction.
  [[nodiscard]]
  bool IsScalarPlusScalar() const {
    return MemType == Type::ScalarPlusScalar;
  }
  [[nodiscard]]
  bool IsScalarPlusImm() const {
    return MemType == Type::ScalarPlusImm;
  }
  [[nodiscard]]
  bool IsScalarPlusVector() const {
    return MemType == Type::ScalarPlusVector;
  }
  [[nodiscard]]
  bool IsVectorPlusImm() const {
    return MemType == Type::VectorPlusImm;
  }

  // Per-form payload; only the member matching `MemType` is initialized.
  union Data {
    struct {
      Register rm;
    } ScalarScalarType;

    struct {
      int32_t Imm;
    } ScalarImmType;

    struct {
      ZRegister zm;
      SVEModType mod;
      uint8_t scale;
    } ScalarVectorType;

    struct {
      // rn will be a ZRegister
      uint32_t Imm;
    } VectorImmType;
  };

  Register rn;
  Type MemType;
  Data MetaType;
};

/* This `ExtendedMemOperand` class is used for the helper load-store instructions.
 * Load-store instructions are quite expressive, so having a helper that handles these differences is worth it.
 *
 * The addressing form is recorded in a `HeaderStruct` that is the first member of
 * every alternative of the `MetaType` union.
 *
 * NOTE(review): both constructors default every parameter after `rn`, so
 * constructing from a single XRegister looks ambiguous between them; callers
 * presumably always pass a second argument - confirm.
 */
class ExtendedMemOperand final {
public:
  // Base register plus extended/shifted offset register.
  ExtendedMemOperand(XRegister rn, XRegister rm = XReg::zr, ExtendedType Option = ExtendedType::LSL_64, uint32_t Shift = 0)
    : rn {rn}
    , MetaType {.Extended {
        .Header = {.MemType = TYPE_EXTENDED},
        .rm = rm,
        .Option = Option,
        .Shift = Shift,
      }} {}
  // Base register plus immediate, with the indexing mode given by `Index`.
  ExtendedMemOperand(XRegister rn, IndexType Index = IndexType::OFFSET, int32_t Imm = 0)
    : rn {rn}
    , MetaType {.ImmType {
        .Header = {.MemType = TYPE_IMM},
        .Index = Index,
        .Imm = Imm,
      }} {}

  Register rn;
  // Tag values stored in `HeaderStruct::MemType`.
  enum Type {
    TYPE_EXTENDED,
    TYPE_IMM,
  };
  struct HeaderStruct {
    Type MemType;
  };

  // Every alternative starts with the same `HeaderStruct`, so the tag sits at the
  // same position no matter which alternative was initialized.
  union {
    HeaderStruct Header;
    struct {
      HeaderStruct Header;
      Register rm;
      ExtendedType Option;
      uint32_t Shift;
    } Extended;
    struct {
      HeaderStruct Header;
      IndexType Index;
      int32_t Imm;
    } ImmType;
  } MetaType;
};

// Packs the five system-register encoding fields into one value:
// op0 at bit 19, op1 at bit 16, CRn at bit 12, CRm at bit 8 and op2 at bit 5.
// NOTE(review): the template parameter list is not visible in this copy of the
// file; the uses below pass five arguments, presumably in that field order - confirm.
template inline constexpr uint32_t GenSystemReg = op0 << 19 | op1 << 16 | CRn << 12 | CRm << 8 | op2 << 5;

// This `SystemRegister` enum is used for the mrs/msr instructions.
enum class SystemRegister : uint32_t {
  CTR_EL0 = GenSystemReg<0b11, 0b011, 0b0000, 0b0000, 0b001>,
  DCZID_EL0 = GenSystemReg<0b11, 0b011, 0b0000, 0b0000, 0b111>,
  TPIDR_EL0 = GenSystemReg<0b11, 0b011, 0b1101, 0b0000, 0b010>,
  RNDR = GenSystemReg<0b11, 0b011, 0b0010, 0b0100, 0b000>,
  RNDRRS = GenSystemReg<0b11, 0b011, 0b0010, 0b0100, 0b001>,
  NZCV = GenSystemReg<0b11, 0b011, 0b0100, 0b0010, 0b000>,
  FPCR = GenSystemReg<0b11, 0b011, 0b0100, 0b0100, 0b000>,
  TPIDRRO_EL0 = GenSystemReg<0b11, 0b011, 0b1101, 0b0000, 0b011>,
  CNTFRQ_EL0 = GenSystemReg<0b11, 0b011, 0b1110, 0b0000, 0b000>,
  CNTVCT_EL0 = GenSystemReg<0b11, 0b011, 0b1110, 0b0000, 0b010>,
  CNTVCTSS_EL0 = GenSystemReg<0b11, 0b011, 0b1110, 0b0000, 0b110>,
};

// Packs the three data-cache operation fields into one value:
// op1 at bit 16, CRm at bit 8 and op2 at bit 5.
// NOTE(review): the template parameter list is not visible in this copy of the file - confirm.
template inline constexpr uint32_t GenDCReg = op1 << 16 | CRm << 8 | op2 << 5;

// This `DataCacheOperation` enum is used for the dc instruction.
enum class DataCacheOperation : uint32_t {
  // Every entry is a `GenDCReg` packing of three encoding fields.
  // The groups after the first are labelled with the feature name they belong to.
  IVAC = GenDCReg<0b000, 0b0110, 0b001>,
  ISW = GenDCReg<0b000, 0b0110, 0b010>,
  CSW = GenDCReg<0b000, 0b1010, 0b010>,
  CISW = GenDCReg<0b000, 0b1110, 0b010>,
  ZVA = GenDCReg<0b011, 0b0100, 0b001>,
  CVAC = GenDCReg<0b011, 0b1010, 0b001>,
  CVAU = GenDCReg<0b011, 0b1011, 0b001>,
  CIVAC = GenDCReg<0b011, 0b1110, 0b001>,

  // MTE2
  IGVAC = GenDCReg<0b000, 0b0110, 0b011>,
  IGSW = GenDCReg<0b000, 0b0110, 0b100>,
  IGDVAC = GenDCReg<0b000, 0b0110, 0b101>,
  IGDSW = GenDCReg<0b000, 0b0110, 0b110>,
  CGSW = GenDCReg<0b000, 0b1010, 0b100>,
  CGDSW = GenDCReg<0b000, 0b1010, 0b110>,
  CIGSW = GenDCReg<0b000, 0b1110, 0b100>,
  CIGDSW = GenDCReg<0b000, 0b1110, 0b110>,

  // MTE
  GVA = GenDCReg<0b011, 0b0100, 0b011>,
  GZVA = GenDCReg<0b011, 0b0100, 0b100>,
  CGVAC = GenDCReg<0b011, 0b1010, 0b011>,
  CGDVAC = GenDCReg<0b011, 0b1010, 0b101>,
  CGVAP = GenDCReg<0b011, 0b1100, 0b011>,
  CGDVAP = GenDCReg<0b011, 0b1100, 0b101>,
  CGVADP = GenDCReg<0b011, 0b1101, 0b011>,
  CGDVADP = GenDCReg<0b011, 0b1101, 0b101>,
  CIGVAC = GenDCReg<0b011, 0b1110, 0b011>,
  CIGDVAC = GenDCReg<0b011, 0b1110, 0b101>,

  // DPB
  CVAP = GenDCReg<0b011, 0b1100, 0b001>,

  // DPB2
  CVADP = GenDCReg<0b011, 0b1101, 0b001>,
};

// Packs the two fields shared by the hint and barrier encodings into one value:
// CRm at bit 8 and op2 at bit 5.
// NOTE(review): the template parameter list is not visible in this copy of the
// file; the uses below pass two arguments, presumably <CRm, op2> - confirm.
template inline constexpr uint32_t GenHintBarrierReg = CRm << 8 | op2 << 5;

// This `HintRegister` enum is used for the hint instruction.
enum class HintRegister : uint32_t {
  NOP = GenHintBarrierReg<0b0000, 0b000>,
  YIELD = GenHintBarrierReg<0b0000, 0b001>,
  WFE = GenHintBarrierReg<0b0000, 0b010>,
  WFI = GenHintBarrierReg<0b0000, 0b011>,
  SEV = GenHintBarrierReg<0b0000, 0b100>,
  SEVL = GenHintBarrierReg<0b0000, 0b101>,
  DGH = GenHintBarrierReg<0b0000, 0b110>,
  // The only entry here whose first field is non-zero.
  CSDB = GenHintBarrierReg<0b0010, 0b100>,
};

// This `BarrierRegister` enum is used for the various barrier instructions.
enum class BarrierRegister : uint32_t {
  // Built with the same `GenHintBarrierReg` packing as `HintRegister`.
  CLREX = GenHintBarrierReg<0b0000, 0b010>,
  TCOMMIT = GenHintBarrierReg<0b0000, 0b011>,
  DSB = GenHintBarrierReg<0b0000, 0b100>,
  DMB = GenHintBarrierReg<0b0000, 0b101>,
  ISB = GenHintBarrierReg<0b0000, 0b110>,
  SB = GenHintBarrierReg<0b0000, 0b111>,
};

// This `BarrierScope` enum is used for the dsb/dmb instructions.
// Within each group the low two bits pick the variant: 0b01 for *LD, 0b10 for *ST, 0b11 for the unsuffixed form.
enum class BarrierScope : uint32_t {
  // Outer shareable
  OSHLD = 0b0001,
  OSHST = 0b0010,
  OSH = 0b0011,
  // Non shareable
  NSHLD = 0b0101,
  NSHST = 0b0110,
  NSH = 0b0111,
  // Inner shareable
  ISHLD = 0b1001,
  ISHST = 0b1010,
  ISH = 0b1011,
  // Full System visibility
  LD = 0b1101,
  ST = 0b1110,
  SY = 0b1111,
};

// This `Prefetch` enum is used for prefetch instructions.
// `Emitter::Encode_rt` has an overload taking this enum, which returns its underlying value unchanged.
enum class Prefetch : uint32_t {
  // Prefetch for load
  PLDL1KEEP = 0b00000,
  PLDL1STRM = 0b00001,
  PLDL2KEEP = 0b00010,
  PLDL2STRM = 0b00011,
  PLDL3KEEP = 0b00100,
  PLDL3STRM = 0b00101,

  // Preload instructions
  PLIL1KEEP = 0b01000,
  PLIL1STRM = 0b01001,
  PLIL2KEEP = 0b01010,
  PLIL2STRM = 0b01011,
  PLIL3KEEP = 0b01100,
  PLIL3STRM = 0b01101,

  // Preload for store
  PSTL1KEEP = 0b10000,
  PSTL1STRM = 0b10001,
  PSTL2KEEP = 0b10010,
  PSTL2STRM = 0b10011,
  PSTL3KEEP = 0b10100,
  PSTL3STRM = 0b10101,
};

// This `PredicatePattern` enum is used for some SVE instructions.
// The values are not contiguous: nothing is defined between SVE_VL256 (0b01101) and SVE_MUL4 (0b11101).
enum class PredicatePattern : uint32_t {
  SVE_POW2 = 0b00000,
  SVE_VL1 = 0b00001,
  SVE_VL2 = 0b00010,
  SVE_VL3 = 0b00011,
  SVE_VL4 = 0b00100,
  SVE_VL5 = 0b00101,
  SVE_VL6 = 0b00110,
  SVE_VL7 = 0b00111,
  SVE_VL8 = 0b01000,
  SVE_VL16 = 0b01001,
  SVE_VL32 = 0b01010,
  SVE_VL64 = 0b01011,
  SVE_VL128 = 0b01100,
  SVE_VL256 = 0b01101,
  SVE_MUL4 = 0b11101,
  SVE_MUL3 = 0b11110,
  SVE_ALL = 0b11111,
};

// Used with SVE FP immediate arithmetic instructions.
// Each enum has exactly two enumerators, valued 0 and 1 in declaration order;
// the names presumably spell the immediate they select (e.g. `_0_5` for 0.5) - confirm.
enum class SVEFAddSubImm : uint32_t {
  _0_5,
  _1_0,
};

enum class SVEFMulImm : uint32_t {
  _0_5,
  _2_0,
};

enum class SVEFMaxMinImm : uint32_t {
  _0_0,
  _1_0,
};

/* This `BackwardLabel` struct is used for retaining a location for PC-Relative instructions.
 * This is specifically a label for a target that is logically `below` an instruction that uses it.
 * Which means that a branch would jump backwards.
 *
 * `Location` starts out null; a null `Location` means the label has not been bound yet.
 */
struct BackwardLabel {
  uint8_t* Location {};
};

/* This `ForwardLabel` struct is used for retaining a location for PC-Relative instructions.
 * This is specifically a label for a target that is logically `above` an instruction that uses it.
 * Which means that a branch would jump forwards.
 *
 * Instead of a target it stores the instructions that refer to it, each tagged
 * with the kind of instruction so that it can be patched when the label is bound.
 */
struct ForwardLabel {
  // Kind of instruction recorded at a reference's location.
  enum class InstType {
    UNKNOWN,
    ADR,
    ADRP,
    B,
    BC,
    TEST_BRANCH,
    RELATIVE_LOAD,
    LONG_ADDRESS_GEN,
  };

  // One instruction that uses the label: where it is and what kind it is.
  struct Reference {
    uint8_t* Location {};
    InstType Type = InstType::UNKNOWN;
  };

  // The first element is stored separately to avoid allocations for simple cases
  Reference FirstInst;
  // Any further references after the first.
  // NOTE(review): the vector's element type is not visible in this copy of the file; presumably `Reference` - confirm.
  fextl::vector Insts;
};

/* This `BiDirectionalLabel` struct is used for retaining a location for PC-Relative instructions.
 * This is specifically a label for a target that is in either direction of an instruction that uses it.
 * Which means a branch could jump backwards or forwards depending on situation.
 */
struct BiDirectionalLabel {
  BackwardLabel Backward;
  ForwardLabel Forward;
};

// Records `Location` as one more user of the forward label `Label`.
// The first reference goes into the inline `FirstInst` slot (detected by its
// null `Location`); every later one is appended to `Insts`.
// `Location` is a named rvalue reference, so both branches copy from it.
static inline void AddLocationToLabel(ForwardLabel* Label, ForwardLabel::Reference&& Location) {
  if (Label->FirstInst.Location == nullptr) {
    Label->FirstInst = Location;
  } else {
    Label->Insts.push_back(Location);
  }
}

// Some FCMA ASIMD instructions support a rotation argument.
enum class Rotation : uint32_t {
  ROTATE_0 = 0b00,
  ROTATE_90 = 0b01,
  ROTATE_180 = 0b10,
  ROTATE_270 = 0b11,
};

// Concept for constraining some instructions to accept only an XRegister or WRegister.
// Particularly for operations that differ encodings depending on which one is used.
// NOTE(review): the template parameter list and the `std::is_same_v` arguments are
// not visible in this copy of the file - confirm against the original header.
template concept IsXOrWRegister = std::is_same_v || std::is_same_v;

// Concept for constraining some instructions to accept only a QRegister or DRegister.
template concept IsQOrDRegister = std::is_same_v || std::is_same_v; template concept IsLabel = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; enum class BranchEncodeSucceeded { Success, Failure, }; // Whether or not a given set of vector registers are sequential // in increasing order as far as the register file is concerned (modulo its size) // // For example, a set of registers like: // // v1, v2, v3 and // v31, v0, v1 // // would both be considered sequential sequences, and some instructions in particular // limit register lists to these kind of sequences. // template constexpr bool AreVectorsSequential(T first, const Args&... args) { // Ensure we always have a pair of registers to compare against. static_assert(sizeof...(args) >= 1, "Number of arguments must be greater than 1"); const auto fn = [](auto& lhs, const auto& rhs) { const auto result = ((lhs.Idx() + 1) % 32) == rhs.Idx(); lhs = rhs; return result; }; return (fn(first, args) && ...); } // Returns if the immediate can fit in to add/sub immediate instruction encodings. constexpr bool IsImmAddSub(uint64_t imm) { constexpr uint64_t U12Mask = 0xFFF; auto FitsWithin12Bits = [](uint64_t imm) { return (imm & ~U12Mask) == 0; }; // Can fit in to the instruction encoding: // - if only bits [11:0] are set. // - if only bits [23:12] are set. return FitsWithin12Bits(imm) || (FitsWithin12Bits(imm >> 12) && (imm & U12Mask) == 0); } // This is an emitter that is designed around the smallest code bloat as possible. // Eschewing most developer convenience in order to keep code as small as possible. // Choices: // - Size of ops passed as an argument rather than template to let the compiler use csel instead of branching. // - Registers are unsized so they can be passed in a GPR and not need conversion operations class Emitter : public ARMEmitter::Buffer { public: Emitter() = default; Emitter(uint8_t* Base, uint64_t BaseSize) : Buffer(Base, BaseSize) {} // Bind a backward label to an address. 
// Address that is bound is the current emitter location. [[nodiscard]] bool Bind(BackwardLabel* Label) { LOGMAN_THROW_A_FMT(Label->Location == nullptr, "Trying to bind a label twice"); Label->Location = GetCursorAddress(); // Always binds because it is only storing a location. return true; } [[nodiscard]] bool Bind(const ForwardLabel::Reference* Label) { uint8_t* CurrentAddress = GetCursorAddress(); // Patch up the instructions switch (Label->Type) { case ForwardLabel::InstType::ADR: { uint32_t* Instruction = reinterpret_cast(Label->Location); int64_t Imm = reinterpret_cast(CurrentAddress) - reinterpret_cast(Instruction); if (!IsADRRange(Imm)) { // Can't bind. return false; } uint32_t InstMask = 0b11 << 29 | 0b1111'1111'1111'1111'111 << 5; uint32_t Offset = static_cast(Imm) & 0x3F'FFFF; uint32_t Inst = *Instruction & ~InstMask; Inst |= (Offset & 0b11) << 29; Inst |= (Offset >> 2) << 5; *Instruction = Inst; break; } case ForwardLabel::InstType::ADRP: { uint32_t* Instruction = reinterpret_cast(Label->Location); int64_t Imm = reinterpret_cast(CurrentAddress) - reinterpret_cast(Instruction); if (!(IsADRPRange(Imm) && IsADRPAligned(Imm))) { // Can't bind. return false; } Imm >>= 12; uint32_t InstMask = 0b11 << 29 | 0b1111'1111'1111'1111'111 << 5; uint32_t Offset = static_cast(Imm) & 0x3F'FFFF; uint32_t Inst = *Instruction & ~InstMask; Inst |= (Offset & 0b11) << 29; Inst |= (Offset >> 2) << 5; *Instruction = Inst; break; } case ForwardLabel::InstType::B: { uint32_t* Instruction = reinterpret_cast(Label->Location); int64_t Imm = reinterpret_cast(CurrentAddress) - reinterpret_cast(Instruction); if (!(Imm >= -134217728 && Imm <= 134217724 && ((Imm & 0b11) == 0))) { // Can't bind. 
return false; } Imm >>= 2; uint32_t InstMask = 0x3FF'FFFF; uint32_t Offset = static_cast(Imm) & InstMask; uint32_t Inst = *Instruction & ~InstMask; Inst |= Offset; *Instruction = Inst; break; } case ForwardLabel::InstType::TEST_BRANCH: { uint32_t* Instruction = reinterpret_cast(Label->Location); int64_t Imm = reinterpret_cast(CurrentAddress) - reinterpret_cast(Instruction); if (!(Imm >= -32768 && Imm <= 32764 && ((Imm & 0b11) == 0))) { // Can't bind. return false; } Imm >>= 2; uint32_t InstMask = 0x3FFF; uint32_t Offset = static_cast(Imm) & InstMask; uint32_t Inst = *Instruction & ~(InstMask << 5); Inst |= Offset << 5; *Instruction = Inst; break; } case ForwardLabel::InstType::BC: case ForwardLabel::InstType::RELATIVE_LOAD: { uint32_t* Instruction = reinterpret_cast(Label->Location); int64_t Imm = reinterpret_cast(CurrentAddress) - reinterpret_cast(Instruction); if (!(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0))) { // Can't bind. return false; } Imm >>= 2; uint32_t InstMask = 0x7'FFFF; uint32_t Offset = static_cast(Imm) & InstMask; uint32_t Inst = *Instruction & ~(InstMask << 5); Inst |= Offset << 5; *Instruction = Inst; break; } case ForwardLabel::InstType::LONG_ADDRESS_GEN: { const auto* Instructions = reinterpret_cast(Label->Location); const auto ImmInstOne = reinterpret_cast(CurrentAddress) - reinterpret_cast(&Instructions[0]); const auto ImmInstTwo = reinterpret_cast(CurrentAddress) - reinterpret_cast(&Instructions[1]); const auto ImmInstThree = reinterpret_cast(CurrentAddress) - reinterpret_cast(&Instructions[2]); const auto OriginalOffset = GetCursorOffset(); const auto InstOffset = GetCursorOffsetFromAddress(Instructions); SetCursorOffset(InstOffset); // We encoded the destination register in to the first instruction space. // Read it back. 
ARMEmitter::Register DestReg(Instructions[0]); if (IsADRRange(ImmInstThree)) { // If within ADR range from the third instruction, then we can emit NOP+NOP+ADR nop(); nop(); adr(DestReg, static_cast(ImmInstThree) & 0x7FFF); } else if (IsADRPRange(ImmInstTwo)) { // If within ADRP range from the first instruction, then we are /definitely/ in range for the second instruction. // First check if we are in non-offset range for second instruction. if (IsADRPAligned(reinterpret_cast(CurrentAddress))) { // We can emit nop + nop + adrp nop(); nop(); adrp(DestReg, static_cast(ImmInstThree >> 12) & 0x7FFF); } else { // Not aligned, need nop + adrp + add nop(); adrp(DestReg, static_cast(ImmInstTwo >> 12) & 0x7FFF); add(ARMEmitter::Size::i64Bit, DestReg, DestReg, ImmInstTwo & 0xFFF); } } else { // Stinky path, we need to emit a movz+movk+movk sequence. movz(ARMEmitter::Size::i64Bit, DestReg, uint32_t(ImmInstOne >> 32) & 0x7FFF, 32); movk(ARMEmitter::Size::i64Bit, DestReg, uint32_t(ImmInstOne >> 16) & 0xFFFF, 16); movk(ARMEmitter::Size::i64Bit, DestReg, uint32_t(ImmInstOne) & 0xFFFF); } SetCursorOffset(OriginalOffset); break; } default: LOGMAN_MSG_A_FMT("Unexpected inst type in label fixup"); } return true; } // Bind a forward label to a location. // This walks all the instructions in the label's vector. // Then backpatching all instructions that have used the label. [[nodiscard]] bool Bind(ForwardLabel* Label) { bool Bound = true; if (Label->FirstInst.Location) { Bound &= Bind(&Label->FirstInst); } for (auto& Inst : Label->Insts) { Bound &= Bind(&Inst); } return Bound; } // Bind a bidirectional location to a location. // Binds both forwards and backwards depending on how the label was used. 
[[nodiscard]] bool Bind(BiDirectionalLabel* Label) { bool Bound = true; if (!Label->Backward.Location) { Bound &= Bind(&Label->Backward); } Bound &= Bind(&Label->Forward); return Bound; } static constexpr Condition InvertCondition(Condition cond) { // These behave as always, so it makes no sense to allow inverting these. LOGMAN_THROW_A_FMT(cond != Condition::CC_AL && cond != Condition::CC_NV, "Cannot invert CC_AL or CC_NV"); return static_cast(FEXCore::ToUnderlying(cond) ^ 1); } #include public: // This symbol is used to allow external tooling (IDEs, clang-format, ...) to process the included files individually: // If defined, the files will inject member functions into this class. // If not, the files will wrap the member functions in a class so that tooling will process them properly. #define INCLUDED_BY_EMITTER // TODO: Implement SME when it matters. #include #include #include #include #include #include #include #undef INCLUDED_BY_EMITTER protected: template uint32_t Encode_ra(T Reg) const { return Reg.Idx() << 10; } uint32_t Encode_ra(uint32_t Reg) const { return Reg << 10; } template uint32_t Encode_rt2(T Reg) const { return Reg.Idx() << 10; } uint32_t Encode_rt2(uint32_t Reg) const { return Reg << 10; } template uint32_t Encode_rm(T Reg) const { return Reg.Idx() << 16; } uint32_t Encode_rm(uint32_t Reg) const { return Reg << 16; } template uint32_t Encode_rs(T Reg) const { return Reg.Idx() << 16; } uint32_t Encode_rs(uint32_t Reg) const { return Reg << 16; } template uint32_t Encode_rn(T Reg) const { return Reg.Idx() << 5; } uint32_t Encode_rn(uint32_t Reg) const { return Reg << 5; } template uint32_t Encode_rd(T Reg) const { return Reg.Idx(); } uint32_t Encode_rd(uint32_t Reg) const { return Reg; } template uint32_t Encode_rt(T Reg) const { return Reg.Idx(); } uint32_t Encode_rt(Prefetch Reg) const { return FEXCore::ToUnderlying(Reg); } uint32_t Encode_rt(uint32_t Reg) const { return Reg; } template uint32_t Encode_pd(T Reg) const { return 
FEXCore::ToUnderlying(Reg); } }; } // namespace ARMEmitter ================================================ FILE: CodeEmitter/CodeEmitter/LoadstoreOps.inl ================================================ // SPDX-License-Identifier: MIT /* Load-store instruction emitters * * For GPR load-stores that take a `Size` argument as their first argument can be 32-bit or 64-bit. * For GPR load-stores that don't take a `Size` argument, then their operating size is determined by the name of the instruction. * * For Vector load-stores, most take a `SubRegSize` to determine the size of the elements getting loaded or stored. * Depending on the instruction it can be an single element or the full instruction, it depends on the instruction. * * There are some load-store helper functions which take a `ExtendedMemOperand` argument. * This helper will select the viable load-store that can work with the provided encapsulated arguments. */ #pragma once #ifndef INCLUDED_BY_EMITTER #include namespace ARMEmitter { struct EmitterOps : Emitter { #endif public: // Compare and swap pair void casp(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rs2, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { LOGMAN_THROW_A_FMT((rs.Idx() + 1) == rs2.Idx(), "These must be sequential"); LOGMAN_THROW_A_FMT((rt.Idx() + 1) == rt2.Idx(), "These must be sequential"); constexpr uint32_t Op = 0b0000'1000'001 << 21; AtomicOp(Op, s, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void caspa(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rs2, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { LOGMAN_THROW_A_FMT((rs.Idx() + 1) == rs2.Idx(), "These must be sequential"); LOGMAN_THROW_A_FMT((rt.Idx() + 1) == rt2.Idx(), "These must be sequential"); constexpr uint32_t Op = 0b0000'1000'001 << 21; AtomicOp(Op, s, 1, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void caspl(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rs2, 
ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { LOGMAN_THROW_A_FMT((rs.Idx() + 1) == rs2.Idx(), "These must be sequential"); LOGMAN_THROW_A_FMT((rt.Idx() + 1) == rt2.Idx(), "These must be sequential"); constexpr uint32_t Op = 0b0000'1000'001 << 21; AtomicOp(Op, s, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void caspal(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rs2, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { LOGMAN_THROW_A_FMT((rs.Idx() + 1) == rs2.Idx(), "These must be sequential"); LOGMAN_THROW_A_FMT((rt.Idx() + 1) == rt2.Idx(), "These must be sequential"); constexpr uint32_t Op = 0b0000'1000'001 << 21; AtomicOp(Op, s, 1, 1, rs, rt, ARMEmitter::Reg::r31, rn); } // Advanced SIMD load/store multiple structures template void ld1(T rt, Register rn) { constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld1(T rt, T rt2, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld1(T rt, T rt2, T rt3, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld1(T rt, T rt2, T rt3, T rt4, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st1(T rt, Register rn) { constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr 
uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st1(T rt, T rt2, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st1(T rt, T rt2, T rt3, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st1(T rt, T rt2, T rt3, T rt4, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld2(T rt, T rt2, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b1000 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st2(T rt, T rt2, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b1000 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld3(T rt, T rt2, T rt3, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st3(T rt, T rt2, T rt3, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be 
sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void ld4(T rt, T rt2, T rt3, T rt4, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } template void st4(T rt, T rt2, T rt3, T rt4, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1100'000 << 21; constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(Op, Opcode, rt, rn, Reg::r0); } // Advanced SIMD load/store multiple structures (post-indexed) static constexpr uint32_t ASIMDLoadstoreMultiplePost_Op = 0b0000'1100'100 << 21; template void ld1(T rt, Register rn, Register rm) { constexpr uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld1(T rt, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 16)) || (std::is_same_v && (PostOffset == 8)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld1(T rt, T rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld1(T rt, T rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 32)) || (std::is_same_v && 
(PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld1(T rt, T rt2, T rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld1(T rt, T rt2, T rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 48)) || (std::is_same_v && (PostOffset == 24)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld1(T rt, T rt2, T rt3, T rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld1(T rt, T rt2, T rt3, T rt4, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 64)) || (std::is_same_v && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st1(T rt, Register rn, Register rm) { constexpr uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st1(T rt, 
Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 16)) || (std::is_same_v && (PostOffset == 8)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0111 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st1(T rt, T rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st1(T rt, T rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 32)) || (std::is_same_v && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b1010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st1(T rt, T rt2, T rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st1(T rt, T rt2, T rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 48)) || (std::is_same_v && (PostOffset == 24)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0110 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st1(T rt, T rt2, T rt3, T rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must 
be sequential"); constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st1(T rt, T rt2, T rt3, T rt4, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 64)) || (std::is_same_v && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0010 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld2(T rt, T rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Opcode = 0b1000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld2(T rt, T rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 32)) || (std::is_same_v && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b1000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st2(T rt, T rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Opcode = 0b1000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st2(T rt, T rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 32)) || (std::is_same_v && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b1000 
<< 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld3(T rt, T rt2, T rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld3(T rt, T rt2, T rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 48)) || (std::is_same_v && (PostOffset == 24)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st3(T rt, T rt2, T rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st3(T rt, T rt2, T rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 48)) || (std::is_same_v && (PostOffset == 24)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0100 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void ld4(T rt, T rt2, T rt3, T rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void ld4(T rt, T rt2, T rt3, T rt4, Register rn, 
uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 64)) || (std::is_same_v && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } template void st4(T rt, T rt2, T rt3, T rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, rm); } template void st4(T rt, T rt2, T rt3, T rt4, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT((std::is_same_v && (PostOffset == 64)) || (std::is_same_v && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Opcode = 0b0000 << 12; ASIMDLoadStoreMultipleStructure(ASIMDLoadstoreMultiplePost_Op, Opcode, rt, rn, Reg::r31); } // ASIMD loadstore single template void st1(VRegister rt, uint32_t Index, Register rn) { constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void st2(VRegister rt, VRegister rt2, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 
0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void st3(VRegister rt, VRegister rt2, VRegister rt3, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void st4(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void ld1(VRegister rt, uint32_t Index, Register rn) { constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 
0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void ld1r(T rt, Register rn) { constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, rn, Reg::r0); } template void ld2(VRegister rt, VRegister rt2, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void ld2r(T rt, T rt2, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, rn, Reg::r0); } template void ld3(VRegister rt, VRegister rt2, VRegister rt3, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 
0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void ld3r(T rt, T rt2, T rt3, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, rn, Reg::r0); } template void ld4(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, uint32_t Index, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r0); } template void ld4r(T rt, T rt2, T rt3, T rt4, Register rn) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'000 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, rn, Reg::r0); } // ASIMD loadstore single post-indexed template void st1(VRegister rt, uint32_t Index, Register rn, Register rm) { constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 
0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void st1(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 1)) || (size == SubRegSize::i16Bit && (PostOffset == 2)) || (size == SubRegSize::i32Bit && (PostOffset == 4)) || (size == SubRegSize::i64Bit && (PostOffset == 8)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void st2(VRegister rt, VRegister rt2, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void st2(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 2)) || (size == SubRegSize::i16Bit && (PostOffset == 4)) || (size == SubRegSize::i32Bit && (PostOffset == 8)) || (size == SubRegSize::i64Bit && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 
0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void st3(VRegister rt, VRegister rt2, VRegister rt3, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void st3(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 3)) || (size == SubRegSize::i16Bit && (PostOffset == 6)) || (size == SubRegSize::i32Bit && (PostOffset == 8)) || (size == SubRegSize::i64Bit && (PostOffset == 24)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void st4(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 
0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void st4(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 4)) || (size == SubRegSize::i16Bit && (PostOffset == 8)) || (size == SubRegSize::i32Bit && (PostOffset == 16)) || (size == SubRegSize::i64Bit && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void ld1(VRegister rt, uint32_t Index, Register rn, Register rm) { constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void ld1(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 1)) || (size == SubRegSize::i16Bit && (PostOffset == 2)) || (size == SubRegSize::i32Bit && (PostOffset == 4)) || (size == SubRegSize::i64Bit && (PostOffset == 8)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 
0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void ld1r(VRegister rt, Register rn, Register rm) { constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, rm); } template void ld1r(VRegister rt, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 1)) || (size == SubRegSize::i16Bit && (PostOffset == 2)) || (size == SubRegSize::i32Bit && (PostOffset == 4)) || (size == SubRegSize::i64Bit && (PostOffset == 8)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, Reg::r31); } template void ld2(VRegister rt, VRegister rt2, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void ld2(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 2)) || (size == SubRegSize::i16Bit && (PostOffset == 4)) || (size == SubRegSize::i32Bit && (PostOffset == 8)) || (size == SubRegSize::i64Bit && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 
0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void ld2r(VRegister rt, VRegister rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, rm); } template void ld2r(VRegister rt, VRegister rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 2)) || (size == SubRegSize::i16Bit && (PostOffset == 4)) || (size == SubRegSize::i32Bit && (PostOffset == 8)) || (size == SubRegSize::i64Bit && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, Reg::r31); } template void ld3(VRegister rt, VRegister rt2, VRegister rt3, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 
0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void ld3(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 3)) || (size == SubRegSize::i16Bit && (PostOffset == 6)) || (size == SubRegSize::i32Bit && (PostOffset == 12)) || (size == SubRegSize::i64Bit && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void ld3r(VRegister rt, VRegister rt2, VRegister rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, rm); } template void ld3r(VRegister rt, VRegister rt2, VRegister rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 3)) || (size == SubRegSize::i16Bit && (PostOffset == 6)) || (size == SubRegSize::i32Bit && (PostOffset == 12)) || (size == SubRegSize::i64Bit && (PostOffset == 16)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, Reg::r31); } template void ld4(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 
0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 0; ASIMDSTLD(Op, Opcode, rt, Index, rn, rm); } template void ld4(VRegister rt, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 4)) || (size == SubRegSize::i16Bit && (PostOffset == 8)) || (size == SubRegSize::i32Bit && (PostOffset == 16)) || (size == SubRegSize::i64Bit && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = size == SubRegSize::i8Bit ? 0b000 : // Scale = 0 size == SubRegSize::i16Bit ? 0b010 : // Scale = 1 size == SubRegSize::i32Bit ? 0b100 : // Scale = 2 size == SubRegSize::i64Bit ? 0b100 : // Scale = 2 (Uses size to determine difference between 32-bit). 
0; ASIMDSTLD(Op, Opcode, rt, Index, rn, Reg::r31); } template void ld4r(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, rm); } template void ld4r(VRegister rt, VRegister rt2, VRegister rt3, VRegister rt4, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT((size == SubRegSize::i8Bit && (PostOffset == 4)) || (size == SubRegSize::i16Bit && (PostOffset == 8)) || (size == SubRegSize::i32Bit && (PostOffset == 16)) || (size == SubRegSize::i64Bit && (PostOffset == 32)), "Post-index offset needs to match number of elements times their size"); constexpr uint32_t Op = 0b0000'1101'100 << 21; constexpr uint32_t Opcode = 0b110; ASIMDSTLD(Op, Opcode, rt, 0, rn, Reg::r31); } // Advanced SIMD load/store single structure (post-indexed) template void st1(ARMEmitter::SubRegSize size, T rt, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == SubRegSizeInBits(size), "Post-Index size must match element size"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, 
"Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, ARMEmitter::Reg::r31, rn, rt.Q()); } template void ld1(ARMEmitter::SubRegSize size, T rt, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == SubRegSizeInBits(size), "Post-Index size must match element size"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, ARMEmitter::Reg::r31, rn, rt.Q()); } template void ld1r(ARMEmitter::SubRegSize size, T rt, ARMEmitter::Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(PostOffset == 1 || PostOffset == 2 || PostOffset == 4 || PostOffset == 8, "Index too large"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 
1 : 0; uint32_t R = 0; uint32_t opcode = 0b110; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, ARMEmitter::Reg::r31, rn, rt); } template void ld2r(SubRegSize size, T rt, T rt2, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); LOGMAN_THROW_A_FMT(PostOffset == 2 || PostOffset == 4 || PostOffset == 8 || PostOffset == 16, "Index too large"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 1 : 0; uint32_t R = 1; uint32_t opcode = 0b110; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt); } template void ld3r(SubRegSize size, T rt, T rt2, T rt3, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); LOGMAN_THROW_A_FMT(PostOffset == 3 || PostOffset == 6 || PostOffset == 12 || PostOffset == 24, "Index too large"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 1 : 0; uint32_t R = 0; uint32_t opcode = 0b111; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt); } template void ld4r(SubRegSize size, T rt, T rt2, T rt3, T rt4, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); LOGMAN_THROW_A_FMT(PostOffset == 4 || PostOffset == 8 || PostOffset == 16 || PostOffset == 32, "Index too large"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 
1 : 0; uint32_t R = 1; uint32_t opcode = 0b111; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt); } template void st2(SubRegSize size, T rt, T rt2, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 2), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void ld2(SubRegSize size, T rt, T rt2, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 2), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 
0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void st3(SubRegSize size, T rt, T rt2, T rt3, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 3), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, 
"Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void ld3(SubRegSize size, T rt, T rt2, T rt3, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 3), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void st4(SubRegSize size, T rt, T rt2, T rt3, T rt4, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 4), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), 
"rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void ld4(SubRegSize size, T rt, T rt2, T rt3, T rt4, uint32_t Index, Register rn, uint32_t PostOffset) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT((PostOffset * 8) == (SubRegSizeInBits(size) * 4), "Post-Index size must match element size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; 
Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, Reg::r31, rn, rt.Q()); } template void st1(ARMEmitter::SubRegSize size, T rt, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, rm, rn, rt.Q()); } template void ld1(ARMEmitter::SubRegSize size, T rt, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } 
else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt.Q()); } template void ld1r(SubRegSize size, T rt, Register rn, Register rm) { constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 1 : 0; uint32_t R = 0; uint32_t opcode = 0b110; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt); } template void ld2r(SubRegSize size, T rt, T rt2, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 1 : 0; uint32_t R = 1; uint32_t opcode = 0b110; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt); } template void ld3r(SubRegSize size, T rt, T rt2, T rt3, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 
1 : 0; uint32_t R = 0; uint32_t opcode = 0b111; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt); } template void ld4r(SubRegSize size, T rt, T rt2, T rt3, T rt4, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; constexpr uint32_t Q = std::is_same_v ? 1 : 0; uint32_t R = 1; uint32_t opcode = 0b111; uint32_t S = 0; uint32_t Size = FEXCore::ToUnderlying(size); ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt); } template void st2(SubRegSize size, T rt, T rt2, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, rm, rn, rt.Q()); } template void ld2(SubRegSize size, T rt, T rt2, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == 
SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2), "rt and rt2 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b000; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b010; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b100; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b100; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt.Q()); } template void st3(SubRegSize size, T rt, T rt2, T rt3, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 
0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, rm, rn, rt.Q()); } template void ld3(SubRegSize size, T rt, T rt2, T rt3, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3), "rt, rt2, and rt3 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 0; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt.Q()); } template void st4(SubRegSize size, T rt, T rt2, T rt3, T rt4, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == 
SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 0, R, opcode, S, Size, rm, rn, rt.Q()); } template void ld4(SubRegSize size, T rt, T rt2, T rt3, T rt4, uint32_t Index, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Incorrect size"); LOGMAN_THROW_A_FMT(AreVectorsSequential(rt, rt2, rt3, rt4), "rt, rt2, rt3, and rt4 must be sequential"); constexpr uint32_t Op = 0b0000'1101'1 << 23; uint32_t Q; uint32_t R = 1; uint32_t opcode; uint32_t S; uint32_t Size; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(Index < 16, "Index too large"); Q = Index >> 3; S = (Index >> 2) & 1; opcode = 0b001; Size = Index & 0b11; } else if (size == SubRegSize::i16Bit) { LOGMAN_THROW_A_FMT(Index < 8, "Index too large"); Q = Index >> 2; S = (Index >> 1) & 1; opcode = 0b011; Size = (Index & 0b1) << 1; } else if (size == SubRegSize::i32Bit) { LOGMAN_THROW_A_FMT(Index < 4, "Index too large"); Q = Index >> 1; S = Index & 1; opcode = 0b101; Size = 0b00; } else if (size == SubRegSize::i64Bit) { LOGMAN_THROW_A_FMT(Index < 2, "Index too large"); Q = Index; S = 0; opcode = 0b101; Size = 0b01; } else { LOGMAN_MSG_A_FMT("Unknown size"); FEX_UNREACHABLE; } ASIMDLoadStoreSinglePost(Op, Q, 1, R, opcode, S, Size, rm, rn, rt.Q()); } template void 
st1(T rt, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { st1(size, rt, Index, rn, PostOffset); }

// Forwarding overloads, immediate post-index forms.
//
// Each one passes its arguments straight through to the overload of the same
// name that takes `size` as its first argument. `size` is not a function
// parameter here.
// NOTE(review): `size` (and T) presumably come from the template parameter
// list, which is not visible in this copy of the file - confirm upstream.
template void ld1(T rt, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { ld1(size, rt, Index, rn, PostOffset); }
template void ld1r(T rt, ARMEmitter::Register rn, uint32_t PostOffset) { ld1r(size, rt, rn, PostOffset); }
template void ld2r(T rt, T rt2, ARMEmitter::Register rn, uint32_t PostOffset) { ld2r(size, rt, rt2, rn, PostOffset); }
template void ld3r(T rt, T rt2, T rt3, ARMEmitter::Register rn, uint32_t PostOffset) { ld3r(size, rt, rt2, rt3, rn, PostOffset); }
template void ld4r(T rt, T rt2, T rt3, T rt4, ARMEmitter::Register rn, uint32_t PostOffset) { ld4r(size, rt, rt2, rt3, rt4, rn, PostOffset); }
template void st2(T rt, T rt2, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { st2(size, rt, rt2, Index, rn, PostOffset); }
template void ld2(T rt, T rt2, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { ld2(size, rt, rt2, Index, rn, PostOffset); }
template void st3(T rt, T rt2, T rt3, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { st3(size, rt, rt2, rt3, Index, rn, PostOffset); }
template void ld3(T rt, T rt2, T rt3, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { ld3(size, rt, rt2, rt3, Index, rn, PostOffset); }
template void st4(T rt, T rt2, T rt3, T rt4, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { st4(size, rt, rt2, rt3, rt4, Index, rn, PostOffset); }
template void ld4(T rt, T rt2, T rt3, T rt4, uint32_t Index, ARMEmitter::Register rn, uint32_t PostOffset) { ld4(size, rt, rt2, rt3, rt4, Index, rn, PostOffset); }

// Forwarding overloads, register post-index forms (rm instead of PostOffset).
// Same pattern as above: each forwards to the overload taking `size` first.
template void st1(T rt, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { st1(size, rt, Index, rn, rm); }
template void ld1(T rt, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld1(size, rt, Index, rn, rm); }
template void ld1r(T rt, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld1r(size, rt, rn, rm); }
template void ld2r(T rt, T rt2, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld2r(size, rt, rt2, rn, rm); }
template void ld3r(T rt, T rt2, T rt3, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld3r(size, rt, rt2, rt3, rn, rm); }
template void ld4r(T rt, T rt2, T rt3, T rt4, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld4r(size, rt, rt2, rt3, rt4, rn, rm); }
template void st2(T rt, T rt2, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { st2(size, rt, rt2, Index, rn, rm); }
template void ld2(T rt, T rt2, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld2(size, rt, rt2, Index, rn, rm); }
template void st3(T rt, T rt2, T rt3, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { st3(size, rt, rt2, rt3, Index, rn, rm); }
template void ld3(T rt, T rt2, T rt3, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld3(size, rt, rt2, rt3, Index, rn, rm); }
template void st4(T rt, T rt2, T rt3, T rt4, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { st4(size, rt, rt2, rt3, rt4, Index, rn, rm); }
template void ld4(T rt, T rt2, T rt3, T rt4, uint32_t Index, ARMEmitter::Register rn, ARMEmitter::Register rm) { ld4(size, rt, rt2, rt3, rt4, Index, rn, rm); }

// Assembles one 32-bit instruction word from its fields and emits it with dc32.
//
//   Op     - base opcode bits; every other field is OR-ed on top of it.
//   Q      - placed at bit 30.
//   L      - placed at bit 22.
//   R      - placed at bit 21.
//   opcode - placed at bit 13.
//   S      - placed at bit 12.
//   size   - placed at bit 10.
//   rm/rn/rt - placed by Encode_rm / Encode_rn / Encode_rt.
//
// Fields are shifted but not masked, so callers must pass values that fit
// their field.
template void ASIMDLoadStoreSinglePost(uint32_t Op, uint32_t Q, uint32_t L, uint32_t R, uint32_t opcode, uint32_t S, uint32_t size,
                                       ARMEmitter::Register rm, ARMEmitter::Register rn, T rt) {
  // Per the message, only 128-bit and 64-bit vector register types are accepted.
  // NOTE(review): the std::is_same_v arguments are not visible in this copy - confirm upstream.
  LOGMAN_THROW_A_FMT((std::is_same_v || std::is_same_v), "Only supports 128-bit and "
                                                         "64-bit vector registers.");

  uint32_t Instr = Op;

  Instr |= Q << 30;
  Instr |= L << 22;
  Instr |= R << 21;
  Instr |= Encode_rm(rm);
  Instr |= opcode << 13;
  Instr |= S << 12;
  Instr |= size << 10;
  Instr |= Encode_rn(rn);
  Instr |= Encode_rt(rt);
  dc32(Instr);
}

// Loadstore exclusive pair
void stxp(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) {
  constexpr uint32_t Op =
0b1000'1000'001 << 21; AtomicOp(Op, s, 0, 0, rs, rt, rt2, rn); } void stlxp(ARMEmitter::Size s, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b1000'1000'001 << 21; AtomicOp(Op, s, 0, 1, rs, rt, rt2, rn); } void ldxp(ARMEmitter::Size s, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b1000'1000'001 << 21; AtomicOp(Op, s, 1, 0, ARMEmitter::Reg::r31, rt, rt2, rn); } void ldaxp(ARMEmitter::Size s, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b1000'1000'001 << 21; AtomicOp(Op, s, 1, 1, ARMEmitter::Reg::r31, rt, rt2, rn); } // Loadstore exclusive register void stxrb(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void stlxrb(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void ldxrb(ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 1, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldaxrb(ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 1, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stxrh(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void stlxrh(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 
21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void ldxrh(ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 1, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldaxrh(ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 1, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stxr(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 0, 0, rs, rt, ARMEmitter::WReg::w31, rn); } void stlxr(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 0, 1, rs, rt, ARMEmitter::WReg::w31, rn); } void ldxr(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 1, 0, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void ldaxr(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 1, 1, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void stxr(ARMEmitter::XRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 0, 0, rs, rt, ARMEmitter::XReg::x31, rn); } void stlxr(ARMEmitter::WRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 0, 1, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn); } void ldxr(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 
0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 1, 0, ARMEmitter::XReg::x31, rt, ARMEmitter::XReg::x31, rn); } void ldaxr(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 1, 1, ARMEmitter::XReg::x31, rt, ARMEmitter::XReg::x31, rn); } void stxr(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, size, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void stlxr(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, size, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void ldxr(ARMEmitter::SubRegSize size, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, size, 1, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldaxr(ARMEmitter::SubRegSize size, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'000 << 21; SubAtomicOp(Op, size, 1, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } // Load/store ordered static constexpr uint32_t LoadStoreOrdered_Op = 0b0000'1000'100 << 21; void stllrb(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i8Bit, 0, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stlrb(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i8Bit, 0, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldlarb(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i8Bit, 1, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldarb(ARMEmitter::Register rt, ARMEmitter::Register rn) { 
SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i8Bit, 1, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stllrh(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i16Bit, 0, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stlrh(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i16Bit, 0, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldlarh(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i16Bit, 1, 0, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void ldarh(ARMEmitter::Register rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i16Bit, 1, 1, ARMEmitter::Reg::r31, rt, ARMEmitter::Reg::r31, rn); } void stllr(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i32Bit, 0, 0, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void stlr(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i32Bit, 0, 1, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void ldlar(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i32Bit, 1, 0, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void ldar(ARMEmitter::WRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i32Bit, 1, 1, ARMEmitter::WReg::w31, rt, ARMEmitter::WReg::w31, rn); } void stllr(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i64Bit, 0, 0, ARMEmitter::XReg::x31, rt, ARMEmitter::XReg::x31, rn); } void stlr(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i64Bit, 0, 1, ARMEmitter::XReg::x31, 
rt, ARMEmitter::XReg::x31, rn); } void ldlar(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i64Bit, 1, 0, ARMEmitter::XReg::x31, rt, ARMEmitter::XReg::x31, rn); } void ldar(ARMEmitter::XRegister rt, ARMEmitter::Register rn) { SubAtomicOp(LoadStoreOrdered_Op, ARMEmitter::SubRegSize::i64Bit, 1, 1, ARMEmitter::XReg::x31, rt, ARMEmitter::XReg::x31, rn); } // Compare and swap void casb(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void caslb(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void casab(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 1, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void casalb(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i8Bit, 1, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void cash(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn); } void caslh(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn); } void casah(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) { constexpr uint32_t Op = 0b0000'1000'101 << 21; SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 1, 0, rs, rt, 
ARMEmitter::Reg::r31, rn);
}
// Compare-and-swap emitters for halfword (casalh), 32-bit (WRegister) and
// 64-bit (XRegister) operands. The base opcode is always
// 0b0000'1000'101 << 21; the two flags given to SubAtomicOp follow the name:
//   cas = (0, 0), casl = (0, 1), casa = (1, 0), casal = (1, 1).
// The typed W/X overloads convert rs and rt with .R() before forwarding them.
void casalh(ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i16Bit, 1, 1, rs, rt, ARMEmitter::Reg::r31, rn);
}
void cas(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 0, 0, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casl(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 0, 1, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casa(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 1, 0, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casal(ARMEmitter::WRegister rs, ARMEmitter::WRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i32Bit, 1, 1, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void cas(ARMEmitter::XRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 0, 0, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casl(ARMEmitter::XRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 0, 1, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casa(ARMEmitter::XRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 1, 0, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
void casal(ARMEmitter::XRegister rs, ARMEmitter::XRegister rt, ARMEmitter::Register rn) {
  constexpr
uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, ARMEmitter::SubRegSize::i64Bit, 1, 1, rs.R(), rt.R(), ARMEmitter::Reg::r31, rn);
}
// Size-parameterised compare-and-swap emitters: the caller supplies the
// SubRegSize instead of selecting it through the overload. The flag pairs are
// cas = (0, 0), casl = (0, 1), casa = (1, 0), casal = (1, 1).
void cas(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, size, 0, 0, rs, rt, ARMEmitter::Reg::r31, rn);
}
void casl(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, size, 0, 1, rs, rt, ARMEmitter::Reg::r31, rn);
}
void casa(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, size, 1, 0, rs, rt, ARMEmitter::Reg::r31, rn);
}
void casal(ARMEmitter::SubRegSize size, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rn) {
  constexpr uint32_t Op = 0b0000'1000'101 << 21;
  SubAtomicOp(Op, size, 1, 1, rs, rt, ARMEmitter::Reg::r31, rn);
}

// LDAPR/STLR unscaled immediate
// Every emitter asserts that Imm lies in [-256, 255], then passes the base
// opcode 0b0001'1001'000 << 21, the access size, a 2-bit selector and the
// immediate masked to 9 bits (0x1'FF) to SubAtomicImm. Selector values used
// here: 0b00 for stlur*, 0b01 for ldapur*, 0b11 for ldapurs* with a WRegister
// destination.
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void stlurb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i8Bit, 0b00, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapurb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i8Bit, 0b01, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapursb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i8Bit, 0b11, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapursb(ARMEmitter::XRegister
rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i8Bit, 0b10, rt, rn, static_cast(Imm) & 0x1'FF);
}
// Halfword and 32-bit unscaled-immediate emitters. Each asserts that Imm is in
// [-256, 255] and forwards the base opcode 0b0001'1001'000 << 21, the access
// size, a 2-bit selector and Imm masked to 9 bits to SubAtomicImm.
// Selector values: 0b00 = stlur*, 0b01 = ldapur*, 0b11 = ldapurs* into a
// WRegister, 0b10 = ldapurs* into an XRegister.
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void stlurh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i16Bit, 0b00, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapurh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i16Bit, 0b01, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapursh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i16Bit, 0b11, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapursh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i16Bit, 0b10, rt, rn, static_cast(Imm) & 0x1'FF);
}
void stlur(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i32Bit, 0b00, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapur(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i32Bit, 0b01, rt, rn,
static_cast(Imm) & 0x1'FF);
}
// Remaining unscaled-immediate emitters: same [-256, 255] assertion, same base
// opcode and 9-bit immediate mask; selector 0b10 for ldapursw, 0b00 for stlur
// and 0b01 for ldapur on 64-bit registers.
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void ldapursw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i32Bit, 0b10, rt, rn, static_cast(Imm) & 0x1'FF);
}
void stlur(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i64Bit, 0b00, rt, rn, static_cast(Imm) & 0x1'FF);
}
void ldapur(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1001'000 << 21;
  SubAtomicImm(Op, ARMEmitter::SubRegSize::i64Bit, 0b01, rt, rn, static_cast(Imm) & 0x1'FF);
}

// Load register literal
// Immediate forms: Imm must lie in [-1048576, 1048575] and be a multiple of 4
// (both enforced by the assertion). The value handed to LoadStoreLiteral is
// Imm >> 2 masked to 19 bits (0x7'FFFF). The destination register type selects
// the top opcode byte: W = 0b0001'1000, S = 0b0001'1100, X = 0b0101'1000,
// D = 0b0101'1100.
void ldr(ARMEmitter::WRegister rt, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::SRegister rt, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::XRegister rt, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0101'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::DRegister rt, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0101'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void
ldrs(ARMEmitter::WRegister rt, int32_t Imm) {
  // Same range/alignment assertion as the other literal loads, but with
  // opcode byte 0b1001'1000 (the value the ldrsw label overloads also use).
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1001'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
// Literal load into a Q register: opcode byte 0b1001'1100.
void ldr(ARMEmitter::QRegister rt, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1001'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
// Literal prefetch: opcode byte 0b1101'1000; the prefetch operation takes the
// place of the destination register.
void prfm(ARMEmitter::Prefetch prfop, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1101'1000 << 24;
  LoadStoreLiteral(Op, prfop, static_cast(Imm >> 2) & 0x7'FFFF);
}
// BackwardLabel forms: the offset is computed as Label->Location minus the
// current cursor address, then checked and encoded exactly like the immediate
// forms (range [-1048576, 1048575], multiple of 4, Imm >> 2 masked to 19 bits).
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void ldr(ARMEmitter::WRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::SRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0001'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::XRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0101'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::DRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 &&
((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0101'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
// BackwardLabel literal forms: the offset is Label->Location minus the current
// cursor address; it must be in [-1048576, 1048575] and a multiple of 4, and is
// encoded as Imm >> 2 masked to 19 bits. Opcode bytes: ldrsw = 0b1001'1000,
// ldr (Q) = 0b1001'1100, prfm = 0b1101'1000.
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void ldrsw(ARMEmitter::XRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1001'1000 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void ldr(ARMEmitter::QRegister rt, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1001'1100 << 24;
  LoadStoreLiteral(Op, rt, static_cast(Imm >> 2) & 0x7'FFFF);
}
void prfm(ARMEmitter::Prefetch prfop, const BackwardLabel* Label) {
  int32_t Imm = static_cast(Label->Location - GetCursorAddress());
  LOGMAN_THROW_A_FMT(Imm >= -1048576 && Imm <= 1048575 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b1101'1000 << 24;
  LoadStoreLiteral(Op, prfop, static_cast(Imm >> 2) & 0x7'FFFF);
}
// ForwardLabel literal forms: the current cursor address is recorded on the
// label as a RELATIVE_LOAD reference and the instruction is emitted with an
// immediate field of 0.
// NOTE(review): presumably the recorded reference is patched once the label is
// bound; that code is not visible here - confirm.
void ldr(ARMEmitter::WRegister rt, ForwardLabel* Label) {
  AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD});
  constexpr uint32_t Op = 0b0001'1000 << 24;
  LoadStoreLiteral(Op, rt, 0);
}
void ldr(ARMEmitter::SRegister rt, ForwardLabel* Label) {
  AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD});
  constexpr uint32_t Op = 0b0001'1100 << 24;
  LoadStoreLiteral(Op, rt, 0);
}
void ldr(ARMEmitter::XRegister rt, ForwardLabel* Label) {
  AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD});
  constexpr uint32_t Op = 0b0101'1000 << 24;
  LoadStoreLiteral(Op, rt, 0);
}
void
ldr(ARMEmitter::DRegister rt, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD}); constexpr uint32_t Op = 0b0101'1100 << 24; LoadStoreLiteral(Op, rt, 0); } void ldrsw(ARMEmitter::XRegister rt, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD}); constexpr uint32_t Op = 0b1001'1000 << 24; LoadStoreLiteral(Op, rt, 0); } void ldr(ARMEmitter::QRegister rt, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD}); constexpr uint32_t Op = 0b1001'1100 << 24; LoadStoreLiteral(Op, rt, 0); } void prfm(ARMEmitter::Prefetch prfop, ForwardLabel* Label) { AddLocationToLabel(Label, ForwardLabel::Reference {.Location = GetCursorAddress(), .Type = ForwardLabel::InstType::RELATIVE_LOAD}); constexpr uint32_t Op = 0b1101'1000 << 24; LoadStoreLiteral(Op, prfop, 0); } void ldr(ARMEmitter::WRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else { ldr(rt, &Label->Forward); } } void ldr(ARMEmitter::SRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else { ldr(rt, &Label->Forward); } } void ldr(ARMEmitter::XRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else { ldr(rt, &Label->Forward); } } void ldr(ARMEmitter::DRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else { ldr(rt, &Label->Forward); } } void ldrs(ARMEmitter::WRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else { ldr(rt, &Label->Forward); } } void ldr(ARMEmitter::QRegister rt, BiDirectionalLabel* Label) { if (Label->Backward.Location) { ldr(rt, &Label->Backward); } else 
{
    ldr(rt, &Label->Forward);
  }
}
// Prefetch through a bidirectional label: uses the backward half when it has a
// location, otherwise the forward half.
void prfm(ARMEmitter::Prefetch prfop, BiDirectionalLabel* Label) {
  if (Label->Backward.Location) {
    prfm(prfop, &Label->Backward);
  } else {
    prfm(prfop, &Label->Forward);
  }
}

// Memory copy/set
// Each cpyf* wrapper takes (rd, rs, rn) and forwards fixed selectors to
// MemoryCopyAndMemorySet: the first two arguments are 0, 0; the third is
// 0b00 for cpyfp*, 0b01 for cpyfm* and 0b10 for cpyfe*; the fourth is a 4-bit
// option value that depends on the name suffix (none = 0b0000, wt = 0b0001,
// rt = 0b0010, t = 0b0011, wn = 0b0100, wtwn = 0b0101). Registers are passed
// on in the order (rs, rn, rd).
void cpyfp(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0000, rs, rn, rd);
}
void cpyfm(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0000, rs, rn, rd);
}
void cpyfe(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0000, rs, rn, rd);
}
void cpyfpwt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0001, rs, rn, rd);
}
void cpyfmwt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0001, rs, rn, rd);
}
void cpyfewt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0001, rs, rn, rd);
}
void cpyfprt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0010, rs, rn, rd);
}
void cpyfmrt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0010, rs, rn, rd);
}
void cpyfert(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0010, rs, rn, rd);
}
void cpyfpt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0011, rs, rn, rd);
}
void cpyfmt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0011, rs, rn, rd);
}
void cpyfet(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0011, rs, rn, rd);
}
void cpyfpwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0100, rs, rn, rd);
}
void cpyfmwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0100, rs, rn, rd);
}
void cpyfewn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0100, rs, rn, rd);
}
void cpyfpwtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0101, rs,
rn, rd);
}
// cpyf* wrappers with option values 0b0101 through 0b1010. Each takes
// (rd, rs, rn) and calls MemoryCopyAndMemorySet(0, 0, stage, option, rs, rn, rd)
// where stage is 0b00 for cpyfp*, 0b01 for cpyfm*, 0b10 for cpyfe*.
// Option by suffix: wtwn = 0b0101, rtwn = 0b0110, twn = 0b0111, rn = 0b1000,
// wtrn = 0b1001, rtrn = 0b1010.
void cpyfmwtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0101, rs, rn, rd);
}
void cpyfewtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0101, rs, rn, rd);
}
void cpyfprtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0110, rs, rn, rd);
}
void cpyfmrtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0110, rs, rn, rd);
}
void cpyfertwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0110, rs, rn, rd);
}
void cpyfptwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b0111, rs, rn, rd);
}
void cpyfmtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b0111, rs, rn, rd);
}
void cpyfetwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b0111, rs, rn, rd);
}
void cpyfprn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1000, rs, rn, rd);
}
void cpyfmrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1000, rs, rn, rd);
}
void cpyfern(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1000, rs, rn, rd);
}
void cpyfpwtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1001, rs, rn, rd);
}
void cpyfmwtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1001, rs, rn, rd);
}
void cpyfewtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1001, rs, rn, rd);
}
void cpyfprtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1010, rs, rn, rd);
}
void cpyfmrtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1010, rs, rn, rd);
}
void cpyfertrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1010, rs, rn, rd);
}
void cpyfptrn(Register rd, Register rs, Register rn) {
MemoryCopyAndMemorySet(0, 0, 0b00, 0b1011, rs, rn, rd);
}
// cpyf* wrappers with option values 0b1011 through 0b1111. Each takes
// (rd, rs, rn) and calls MemoryCopyAndMemorySet(0, 0, stage, option, rs, rn, rd)
// where stage is 0b00 for cpyfp*, 0b01 for cpyfm*, 0b10 for cpyfe*.
// Option by suffix: trn = 0b1011, n = 0b1100, wtn = 0b1101, rtn = 0b1110,
// tn = 0b1111.
void cpyfmtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1011, rs, rn, rd);
}
void cpyfetrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1011, rs, rn, rd);
}
void cpyfpn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1100, rs, rn, rd);
}
void cpyfmn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1100, rs, rn, rd);
}
void cpyfen(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1100, rs, rn, rd);
}
void cpyfpwtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1101, rs, rn, rd);
}
void cpyfmwtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1101, rs, rn, rd);
}
void cpyfewtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1101, rs, rn, rd);
}
void cpyfprtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1110, rs, rn, rd);
}
void cpyfmrtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1110, rs, rn, rd);
}
void cpyfertn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1110, rs, rn, rd);
}
void cpyfptn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b00, 0b1111, rs, rn, rd);
}
void cpyfmtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b01, 0b1111, rs, rn, rd);
}
void cpyfetn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 0, 0b10, 0b1111, rs, rn, rd);
}
// set* wrappers. Note the parameter order is (rd, rn, rs), unlike the copy
// wrappers. The third argument is always 0b11; the fourth is 0b0000 for setp,
// 0b0100 for setm and 0b1000 for sete.
void setp(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0000, rs, rn, rd);
}
void setm(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0100, rs, rn, rd);
}
void sete(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b1000, rs, rn, rd);
}
void setpt(Register rd, Register rn,
Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0001, rs, rn, rd);
}
// set* wrappers take (rd, rn, rs) and call
// MemoryCopyAndMemorySet(0, 0, 0b11, option, rs, rn, rd). The upper two option
// bits are 0b00 for setp*, 0b01 for setm*, 0b10 for sete*; the lower two are
// 0b01 for the t suffix, 0b10 for n and 0b11 for tn.
void setmt(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0101, rs, rn, rd);
}
void setet(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b1001, rs, rn, rd);
}
void setpn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0010, rs, rn, rd);
}
void setmn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0110, rs, rn, rd);
}
void seten(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b1010, rs, rn, rd);
}
void setptn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0011, rs, rn, rd);
}
void setmtn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b0111, rs, rn, rd);
}
void setetn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 0, 0b11, 0b1011, rs, rn, rd);
}
// cpy* wrappers take (rd, rs, rn) and pass 0, 1 as the first two arguments
// (the cpyf* wrappers pass 0, 0). The third argument is 0b00 for cpyp*,
// 0b01 for cpym*, 0b10 for cpye*; the fourth is the option value for the
// suffix (none = 0b0000, wt = 0b0001, rt = 0b0010).
void cpyp(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0000, rs, rn, rd);
}
void cpym(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0000, rs, rn, rd);
}
void cpye(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0000, rs, rn, rd);
}
void cpypwt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0001, rs, rn, rd);
}
void cpymwt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0001, rs, rn, rd);
}
void cpyewt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0001, rs, rn, rd);
}
void cpyprt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0010, rs, rn, rd);
}
void cpymrt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0010, rs, rn, rd);
}
void cpyert(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0010, rs, rn, rd);
}
void cpypt(Register rd, Register rs, Register rn)
{
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0011, rs, rn, rd);
}
// cpy* wrappers with option values 0b0011 through 0b1000. Each takes
// (rd, rs, rn) and calls MemoryCopyAndMemorySet(0, 1, stage, option, rs, rn, rd)
// where stage is 0b00 for cpyp*, 0b01 for cpym*, 0b10 for cpye*.
// Option by suffix: t = 0b0011, wn = 0b0100, wtwn = 0b0101, rtwn = 0b0110,
// twn = 0b0111, rn = 0b1000.
void cpymt(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0011, rs, rn, rd);
}
void cpyet(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0011, rs, rn, rd);
}
void cpypwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0100, rs, rn, rd);
}
void cpymwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0100, rs, rn, rd);
}
void cpyewn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0100, rs, rn, rd);
}
void cpypwtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0101, rs, rn, rd);
}
void cpymwtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0101, rs, rn, rd);
}
void cpyewtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0101, rs, rn, rd);
}
void cpyprtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0110, rs, rn, rd);
}
void cpymrtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0110, rs, rn, rd);
}
void cpyertwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0110, rs, rn, rd);
}
void cpyptwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b0111, rs, rn, rd);
}
void cpymtwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b0111, rs, rn, rd);
}
void cpyetwn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b0111, rs, rn, rd);
}
void cpyprn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1000, rs, rn, rd);
}
void cpymrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1000, rs, rn, rd);
}
void cpyern(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1000, rs, rn, rd);
}
void cpypwtrn(Register rd, Register rs,
Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1001, rs, rn, rd);
}
// cpy* wrappers with option values 0b1001 through 0b1110. Each takes
// (rd, rs, rn) and calls MemoryCopyAndMemorySet(0, 1, stage, option, rs, rn, rd)
// where stage is 0b00 for cpyp*, 0b01 for cpym*, 0b10 for cpye*.
// Option by suffix: wtrn = 0b1001, rtrn = 0b1010, trn = 0b1011, n = 0b1100,
// wtn = 0b1101, rtn = 0b1110.
void cpymwtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1001, rs, rn, rd);
}
void cpyewtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1001, rs, rn, rd);
}
void cpyprtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1010, rs, rn, rd);
}
void cpymrtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1010, rs, rn, rd);
}
void cpyertrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1010, rs, rn, rd);
}
void cpyptrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1011, rs, rn, rd);
}
void cpymtrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1011, rs, rn, rd);
}
void cpyetrn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1011, rs, rn, rd);
}
void cpypn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1100, rs, rn, rd);
}
void cpymn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1100, rs, rn, rd);
}
void cpyen(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1100, rs, rn, rd);
}
void cpypwtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1101, rs, rn, rd);
}
void cpymwtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1101, rs, rn, rd);
}
void cpyewtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1101, rs, rn, rd);
}
void cpyprtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1110, rs, rn, rd);
}
void cpymrtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1110, rs, rn, rd);
}
void cpyertn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1110, rs, rn, rd);
}
void cpyptn(Register
rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b00, 0b1111, rs, rn, rd);
}
// Last two cpy* wrappers for option value 0b1111 (tn suffix).
void cpymtn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b01, 0b1111, rs, rn, rd);
}
void cpyetn(Register rd, Register rs, Register rn) {
  MemoryCopyAndMemorySet(0, 1, 0b10, 0b1111, rs, rn, rd);
}
// setg* wrappers take (rd, rn, rs) and call
// MemoryCopyAndMemorySet(0, 1, 0b11, option, rs, rn, rd). The upper two option
// bits are 0b00 for setgp*, 0b01 for setgm*, 0b10 for setge*; the lower two are
// 0b00 for no suffix, 0b01 for t, 0b10 for n and 0b11 for tn.
void setgp(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0000, rs, rn, rd);
}
void setgm(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0100, rs, rn, rd);
}
void setge(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b1000, rs, rn, rd);
}
void setgpt(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0001, rs, rn, rd);
}
void setgmt(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0101, rs, rn, rd);
}
void setget(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b1001, rs, rn, rd);
}
void setgpn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0010, rs, rn, rd);
}
void setgmn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0110, rs, rn, rd);
}
void setgen(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b1010, rs, rn, rd);
}
void setgptn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0011, rs, rn, rd);
}
void setgmtn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b0111, rs, rn, rd);
}
void setgetn(Register rd, Register rn, Register rs) {
  MemoryCopyAndMemorySet(0, 1, 0b11, 0b1011, rs, rn, rd);
}

// Loadstore no-allocate pair
// 32-bit store pair: Imm must be in [-256, 252] and a multiple of 4; it is
// encoded as Imm >> 2 masked to 7 bits.
// NOTE(review): `static_cast` appears without a target type in this text; the
// template argument looks lost in extraction - confirm against the original file.
void stnp(ARMEmitter::WRegister rt, ARMEmitter::WRegister rt2, ARMEmitter::Register rn, int32_t Imm) {
  LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large");
  constexpr uint32_t Op = 0b0010'1000'00 << 22;
  LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 2) & 0b111'1111);
}
// Load/store emitter wrappers: no-allocate pairs, register pairs, unscaled / post- and
// pre-indexed / unprivileged immediates, and LSE atomic memory operations. The existing
// section markers (`// Loadstore register pair ...`, `// 8-bit`, ...) are embedded in the
// long lines below; the code text is kept exactly as found.
//
// NOTE(review): in this copy `static_cast(...)` appears without a target type,
// `template void ...` appears without a parameter list even though `Index` is used, and
// calls such as `stp_w(...)` / `stXrb(...)` carry no template arguments. It looks like
// angle-bracketed text was lost -- confirm against upstream before relying on these lines.
//
// ldnp / stnp (W, S, X, D, Q register pairs)
//   - Assert that Imm is in range and aligned: W/S [-256, 252] multiple of 4,
//     X/D [-512, 504] multiple of 8, Q [-1024, 1008] multiple of 16.
//   - `Op` is the fixed opcode pattern shifted to bit 22; the load forms differ from the
//     store forms only in the lowest bit of that pattern.
//   - The offset handed to LoadStoreNoAllocate is (Imm >> 2|3|4) masked to 7 bits.
//   - The assertion text says "Unscaled offset too large" but the same check also fires
//     for misaligned offsets.
//
// stp / ldp / ldpsw (W and X registers)
//   - Same range/alignment assertions and 7-bit masking as above; ldpsw takes X registers
//     but uses the 4-byte range and shift.
//   - `Op` additionally ORs in a two-bit value at bit 23 chosen from Index:
//     POST -> 0b01, PRE -> 0b11, OFFSET -> 0b10; any other Index selects -1.
//   - The S / D / Q overloads only forward rt.V(), rt2.V() to stp_w/ldp_w, stp_x/ldp_x and
//     stp_q/ldp_q respectively.
//
// stur* / ldur*, the `requires (Index == POST || Index == PRE)` str* / ldr* templates, and
// sttr* / ldtr*
//   - All are one-line forwards to helpers not defined in this section:
//     byte -> stXrb / ldXrb / ldXrsb, halfword -> stXrh / ldXrh / ldXrsh,
//     W/S/X/D/Q register -> stXr / ldXr, sign-extending word load -> ldXrsw.
//   - prfum asserts Imm in [-256, 255], static_asserts Index == IndexType::OFFSET, and
//     calls LoadStoreImm(Op, o2, prfop, rn, Imm) with o2 == 0b00.
//
// Atomic memory operations
//   - Every wrapper is a single LoadStoreAtomicLSE(size, a, l, swp, op, rs, rt, rn) call
//     (argument names here are descriptive only; the callee is not defined in this section):
//       a   is 1 exactly for the names carrying the `a` suffix,
//       l   is 1 exactly for the names carrying the `l` suffix,
//       swp is 1 only for the ldswp* forms,
//       op  is 0b000 add (and swp), 0b001 clr, 0b010 eor, 0b011 set,
//              0b100 smax, 0b101 smin, 0b110 umax, 0b111 umin.
//   - The st* forms take no rt parameter and pass Reg::zr in its place.
//   - The access size is either the explicit SubRegSize parameter, or fixed by the name /
//     register type: `b` suffix -> i8Bit, `h` suffix -> i16Bit, WRegister -> i32Bit,
//     XRegister -> i64Bit.
void ldnp(ARMEmitter::WRegister rt, ARMEmitter::WRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b0010'1000'01 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 2) & 0b111'1111); } void stnp(ARMEmitter::SRegister rt, ARMEmitter::SRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b0010'1100'00 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 2) & 0b111'1111); } void ldnp(ARMEmitter::SRegister rt, ARMEmitter::SRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b0010'1100'01 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 2) & 0b111'1111); } void stnp(ARMEmitter::XRegister rt, ARMEmitter::XRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b1010'1000'00 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 3) & 0b111'1111); } void ldnp(ARMEmitter::XRegister rt, ARMEmitter::XRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b1010'1000'01 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 3) & 0b111'1111); } void stnp(ARMEmitter::DRegister rt, ARMEmitter::DRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b0110'1100'00 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 3) & 0b111'1111); } void ldnp(ARMEmitter::DRegister rt, 
ARMEmitter::DRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b0110'1100'01 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 3) & 0b111'1111); } void stnp(ARMEmitter::QRegister rt, ARMEmitter::QRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -1024 && Imm <= 1008 && ((Imm & 0b1111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b1010'1100'00 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 4) & 0b111'1111); } void ldnp(ARMEmitter::QRegister rt, ARMEmitter::QRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -1024 && Imm <= 1008 && ((Imm & 0b1111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = 0b1010'1100'01 << 22; LoadStoreNoAllocate(Op, rt, rt2, rn, static_cast(Imm >> 4) & 0b111'1111); } // Loadstore register pair post-indexed // Loadstore register pair offset // Loadstore register pair pre-indexed template void stp(ARMEmitter::WRegister rt, ARMEmitter::WRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0010'1000'00 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 2) & 0b111'1111); } template void ldp(ARMEmitter::WRegister rt, ARMEmitter::WRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0010'1000'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? 
(0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 2) & 0b111'1111); } template void ldpsw(ARMEmitter::XRegister rt, ARMEmitter::XRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0110'1000'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 2) & 0b111'1111); } template void stp(ARMEmitter::XRegister rt, ARMEmitter::XRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b1010'1000'00 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 3) & 0b111'1111); } template void ldp(ARMEmitter::XRegister rt, ARMEmitter::XRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b1010'1000'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? 
(0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 3) & 0b111'1111); } template void stp(ARMEmitter::SRegister rt, ARMEmitter::SRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { stp_w(rt.V(), rt2.V(), rn, Imm); } template void ldp(ARMEmitter::SRegister rt, ARMEmitter::SRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { ldp_w(rt.V(), rt2.V(), rn, Imm); } template void stp(ARMEmitter::DRegister rt, ARMEmitter::DRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { stp_x(rt.V(), rt2.V(), rn, Imm); } template void ldp(ARMEmitter::DRegister rt, ARMEmitter::DRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { ldp_x(rt.V(), rt2.V(), rn, Imm); } template void stp(ARMEmitter::QRegister rt, ARMEmitter::QRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { stp_q(rt.V(), rt2.V(), rn, Imm); } template void ldp(ARMEmitter::QRegister rt, ARMEmitter::QRegister rt2, ARMEmitter::Register rn, int32_t Imm = 0) { ldp_q(rt.V(), rt2.V(), rn, Imm); } // Loadstore register unscaled immediate void sturb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrb(rt, rn, Imm); } void ldurb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrb(rt, rn, Imm); } void sturb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrb(rt, rn, Imm); } void ldurb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrb(rt, rn, Imm); } void ldursb(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } void ldursb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } void sturh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrh(rt, rn, Imm); } void ldurh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrh(rt, rn, Imm); } void sturh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrh(rt, rn, Imm); } void ldurh(ARMEmitter::VRegister rt, 
ARMEmitter::Register rn, int32_t Imm = 0) { ldXrh(rt, rn, Imm); } void ldursh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } void ldursh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } void stur(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldur(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } void stur(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldur(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } void ldursw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsw(rt, rn, Imm); } void stur(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldur(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } void stur(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldur(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } void stur(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldur(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } template void prfum(ARMEmitter::Prefetch prfop, ARMEmitter::Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); static_assert(Index == IndexType::OFFSET, "Doesn't support another index type"); constexpr uint32_t Op = 0b1111'1000'10 << 22; constexpr uint32_t o2 = 0b00; LoadStoreImm(Op, o2, prfop, rn, Imm); } // Loadstore register immediate post-indexed // Loadstore register immediate pre-indexed template requires (Index == IndexType::POST || Index == IndexType::PRE) void strb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { 
stXrb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void strb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrsb(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrsb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void strh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void strh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrsh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrsh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == 
IndexType::PRE) void str(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void str(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldr(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldrsw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsw(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void str(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void str(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldr(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void str(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } template requires (Index == IndexType::POST || Index == IndexType::PRE) void ldr(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } // Loadstore register unprivileged void sttrb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrb(rt, rn, Imm); } void 
ldtrb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrb(rt, rn, Imm); } void ldtrsb(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } void ldtrsb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsb(rt, rn, Imm); } void sttrh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXrh(rt, rn, Imm); } void ldtrh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrh(rt, rn, Imm); } void ldtrsh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } void ldtrsh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsh(rt, rn, Imm); } void sttr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldtr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } void ldtrsw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXrsw(rt, rn, Imm); } void sttr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { stXr(rt, rn, Imm); } void ldtr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm = 0) { ldXr(rt, rn, Imm); } // Atomic memory operations void stadd(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b000, rs, Reg::zr, rn); } void staddl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b000, rs, Reg::zr, rn); } void stadda(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b000, rs, Reg::zr, rn); } void staddal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b000, rs, Reg::zr, rn); } void stclr(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b001, rs, Reg::zr, rn); } void stclrl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b001, rs, Reg::zr, rn); } void stclra(SubRegSize size, 
Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b001, rs, Reg::zr, rn); } void stclral(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b001, rs, Reg::zr, rn); } void stset(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b011, rs, Reg::zr, rn); } void stsetl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b011, rs, Reg::zr, rn); } void stseta(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b011, rs, Reg::zr, rn); } void stsetal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b011, rs, Reg::zr, rn); } void steor(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b010, rs, Reg::zr, rn); } void steorl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b010, rs, Reg::zr, rn); } void steora(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b010, rs, Reg::zr, rn); } void steoral(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b010, rs, Reg::zr, rn); } void stsmax(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b100, rs, Reg::zr, rn); } void stsmaxl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b100, rs, Reg::zr, rn); } void stsmaxa(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b100, rs, Reg::zr, rn); } void stsmaxal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b100, rs, Reg::zr, rn); } void stsmin(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b101, rs, Reg::zr, rn); } void stsminl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b101, rs, Reg::zr, rn); } void stsmina(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b101, rs, Reg::zr, rn); } void 
stsminal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b101, rs, Reg::zr, rn); } void stumax(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b110, rs, Reg::zr, rn); } void stumaxl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b110, rs, Reg::zr, rn); } void stumaxa(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b110, rs, Reg::zr, rn); } void stumaxal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b110, rs, Reg::zr, rn); } void stumin(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b111, rs, Reg::zr, rn); } void stuminl(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b111, rs, Reg::zr, rn); } void stumina(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b111, rs, Reg::zr, rn); } void stuminal(SubRegSize size, Register rs, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b111, rs, Reg::zr, rn); } void ldswp(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 1, 0b000, rs, rt, rn); } void ldswpl(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 1, 0b000, rs, rt, rn); } void ldswpa(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 1, 0b000, rs, rt, rn); } void ldswpal(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 1, 0b000, rs, rt, rn); } void ldadd(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b000, rs, rt, rn); } void ldadda(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b000, rs, rt, rn); } void ldaddl(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b000, rs, rt, rn); } void ldaddal(SubRegSize size, Register rs, Register 
rt, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b000, rs, rt, rn); } void ldclr(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b001, rs, rt, rn); } void ldclra(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b001, rs, rt, rn); } void ldclrl(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b001, rs, rt, rn); } void ldclral(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b001, rs, rt, rn); } void ldset(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b011, rs, rt, rn); } void ldseta(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b011, rs, rt, rn); } void ldsetl(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b011, rs, rt, rn); } void ldsetal(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b011, rs, rt, rn); } void ldeor(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 0, 0, 0b010, rs, rt, rn); } void ldeora(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 0, 0, 0b010, rs, rt, rn); } void ldeorl(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 0, 1, 0, 0b010, rs, rt, rn); } void ldeoral(SubRegSize size, Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(size, 1, 1, 0, 0b010, rs, rt, rn); } // 8-bit void ldaddb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b000, rs, rt, rn); } void ldclrb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b001, rs, rt, rn); } void ldeorb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b010, rs, rt, rn); } void ldsetb(Register rs, Register rt, 
Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b011, rs, rt, rn); } void ldsmaxb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b100, rs, rt, rn); } void ldsminb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b101, rs, rt, rn); } void ldumaxb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b110, rs, rt, rn); } void lduminb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 0, 0b111, rs, rt, rn); } void ldswpb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 0, 1, 0b000, rs, rt, rn); } void ldaddlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b000, rs, rt, rn); } void ldclrlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b001, rs, rt, rn); } void ldeorlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b010, rs, rt, rn); } void ldsetlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b011, rs, rt, rn); } void ldsmaxlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b100, rs, rt, rn); } void ldsminlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b101, rs, rt, rn); } void ldumaxlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b110, rs, rt, rn); } void lduminlb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 0, 0b111, rs, rt, rn); } void ldswplb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 0, 1, 1, 0b000, rs, rt, rn); } void ldaddab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b000, rs, rt, rn); } void ldclrab(Register rs, Register rt, Register rn) { 
LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b001, rs, rt, rn); } void ldeorab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b010, rs, rt, rn); } void ldsetab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b011, rs, rt, rn); } void ldsmaxab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b100, rs, rt, rn); } void ldsminab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b101, rs, rt, rn); } void ldumaxab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b110, rs, rt, rn); } void lduminab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 0, 0b111, rs, rt, rn); } void ldswpab(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 1, 0b000, rs, rt, rn); } void ldaddalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b000, rs, rt, rn); } void ldclralb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b001, rs, rt, rn); } void ldeoralb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b010, rs, rt, rn); } void ldsetalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b011, rs, rt, rn); } void ldsmaxalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b100, rs, rt, rn); } void ldsminalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b101, rs, rt, rn); } void ldumaxalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b110, rs, rt, rn); } void lduminalb(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 0, 0b111, rs, rt, rn); } void ldswpalb(Register rs, Register rt, Register rn) { 
LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 1, 1, 0b000, rs, rt, rn); } // 16-bit void ldaddh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b000, rs, rt, rn); } void ldclrh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b001, rs, rt, rn); } void ldeorh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b010, rs, rt, rn); } void ldseth(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b011, rs, rt, rn); } void ldsmaxh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b100, rs, rt, rn); } void ldsminh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b101, rs, rt, rn); } void ldumaxh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b110, rs, rt, rn); } void lduminh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 0, 0b111, rs, rt, rn); } void ldswph(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 0, 1, 0b000, rs, rt, rn); } void ldaddlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b000, rs, rt, rn); } void ldclrlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b001, rs, rt, rn); } void ldeorlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b010, rs, rt, rn); } void ldsetlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b011, rs, rt, rn); } void ldsmaxlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b100, rs, rt, rn); } void ldsminlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b101, rs, rt, rn); } void ldumaxlh(Register rs, Register rt, Register rn) { 
LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b110, rs, rt, rn); } void lduminlh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 0, 0b111, rs, rt, rn); } void ldswplh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 0, 1, 1, 0b000, rs, rt, rn); } void ldaddah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b000, rs, rt, rn); } void ldclrah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b001, rs, rt, rn); } void ldeorah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b010, rs, rt, rn); } void ldsetah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b011, rs, rt, rn); } void ldsmaxah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b100, rs, rt, rn); } void ldsminah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b101, rs, rt, rn); } void ldumaxah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b110, rs, rt, rn); } void lduminah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 0, 0b111, rs, rt, rn); } void ldswpah(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 1, 0b000, rs, rt, rn); } void ldaddalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b000, rs, rt, rn); } void ldclralh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b001, rs, rt, rn); } void ldeoralh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b010, rs, rt, rn); } void ldsetalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b011, rs, rt, rn); } void ldsmaxalh(Register rs, Register rt, Register rn) { 
LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b100, rs, rt, rn); } void ldsminalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b101, rs, rt, rn); } void ldumaxalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b110, rs, rt, rn); } void lduminalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 0, 0b111, rs, rt, rn); } void ldswpalh(Register rs, Register rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 1, 1, 0b000, rs, rt, rn); } // 32-bit void ldadd(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b000, rs, rt, rn); } void ldclr(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b001, rs, rt, rn); } void ldeor(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b010, rs, rt, rn); } void ldset(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b011, rs, rt, rn); } void ldsmax(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b100, rs, rt, rn); } void ldsmin(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b101, rs, rt, rn); } void ldumax(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b110, rs, rt, rn); } void ldumin(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 0, 0b111, rs, rt, rn); } void ldswp(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 0, 1, 0b000, rs, rt, rn); } void ldaddl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b000, rs, rt, rn); } void ldclrl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b001, rs, rt, rn); } void ldeorl(WRegister rs, WRegister rt, 
Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b010, rs, rt, rn); } void ldsetl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b011, rs, rt, rn); } void ldsmaxl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b100, rs, rt, rn); } void ldsminl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b101, rs, rt, rn); } void ldumaxl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b110, rs, rt, rn); } void lduminl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 0, 0b111, rs, rt, rn); } void ldswpl(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 0, 1, 1, 0b000, rs, rt, rn); } void ldadda(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b000, rs, rt, rn); } void ldclra(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b001, rs, rt, rn); } void ldeora(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b010, rs, rt, rn); } void ldseta(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b011, rs, rt, rn); } void ldsmaxa(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b100, rs, rt, rn); } void ldsmina(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b101, rs, rt, rn); } void ldumaxa(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b110, rs, rt, rn); } void ldumina(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 0, 0b111, rs, rt, rn); } void ldswpa(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 1, 0b000, rs, rt, rn); } void ldaddal(WRegister rs, 
WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b000, rs, rt, rn); } void ldclral(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b001, rs, rt, rn); } void ldeoral(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b010, rs, rt, rn); } void ldsetal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b011, rs, rt, rn); } void ldsmaxal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b100, rs, rt, rn); } void ldsminal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b101, rs, rt, rn); } void ldumaxal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b110, rs, rt, rn); } void lduminal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 0, 0b111, rs, rt, rn); } void ldswpal(WRegister rs, WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 1, 1, 0b000, rs, rt, rn); } // 64-bit void ldadd(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b000, rs, rt, rn); } void ldclr(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b001, rs, rt, rn); } void ldeor(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b010, rs, rt, rn); } void ldset(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b011, rs, rt, rn); } void ldsmax(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b100, rs, rt, rn); } void ldsmin(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b101, rs, rt, rn); } void ldumax(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b110, rs, rt, rn); } void 
ldumin(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 0, 0b111, rs, rt, rn); } void ldswp(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 1, 0b000, rs, rt, rn); } void ldaddl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b000, rs, rt, rn); } void ldclrl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b001, rs, rt, rn); } void ldeorl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b010, rs, rt, rn); } void ldsetl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b011, rs, rt, rn); } void ldsmaxl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b100, rs, rt, rn); } void ldsminl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b101, rs, rt, rn); } void ldumaxl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b110, rs, rt, rn); } void lduminl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 0, 0b111, rs, rt, rn); } void ldswpl(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 1, 1, 0b000, rs, rt, rn); } void ldadda(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b000, rs, rt, rn); } void ldclra(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b001, rs, rt, rn); } void ldeora(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b010, rs, rt, rn); } void ldseta(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b011, rs, rt, rn); } void ldsmaxa(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b100, rs, rt, rn); } 
void ldsmina(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b101, rs, rt, rn); } void ldumaxa(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b110, rs, rt, rn); } void ldumina(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 0, 0b111, rs, rt, rn); } void ldswpa(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 1, 0b000, rs, rt, rn); } void ldaddal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b000, rs, rt, rn); } void ldclral(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b001, rs, rt, rn); } void ldeoral(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b010, rs, rt, rn); } void ldsetal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b011, rs, rt, rn); } void ldsmaxal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b100, rs, rt, rn); } void ldsminal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b101, rs, rt, rn); } void ldumaxal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b110, rs, rt, rn); } void lduminal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 0, 0b111, rs, rt, rn); } void ldswpal(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 1, 1, 0b000, rs, rt, rn); } void ldaprb(WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i8Bit, 1, 0, 1, 0b100, WReg::w31, rt, rn); } void ldaprh(WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i16Bit, 1, 0, 1, 0b100, WReg::w31, rt, rn); } void ldapr(WRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i32Bit, 1, 0, 1, 0b100, WReg::w31, rt, rn); } void 
ldapr(XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 1, 0, 1, 0b100, XReg::x31, rt, rn); } void st64bv0(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 1, 0b010, rs, rt, rn); } void st64bv(XRegister rs, XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 1, 0b011, rs, rt, rn); } void st64b(XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 1, 0b001, XReg::x31, rt, rn); } void ld64b(XRegister rt, Register rn) { LoadStoreAtomicLSE(SubRegSize::i64Bit, 0, 0, 1, 0b101, XReg::x31, rt, rn); } // Loadstore register-register offset void strb(ARMEmitter::Register rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, bool Shift = false) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrb(ARMEmitter::Register rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, bool Shift = false) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrsb(ARMEmitter::XRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, bool Shift = false) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b10, rt, rn, rm, Option, Shift ? 
1 : 0); } void ldrsb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, bool Shift = false) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b11, rt, rn, rm, Option, Shift ? 1 : 0); } void strh(ARMEmitter::Register rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrh(ARMEmitter::Register rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrsh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b10, rt, rn, rm, Option, Shift ? 
1 : 0); } void ldrsh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b11, rt, rn, rm, Option, Shift ? 1 : 0); } void str(ARMEmitter::WRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 2, "Unsupported shift amount"); constexpr uint32_t Op = 0b1011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 2, "Unsupported shift amount: {}", Shift); constexpr uint32_t Op = 0b1011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrsw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 2, "Unsupported shift amount"); constexpr uint32_t Op = 0b1011'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b10, rt, rn, rm, Option, Shift ? 
1 : 0); } void str(ARMEmitter::XRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 3, "Unsupported shift amount"); constexpr uint32_t Op = 0b1111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 3, "Unsupported shift amount"); constexpr uint32_t Op = 0b1111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void prfm(ARMEmitter::Prefetch prfop, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 3, "Unsupported shift amount"); constexpr uint32_t Op = 0b1111'1000'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b10, prfop, rn, rm, Option, Shift ? 
1 : 0); } void strb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, 0); } void ldrb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); constexpr uint32_t Op = 0b0011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, 0); } void strh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldrh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 1, "Unsupported shift amount"); constexpr uint32_t Op = 0b0111'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void str(ARMEmitter::SRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 2, "Unsupported shift amount"); constexpr uint32_t Op = 0b1011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 
1 : 0); } void ldr(ARMEmitter::SRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 2, "Unsupported shift amount"); constexpr uint32_t Op = 0b1011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void str(ARMEmitter::DRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 3, "Unsupported shift amount"); constexpr uint32_t Op = 0b1111'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b00, rt, rn, rm, Option, Shift ? 1 : 0); } void ldr(ARMEmitter::DRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 3, "Unsupported shift amount"); constexpr uint32_t Op = 0b1111'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b01, rt, rn, rm, Option, Shift ? 1 : 0); } void str(ARMEmitter::QRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 4, "Unsupported shift amount"); constexpr uint32_t Op = 0b0011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b10, rt, rn, rm, Option, Shift ? 
1 : 0); } void ldr(ARMEmitter::QRegister rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { LOGMAN_THROW_A_FMT((FEXCore::ToUnderlying(Option) & 0b010) == 0b010, "Unsupported Extendtype"); LOGMAN_THROW_A_FMT(Shift == 0 || Shift == 4, "Unsupported shift amount"); constexpr uint32_t Op = 0b0011'1100'001 << 21 | (0b10 << 10); LoadStoreRegisterOffset(Op, 0b11, rt, rn, rm, Option, Shift ? 1 : 0); } void strb(ARMEmitter::Register rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { strb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { strb(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { sturb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrb(ARMEmitter::Register rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrb(rt, MemSrc.rn); } else { if 
(MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { ldurb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrsb(ARMEmitter::XRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrsb(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { ldursb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrsb(ARMEmitter::WRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == 
ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrsb(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { ldursb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrsb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void strh(ARMEmitter::Register rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { strh(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { strh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1) || MemSrc.MetaType.ImmType.Imm < 0) { sturh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrh(ARMEmitter::Register rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrh(rt, MemSrc.rn, 
MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1) || MemSrc.MetaType.ImmType.Imm < 0) { ldurh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrsh(ARMEmitter::XRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrsh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1) || MemSrc.MetaType.ImmType.Imm < 0) { ldursh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrsh(ARMEmitter::WRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if 
(MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrsh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1) || MemSrc.MetaType.ImmType.Imm < 0) { ldursh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrsh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void str(ARMEmitter::WRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { str(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { str(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b11) || MemSrc.MetaType.ImmType.Imm < 0) { stur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { 
LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldr(ARMEmitter::WRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldr(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldr(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b11) || MemSrc.MetaType.ImmType.Imm < 0) { ldur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrsw(ARMEmitter::XRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrsw(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrsw(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b11) || MemSrc.MetaType.ImmType.Imm < 0) { ldursw(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrsw(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrsw(rt, MemSrc.rn, 
MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrsw(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void str(ARMEmitter::XRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { str(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { str(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) { stur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldr(ARMEmitter::XRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldr(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldr(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) { ldur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { 
ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void prfm(ARMEmitter::Prefetch prfop, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { prfm(prfop, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { prfm(prfop, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) { prfum(prfop, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { prfm(prfop, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void strb(ARMEmitter::VRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { LOGMAN_THROW_A_FMT(MemSrc.MetaType.Extended.Shift == false, "Can't shift byte"); strb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { strb(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { sturb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if 
(MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { strb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrb(ARMEmitter::VRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { LOGMAN_THROW_A_FMT(MemSrc.MetaType.Extended.Shift == false, "Can't shift byte"); ldrb(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrb(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if (MemSrc.MetaType.ImmType.Imm < 0) { ldurb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrb(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void strh(ARMEmitter::VRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { strh(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { strh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 
0b1) || MemSrc.MetaType.ImmType.Imm < 0) { sturh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { strh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldrh(ARMEmitter::VRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldrh(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldrh(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1) || MemSrc.MetaType.ImmType.Imm < 0) { ldurh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldrh(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void str(ARMEmitter::SRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { str(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == 
ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { str(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b11) || MemSrc.MetaType.ImmType.Imm < 0) { stur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldr(ARMEmitter::SRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldr(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldr(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b11) || MemSrc.MetaType.ImmType.Imm < 0) { ldur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void str(ARMEmitter::DRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { str(rt, MemSrc.rn, 
MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { str(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) { stur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldr(ARMEmitter::DRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldr(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldr(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) { ldur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void str(ARMEmitter::QRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType 
== ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { str(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { str(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1111) || MemSrc.MetaType.ImmType.Imm < 0) { stur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { str(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore index type"); FEX_UNREACHABLE; } } } void ldr(ARMEmitter::QRegister rt, ARMEmitter::ExtendedMemOperand MemSrc) { if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED && MemSrc.MetaType.Extended.rm.Idx() != ARMEmitter::Reg::r31.Idx()) { ldr(rt, MemSrc.rn, MemSrc.MetaType.Extended.rm, MemSrc.MetaType.Extended.Option, MemSrc.MetaType.Extended.Shift); } else if (MemSrc.MetaType.Header.MemType == ARMEmitter::ExtendedMemOperand::Type::TYPE_EXTENDED) { ldr(rt, MemSrc.rn); } else { if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) { if ((MemSrc.MetaType.ImmType.Imm & 0b1111) || MemSrc.MetaType.ImmType.Imm < 0) { ldur(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::POST) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::PRE) { ldr(rt, MemSrc.rn, MemSrc.MetaType.ImmType.Imm); } else { LOGMAN_MSG_A_FMT("Unexpected loadstore 
index type"); FEX_UNREACHABLE; } } } // Loadstore PAC void ldraa(XRegister rt, XRegister rn, IndexType type, int32_t offset = 0) { LoadStorePAC(0b11, 0, 0, offset, type, rn, rt); } void ldrab(XRegister rt, XRegister rn, IndexType type, int32_t offset = 0) { LoadStorePAC(0b11, 0, 1, offset, type, rn, rt); } // Loadstore unsigned immediate // Maximum values of unsigned immediate offsets for particular data sizes. static constexpr uint32_t LSByteMaxUnsignedOffset = 4095; static constexpr uint32_t LSHalfMaxUnsignedOffset = 8190; static constexpr uint32_t LSWordMaxUnsignedOffset = 16380; static constexpr uint32_t LSDWordMaxUnsignedOffset = 32760; static constexpr uint32_t LSQWordMaxUnsignedOffset = 65520; void strb(Register rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 0, 0b00, rt, rn, Imm); } void ldrb(Register rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 0, 0b01, rt, rn, Imm); } void ldrsb(XRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 0, 0b10, rt, rn, Imm); } void ldrsb(WRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 0, 0b11, rt, rn, Imm); } void strb(VRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 1, 0b00, rt, rn, Imm); } void ldrb(VRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 1, 0b01, rt, rn, Imm); } void strh(Register rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 0, 0b00, rt, rn, Imm); } void ldrh(Register rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 0, 0b01, rt, rn, Imm); } void ldrsh(XRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 0, 0b10, rt, rn, Imm); } void ldrsh(WRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 0, 0b11, rt, rn, Imm); } void strh(VRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 1, 0b00, rt, rn, Imm); } void ldrh(VRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b01, 1, 0b01, rt, rn, Imm); } void 
str(WRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b10, 0, 0b00, rt, rn, Imm); } void ldr(WRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b10, 0, 0b01, rt, rn, Imm); } void ldrsw(XRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b10, 0, 0b10, rt, rn, Imm); } void str(SRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b10, 1, 0b00, rt, rn, Imm); } void ldr(SRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b10, 1, 0b01, rt, rn, Imm); } void str(XRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b11, 0, 0b00, rt, rn, Imm); } void ldr(XRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b11, 0, 0b01, rt, rn, Imm); } void ldr(SubRegSize size, Register rt, Register rn, uint32_t Imm = 0) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LoadStoreUnsigned(FEXCore::ToUnderlying(size), 0, 0b01, rt, rn, Imm); } void str(SubRegSize size, Register rt, Register rn, uint32_t Imm = 0) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LoadStoreUnsigned(FEXCore::ToUnderlying(size), 0, 0b00, rt, rn, Imm); } void prfm(Prefetch prfop, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b11, 0, 0b10, prfop, rn, Imm); } void str(DRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b11, 1, 0b00, rt, rn, Imm); } void ldr(DRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b11, 1, 0b01, rt, rn, Imm); } void str(QRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 1, 0b10, rt, rn, Imm); } void ldr(QRegister rt, Register rn, uint32_t Imm = 0) { LoadStoreUnsigned(0b00, 1, 0b11, rt, rn, Imm); } private: void AtomicOp(uint32_t Op, ARMEmitter::Size s, uint32_t L, uint32_t o0, ARMEmitter::Register rs, ARMEmitter::Register rt, ARMEmitter::Register rt2, ARMEmitter::Register rn) { const uint32_t sz = s == ARMEmitter::Size::i64Bit ? 
(1U << 30) : 0; uint32_t Instr = Op; Instr |= sz; Instr |= L << 22; Instr |= Encode_rs(rs); Instr |= o0 << 15; Instr |= Encode_rt2(rt2); Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } template void SubAtomicOp(uint32_t Op, ARMEmitter::SubRegSize s, uint32_t L, uint32_t o0, T rs, T rt, T rt2, ARMEmitter::Register rn) { const uint32_t sz = FEXCore::ToUnderlying(s) << 30; uint32_t Instr = Op; Instr |= sz; Instr |= L << 22; Instr |= Encode_rs(rs); Instr |= o0 << 15; Instr |= Encode_rt2(rt2); Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } template void SubAtomicImm(uint32_t Op, ARMEmitter::SubRegSize s, uint32_t opc, T rt, ARMEmitter::Register rn, uint32_t Imm) { const uint32_t sz = FEXCore::ToUnderlying(s) << 30; uint32_t Instr = Op; Instr |= sz; Instr |= opc << 22; Instr |= Imm << 12; Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } // Load register literal template void LoadStoreLiteral(uint32_t Op, T rt, uint32_t Imm) { uint32_t Instr = Op; Instr |= Imm << 5; Instr |= Encode_rt(rt); dc32(Instr); } void MemoryCopyAndMemorySet(uint32_t sz, uint32_t o0, uint32_t op1, uint32_t op2, Register rs, Register rn, Register rd) { uint32_t Instr = 0b0001'1001'0000'0000'0000'0100'0000'0000; Instr |= sz << 30; Instr |= o0 << 26; Instr |= op1 << 22; Instr |= rs.Idx() << 16; Instr |= op2 << 12; Instr |= rn.Idx() << 5; Instr |= rd.Idx(); dc32(Instr); } // Loadstore no-allocate pair template void LoadStoreNoAllocate(uint32_t Op, T rt, T rt2, ARMEmitter::Register rn, uint32_t Imm) { uint32_t Instr = Op; Instr |= Imm << 15; Instr |= Encode_rt2(rt2); Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } // Loadstore register pair post-indexed template void LoadStorePair(uint32_t Op, T rt, T rt2, ARMEmitter::Register rn, uint32_t Imm) { uint32_t Instr = Op; Instr |= Imm << 15; Instr |= Encode_rt2(rt2); Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } // Loadstore register unscaled immediate // Loadstore register 
immediate post-indexed // Loadstore register unprivileged // Loadstore register immediate pre-indexed template void LoadStoreImm(uint32_t Op, uint32_t o2, T rt, ARMEmitter::Register rn, uint32_t Imm) { uint32_t Instr = Op; Instr |= Imm << 12; Instr |= o2 << 10; Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } // Atomic memory operations void LoadStoreAtomicLSE(SubRegSize s, uint32_t A, uint32_t R, uint32_t o3, uint32_t opc, Register rs, Register rt, Register rn) { uint32_t Instr = 0b0011'1000'0010'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(s) << 30; Instr |= A << 23; Instr |= R << 22; Instr |= Encode_rs(rs); Instr |= o3 << 15; Instr |= opc << 12; Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } // Loadstore register-register offset template void LoadStoreRegisterOffset(uint32_t Op, uint32_t opc, T rt, ARMEmitter::Register rn, ARMEmitter::Register rm, ARMEmitter::ExtendedType Option, uint32_t Shift) { uint32_t Instr = Op; Instr |= opc << 22; Instr |= Encode_rt(rt); Instr |= FEXCore::ToUnderlying(Option) << 13; Instr |= Shift << 12; Instr |= Encode_rn(rn); Instr |= Encode_rm(rm); dc32(Instr); } void LoadStorePAC(uint32_t size, uint32_t VR, uint32_t M, int32_t imm, IndexType type, Register rn, Register rt) { LOGMAN_THROW_A_FMT((imm % 8) == 0, "Immediate ({}) must be divisible by 8", imm); LOGMAN_THROW_A_FMT(imm >= -4096 && imm <= 4088, "Immediate ({}) must be within [-4096, 4088]", imm); LOGMAN_THROW_A_FMT(type == IndexType::OFFSET || type == IndexType::PRE, "PAC may only use offset or pre-indexed values"); // The immediate is scaled down in order to fit within the available 10 immediate bits. const auto scaled_imm = static_cast(imm / 8); const auto imm9 = scaled_imm & 0b1'1111'1111; const auto S = (scaled_imm >> 9) & 1; const auto W = type == IndexType::OFFSET ? 
0U : 1U; uint32_t Instr = 0b0011'1000'0010'0000'0000'0100'0000'0000; Instr |= size << 30; Instr |= VR << 26; Instr |= M << 23; Instr |= S << 22; Instr |= imm9 << 12; Instr |= W << 11; Instr |= rn.Idx() << 5; Instr |= rt.Idx(); dc32(Instr); } // Loadstore unsigned immediate template void LoadStoreUnsigned(uint32_t size, uint32_t V, uint32_t opc, T rt, Register rn, uint32_t Imm) { uint32_t SizeShift = size; if constexpr (std::is_same_v) { // 128-bit variant is specified via size=0b00, V=1, opc=0b1x // so we need to special case this one based on whether or not // rt indicates a 128-bit vector. Nice thing is this can be // checked at compile-time. SizeShift = 4; } [[maybe_unused]] const uint32_t MaxImm = LSByteMaxUnsignedOffset << SizeShift; [[maybe_unused]] const uint32_t ElementSize = 1U << SizeShift; LOGMAN_THROW_A_FMT(Imm <= MaxImm, "{}: Offset not valid: Imm: 0x{:x} Max: 0x{:x}", __func__, Imm, MaxImm); LOGMAN_THROW_A_FMT((Imm % ElementSize) == 0, "{}: Offset must be a multiple of {}. Offset: 0x{:x}", __func__, ElementSize, Imm); const uint32_t ShiftedImm = Imm >> SizeShift; uint32_t Instr = 0b0011'1001'0000'0000'0000'0000'0000'0000; Instr |= size << 30; Instr |= V << 26; Instr |= opc << 22; Instr |= ShiftedImm << 10; Instr |= Encode_rn(rn); Instr |= Encode_rt(rt); dc32(Instr); } template void ldp_w(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0010'1100'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? 
(0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 2) & 0b111'1111); } template void ldp_x(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0110'1100'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 3) & 0b111'1111); } template void stp_w(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 252 && ((Imm & 0b11) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0010'1100'00 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 2) & 0b111'1111); } template void stp_x(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -512 && Imm <= 504 && ((Imm & 0b111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b0110'1100'00 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 3) & 0b111'1111); } template void ldp_q(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -1024 && Imm <= 1008 && ((Imm & 0b1111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b1010'1100'01 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? 
(0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 4) & 0b111'1111); } template void stp_q(ARMEmitter::VRegister rt, ARMEmitter::VRegister rt2, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -1024 && Imm <= 1008 && ((Imm & 0b1111) == 0), "Unscaled offset too large"); constexpr uint32_t Op = (0b1010'1100'00 << 22) | (Index == IndexType::POST ? (0b01 << 23) : Index == IndexType::PRE ? (0b11 << 23) : Index == IndexType::OFFSET ? (0b10 << 23) : -1); LoadStorePair(Op, rt, rt2, rn, (Imm >> 4) & 0b111'1111); } template void stXrb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1000'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrb(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1000'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXrb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1100'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrb(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1100'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrsb(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1000'10 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrsb(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1000'11 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXrh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1000'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrh(ARMEmitter::Register rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1000'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXrh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1100'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrh(ARMEmitter::VRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1100'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrsh(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1000'10 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrsh(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0111'1000'11 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1011'1000'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXr(ARMEmitter::WRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1011'1000'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXr(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1011'1100'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXr(ARMEmitter::SRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1011'1100'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXrsw(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1011'1000'10 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1111'1000'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXr(ARMEmitter::XRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1111'1000'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXr(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1111'1100'00 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXr(ARMEmitter::DRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b1111'1100'01 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void stXr(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1100'10 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } template void ldXr(ARMEmitter::QRegister rt, ARMEmitter::Register rn, int32_t Imm) { LOGMAN_THROW_A_FMT(Imm >= -256 && Imm <= 255, "Unscaled offset too large"); constexpr uint32_t Op = 0b0011'1100'11 << 22; constexpr uint32_t o2 = Index == IndexType::POST ? 0b01 : Index == IndexType::PRE ? 0b11 : Index == IndexType::OFFSET ? 0b00 : Index == IndexType::UNPRIVILEGED ? 
0b10 : -1; LoadStoreImm(Op, o2, rt, rn, Imm & 0b1'1111'1111); } #ifndef INCLUDED_BY_EMITTER }; // struct LoadstoreEmitterOps } // namespace ARMEmitter #endif ================================================ FILE: CodeEmitter/CodeEmitter/Registers.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include namespace ARMEmitter { class WRegister; class XRegister; /* Unsized GPR register class * This class doesn't imply a size when used */ class Register { public: Register() = delete; constexpr explicit Register(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const Register&, const Register&) = default; constexpr uint32_t Idx() const { return Index; } constexpr WRegister W() const; constexpr XRegister X() const; private: uint32_t Index; }; static_assert(sizeof(Register) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 32-bit GPR register class. * This class will imply a 32-bit register size being used. */ class WRegister { public: WRegister() = delete; constexpr explicit WRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const WRegister&, const WRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator Register() const { return Register(Index); } constexpr XRegister X() const; constexpr Register R() const; private: uint32_t Index; }; static_assert(sizeof(WRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 64-bit GPR register class. * This class will imply a 64-bit register size being used. 
*/ class XRegister { public: XRegister() = delete; constexpr explicit XRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const XRegister&, const XRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator Register() const { return Register(Index); } constexpr WRegister W() const; constexpr Register R() const; private: uint32_t Index; }; static_assert(sizeof(XRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); inline constexpr WRegister Register::W() const { return WRegister {Index}; } inline constexpr XRegister Register::X() const { return XRegister {Index}; } inline constexpr XRegister WRegister::X() const { return XRegister {Index}; } inline constexpr Register WRegister::R() const { return *this; } inline constexpr WRegister XRegister::W() const { return WRegister {Index}; } inline constexpr Register XRegister::R() const { return *this; } // Namespace containing all unsized GPR register objects. 
namespace Reg {
  // Numbered registers: rN carries index N.
  static constexpr Register r0 {0};
  static constexpr Register r1 {1};
  static constexpr Register r2 {2};
  static constexpr Register r3 {3};
  static constexpr Register r4 {4};
  static constexpr Register r5 {5};
  static constexpr Register r6 {6};
  static constexpr Register r7 {7};
  static constexpr Register r8 {8};
  static constexpr Register r9 {9};
  static constexpr Register r10 {10};
  static constexpr Register r11 {11};
  static constexpr Register r12 {12};
  static constexpr Register r13 {13};
  static constexpr Register r14 {14};
  static constexpr Register r15 {15};
  static constexpr Register r16 {16};
  static constexpr Register r17 {17};
  static constexpr Register r18 {18};
  static constexpr Register r19 {19};
  static constexpr Register r20 {20};
  static constexpr Register r21 {21};
  static constexpr Register r22 {22};
  static constexpr Register r23 {23};
  static constexpr Register r24 {24};
  static constexpr Register r25 {25};
  static constexpr Register r26 {26};
  static constexpr Register r27 {27};
  static constexpr Register r28 {28};
  static constexpr Register r29 {29};
  static constexpr Register r30 {30};
  static constexpr Register r31 {31};

  // Named registers (aliases of the numbered objects above).
  static constexpr Register ip0 {16};
  static constexpr Register ip1 {17};
  static constexpr Register fp {29};
  static constexpr Register lr {30};
  // rsp and zr both carry index 31.
  static constexpr Register rsp {31};
  static constexpr Register zr {31};
} // namespace Reg

// Namespace containing all 64-bit GPR register objects.
namespace XReg {
  // Numbered registers: xN carries index N.
  static constexpr XRegister x0 {0};
  static constexpr XRegister x1 {1};
  static constexpr XRegister x2 {2};
  static constexpr XRegister x3 {3};
  static constexpr XRegister x4 {4};
  static constexpr XRegister x5 {5};
  static constexpr XRegister x6 {6};
  static constexpr XRegister x7 {7};
  static constexpr XRegister x8 {8};
  static constexpr XRegister x9 {9};
  static constexpr XRegister x10 {10};
  static constexpr XRegister x11 {11};
  static constexpr XRegister x12 {12};
  static constexpr XRegister x13 {13};
  static constexpr XRegister x14 {14};
  static constexpr XRegister x15 {15};
  static constexpr XRegister x16 {16};
  static constexpr XRegister x17 {17};
  static constexpr XRegister x18 {18};
  static constexpr XRegister x19 {19};
  static constexpr XRegister x20 {20};
  static constexpr XRegister x21 {21};
  static constexpr XRegister x22 {22};
  static constexpr XRegister x23 {23};
  static constexpr XRegister x24 {24};
  static constexpr XRegister x25 {25};
  static constexpr XRegister x26 {26};
  static constexpr XRegister x27 {27};
  static constexpr XRegister x28 {28};
  static constexpr XRegister x29 {29};
  static constexpr XRegister x30 {30};
  static constexpr XRegister x31 {31};

  // Named registers (aliases of the numbered objects above).
  static constexpr XRegister ip0 {16};
  static constexpr XRegister ip1 {17};
  static constexpr XRegister fp {29};
  static constexpr XRegister lr {30};
  // rsp and zr both carry index 31.
  static constexpr XRegister rsp {31};
  static constexpr XRegister zr {31};
} // namespace XReg

// Namespace containing all 32-bit GPR register objects.
namespace WReg { constexpr static WRegister w0(0); constexpr static WRegister w1(1); constexpr static WRegister w2(2); constexpr static WRegister w3(3); constexpr static WRegister w4(4); constexpr static WRegister w5(5); constexpr static WRegister w6(6); constexpr static WRegister w7(7); constexpr static WRegister w8(8); constexpr static WRegister w9(9); constexpr static WRegister w10(10); constexpr static WRegister w11(11); constexpr static WRegister w12(12); constexpr static WRegister w13(13); constexpr static WRegister w14(14); constexpr static WRegister w15(15); constexpr static WRegister w16(16); constexpr static WRegister w17(17); constexpr static WRegister w18(18); constexpr static WRegister w19(19); constexpr static WRegister w20(20); constexpr static WRegister w21(21); constexpr static WRegister w22(22); constexpr static WRegister w23(23); constexpr static WRegister w24(24); constexpr static WRegister w25(25); constexpr static WRegister w26(26); constexpr static WRegister w27(27); constexpr static WRegister w28(28); constexpr static WRegister w29(29); constexpr static WRegister w30(30); constexpr static WRegister w31(31); // Named registers constexpr static WRegister ip0(16); constexpr static WRegister ip1(17); constexpr static WRegister fp(29); constexpr static WRegister lr(30); constexpr static WRegister rsp(31); constexpr static WRegister zr(31); } // namespace WReg class VRegister; class BRegister; class HRegister; class SRegister; class DRegister; class QRegister; class ZRegister; /* Unsized ASIMD register class * This class doesn't imply a size when used, nor implies Vector or Scalar. * It does imply that this instruction isn't using the register for SVE. 
*/ class VRegister { public: VRegister() = delete; constexpr explicit VRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const VRegister&, const VRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr BRegister B() const; constexpr HRegister H() const; constexpr SRegister S() const; constexpr DRegister D() const; constexpr QRegister Q() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(VRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 8-bit ASIMD register class * This class implies 8-bit scalar register. */ class BRegister { public: BRegister() = delete; constexpr explicit BRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const BRegister&, const BRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator VRegister() const { return VRegister(Index); } constexpr BRegister V() const; constexpr HRegister H() const; constexpr SRegister S() const; constexpr DRegister D() const; constexpr QRegister Q() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(BRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 16-bit ASIMD register class * This class implies 16-bit scalar register. 
*/ class HRegister { public: HRegister() = delete; constexpr explicit HRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const HRegister&, const HRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator VRegister() const { return VRegister(Index); } constexpr HRegister V() const; constexpr BRegister B() const; constexpr SRegister S() const; constexpr DRegister D() const; constexpr QRegister Q() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(HRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 32-bit ASIMD register class * This class implies 32-bit scalar register. */ class SRegister { public: SRegister() = delete; constexpr explicit SRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const SRegister&, const SRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator VRegister() const { return VRegister(Index); } constexpr SRegister V() const; constexpr BRegister B() const; constexpr HRegister H() const; constexpr DRegister D() const; constexpr QRegister Q() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(SRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 64-bit ASIMD register class * This class doesn't imply Vector or Scalar. * Associated with operating the instruction at 64-bit. 
*/ class DRegister { public: DRegister() = delete; constexpr explicit DRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const DRegister&, const DRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator VRegister() const { return VRegister(Index); } constexpr DRegister V() const; constexpr BRegister B() const; constexpr HRegister H() const; constexpr SRegister S() const; constexpr QRegister Q() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(DRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* 128-bit ASIMD register class * This class doesn't imply Vector or Scalar. * Associated with operating the instruction at 128-bit. */ class QRegister { public: QRegister() = delete; constexpr explicit QRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const QRegister&, const QRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator VRegister() const { return VRegister(Index); } constexpr QRegister V() const; constexpr BRegister B() const; constexpr HRegister H() const; constexpr SRegister S() const; constexpr DRegister D() const; constexpr ZRegister Z() const; private: uint32_t Index; }; static_assert(sizeof(QRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); /* Unsized SVE register class. * This class explicitly implies the instruction will operate using SVE. 
*/ class ZRegister { public: ZRegister() = delete; constexpr explicit ZRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const ZRegister&, const ZRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr VRegister V() const; constexpr BRegister B() const; constexpr HRegister H() const; constexpr SRegister S() const; constexpr DRegister D() const; constexpr QRegister Q() const; private: uint32_t Index; }; static_assert(sizeof(ZRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); // VRegister inline constexpr BRegister VRegister::B() const { return BRegister {Index}; } inline constexpr HRegister VRegister::H() const { return HRegister {Index}; } inline constexpr SRegister VRegister::S() const { return SRegister {Index}; } inline constexpr DRegister VRegister::D() const { return DRegister {Index}; } inline constexpr QRegister VRegister::Q() const { return QRegister {Index}; } inline constexpr ZRegister VRegister::Z() const { return ZRegister {Index}; } // BRegister inline constexpr BRegister BRegister::V() const { return *this; } inline constexpr HRegister BRegister::H() const { return HRegister {Index}; } inline constexpr SRegister BRegister::S() const { return SRegister {Index}; } inline constexpr DRegister BRegister::D() const { return DRegister {Index}; } inline constexpr QRegister BRegister::Q() const { return QRegister {Index}; } inline constexpr ZRegister BRegister::Z() const { return ZRegister {Index}; } // HRegister inline constexpr HRegister HRegister::V() const { return *this; } inline constexpr BRegister HRegister::B() const { return BRegister {Index}; } inline constexpr SRegister HRegister::S() const { return SRegister {Index}; } inline constexpr DRegister HRegister::D() const { return DRegister {Index}; } inline constexpr QRegister HRegister::Q() const { return QRegister {Index}; } inline constexpr ZRegister HRegister::Z() const { return 
ZRegister {Index}; } // SRegister inline constexpr SRegister SRegister::V() const { return *this; } inline constexpr BRegister SRegister::B() const { return BRegister {Index}; } inline constexpr HRegister SRegister::H() const { return HRegister {Index}; } inline constexpr DRegister SRegister::D() const { return DRegister {Index}; } inline constexpr QRegister SRegister::Q() const { return QRegister {Index}; } inline constexpr ZRegister SRegister::Z() const { return ZRegister {Index}; } // DRegister inline constexpr DRegister DRegister::V() const { return DRegister {Index}; } inline constexpr BRegister DRegister::B() const { return BRegister {Index}; } inline constexpr HRegister DRegister::H() const { return HRegister {Index}; } inline constexpr SRegister DRegister::S() const { return SRegister {Index}; } inline constexpr QRegister DRegister::Q() const { return QRegister {Index}; } inline constexpr ZRegister DRegister::Z() const { return ZRegister {Index}; } // QRegister inline constexpr QRegister QRegister::V() const { return *this; } inline constexpr BRegister QRegister::B() const { return BRegister {Index}; } inline constexpr HRegister QRegister::H() const { return HRegister {Index}; } inline constexpr SRegister QRegister::S() const { return SRegister {Index}; } inline constexpr DRegister QRegister::D() const { return DRegister {Index}; } inline constexpr ZRegister QRegister::Z() const { return ZRegister {Index}; } // ZRegister inline constexpr VRegister ZRegister::V() const { return VRegister(Index); } inline constexpr BRegister ZRegister::B() const { return BRegister(Index); } inline constexpr HRegister ZRegister::H() const { return HRegister(Index); } inline constexpr SRegister ZRegister::S() const { return SRegister(Index); } inline constexpr DRegister ZRegister::D() const { return DRegister(Index); } inline constexpr QRegister ZRegister::Q() const { return QRegister(Index); } // Namespace containing all unsized ASIMD register objects. 
namespace VReg {
  // vN carries index N.
  static constexpr VRegister v0 {0};
  static constexpr VRegister v1 {1};
  static constexpr VRegister v2 {2};
  static constexpr VRegister v3 {3};
  static constexpr VRegister v4 {4};
  static constexpr VRegister v5 {5};
  static constexpr VRegister v6 {6};
  static constexpr VRegister v7 {7};
  static constexpr VRegister v8 {8};
  static constexpr VRegister v9 {9};
  static constexpr VRegister v10 {10};
  static constexpr VRegister v11 {11};
  static constexpr VRegister v12 {12};
  static constexpr VRegister v13 {13};
  static constexpr VRegister v14 {14};
  static constexpr VRegister v15 {15};
  static constexpr VRegister v16 {16};
  static constexpr VRegister v17 {17};
  static constexpr VRegister v18 {18};
  static constexpr VRegister v19 {19};
  static constexpr VRegister v20 {20};
  static constexpr VRegister v21 {21};
  static constexpr VRegister v22 {22};
  static constexpr VRegister v23 {23};
  static constexpr VRegister v24 {24};
  static constexpr VRegister v25 {25};
  static constexpr VRegister v26 {26};
  static constexpr VRegister v27 {27};
  static constexpr VRegister v28 {28};
  static constexpr VRegister v29 {29};
  static constexpr VRegister v30 {30};
  static constexpr VRegister v31 {31};
} // namespace VReg

// Namespace containing all 8-bit ASIMD register objects.
namespace BReg {
  // bN carries index N.
  static constexpr BRegister b0 {0};
  static constexpr BRegister b1 {1};
  static constexpr BRegister b2 {2};
  static constexpr BRegister b3 {3};
  static constexpr BRegister b4 {4};
  static constexpr BRegister b5 {5};
  static constexpr BRegister b6 {6};
  static constexpr BRegister b7 {7};
  static constexpr BRegister b8 {8};
  static constexpr BRegister b9 {9};
  static constexpr BRegister b10 {10};
  static constexpr BRegister b11 {11};
  static constexpr BRegister b12 {12};
  static constexpr BRegister b13 {13};
  static constexpr BRegister b14 {14};
  static constexpr BRegister b15 {15};
  static constexpr BRegister b16 {16};
  static constexpr BRegister b17 {17};
  static constexpr BRegister b18 {18};
  static constexpr BRegister b19 {19};
  static constexpr BRegister b20 {20};
  static constexpr BRegister b21 {21};
  static constexpr BRegister b22 {22};
  static constexpr BRegister b23 {23};
  static constexpr BRegister b24 {24};
  static constexpr BRegister b25 {25};
  static constexpr BRegister b26 {26};
  static constexpr BRegister b27 {27};
  static constexpr BRegister b28 {28};
  static constexpr BRegister b29 {29};
  static constexpr BRegister b30 {30};
  static constexpr BRegister b31 {31};
} // namespace BReg

// Namespace containing all 16-bit ASIMD register objects.
namespace HReg {
  // hN carries index N.
  static constexpr HRegister h0 {0};
  static constexpr HRegister h1 {1};
  static constexpr HRegister h2 {2};
  static constexpr HRegister h3 {3};
  static constexpr HRegister h4 {4};
  static constexpr HRegister h5 {5};
  static constexpr HRegister h6 {6};
  static constexpr HRegister h7 {7};
  static constexpr HRegister h8 {8};
  static constexpr HRegister h9 {9};
  static constexpr HRegister h10 {10};
  static constexpr HRegister h11 {11};
  static constexpr HRegister h12 {12};
  static constexpr HRegister h13 {13};
  static constexpr HRegister h14 {14};
  static constexpr HRegister h15 {15};
  static constexpr HRegister h16 {16};
  static constexpr HRegister h17 {17};
  static constexpr HRegister h18 {18};
  static constexpr HRegister h19 {19};
  static constexpr HRegister h20 {20};
  static constexpr HRegister h21 {21};
  static constexpr HRegister h22 {22};
  static constexpr HRegister h23 {23};
  static constexpr HRegister h24 {24};
  static constexpr HRegister h25 {25};
  static constexpr HRegister h26 {26};
  static constexpr HRegister h27 {27};
  static constexpr HRegister h28 {28};
  static constexpr HRegister h29 {29};
  static constexpr HRegister h30 {30};
  static constexpr HRegister h31 {31};
} // namespace HReg

// Namespace containing all 32-bit ASIMD register objects.
namespace SReg {
  // sN carries index N.
  static constexpr SRegister s0 {0};
  static constexpr SRegister s1 {1};
  static constexpr SRegister s2 {2};
  static constexpr SRegister s3 {3};
  static constexpr SRegister s4 {4};
  static constexpr SRegister s5 {5};
  static constexpr SRegister s6 {6};
  static constexpr SRegister s7 {7};
  static constexpr SRegister s8 {8};
  static constexpr SRegister s9 {9};
  static constexpr SRegister s10 {10};
  static constexpr SRegister s11 {11};
  static constexpr SRegister s12 {12};
  static constexpr SRegister s13 {13};
  static constexpr SRegister s14 {14};
  static constexpr SRegister s15 {15};
  static constexpr SRegister s16 {16};
  static constexpr SRegister s17 {17};
  static constexpr SRegister s18 {18};
  static constexpr SRegister s19 {19};
  static constexpr SRegister s20 {20};
  static constexpr SRegister s21 {21};
  static constexpr SRegister s22 {22};
  static constexpr SRegister s23 {23};
  static constexpr SRegister s24 {24};
  static constexpr SRegister s25 {25};
  static constexpr SRegister s26 {26};
  static constexpr SRegister s27 {27};
  static constexpr SRegister s28 {28};
  static constexpr SRegister s29 {29};
  static constexpr SRegister s30 {30};
  static constexpr SRegister s31 {31};
} // namespace SReg

// Namespace containing all 64-bit ASIMD register objects.
namespace DReg {
  // dN carries index N.
  static constexpr DRegister d0 {0};
  static constexpr DRegister d1 {1};
  static constexpr DRegister d2 {2};
  static constexpr DRegister d3 {3};
  static constexpr DRegister d4 {4};
  static constexpr DRegister d5 {5};
  static constexpr DRegister d6 {6};
  static constexpr DRegister d7 {7};
  static constexpr DRegister d8 {8};
  static constexpr DRegister d9 {9};
  static constexpr DRegister d10 {10};
  static constexpr DRegister d11 {11};
  static constexpr DRegister d12 {12};
  static constexpr DRegister d13 {13};
  static constexpr DRegister d14 {14};
  static constexpr DRegister d15 {15};
  static constexpr DRegister d16 {16};
  static constexpr DRegister d17 {17};
  static constexpr DRegister d18 {18};
  static constexpr DRegister d19 {19};
  static constexpr DRegister d20 {20};
  static constexpr DRegister d21 {21};
  static constexpr DRegister d22 {22};
  static constexpr DRegister d23 {23};
  static constexpr DRegister d24 {24};
  static constexpr DRegister d25 {25};
  static constexpr DRegister d26 {26};
  static constexpr DRegister d27 {27};
  static constexpr DRegister d28 {28};
  static constexpr DRegister d29 {29};
  static constexpr DRegister d30 {30};
  static constexpr DRegister d31 {31};
} // namespace DReg

// Namespace containing all 128-bit ASIMD register objects.
namespace QReg {
  // qN carries index N.
  static constexpr QRegister q0 {0};
  static constexpr QRegister q1 {1};
  static constexpr QRegister q2 {2};
  static constexpr QRegister q3 {3};
  static constexpr QRegister q4 {4};
  static constexpr QRegister q5 {5};
  static constexpr QRegister q6 {6};
  static constexpr QRegister q7 {7};
  static constexpr QRegister q8 {8};
  static constexpr QRegister q9 {9};
  static constexpr QRegister q10 {10};
  static constexpr QRegister q11 {11};
  static constexpr QRegister q12 {12};
  static constexpr QRegister q13 {13};
  static constexpr QRegister q14 {14};
  static constexpr QRegister q15 {15};
  static constexpr QRegister q16 {16};
  static constexpr QRegister q17 {17};
  static constexpr QRegister q18 {18};
  static constexpr QRegister q19 {19};
  static constexpr QRegister q20 {20};
  static constexpr QRegister q21 {21};
  static constexpr QRegister q22 {22};
  static constexpr QRegister q23 {23};
  static constexpr QRegister q24 {24};
  static constexpr QRegister q25 {25};
  static constexpr QRegister q26 {26};
  static constexpr QRegister q27 {27};
  static constexpr QRegister q28 {28};
  static constexpr QRegister q29 {29};
  static constexpr QRegister q30 {30};
  static constexpr QRegister q31 {31};
} // namespace QReg

// Namespace containing all unsized SVE register objects.
namespace ZReg { constexpr static ZRegister z0(0); constexpr static ZRegister z1(1); constexpr static ZRegister z2(2); constexpr static ZRegister z3(3); constexpr static ZRegister z4(4); constexpr static ZRegister z5(5); constexpr static ZRegister z6(6); constexpr static ZRegister z7(7); constexpr static ZRegister z8(8); constexpr static ZRegister z9(9); constexpr static ZRegister z10(10); constexpr static ZRegister z11(11); constexpr static ZRegister z12(12); constexpr static ZRegister z13(13); constexpr static ZRegister z14(14); constexpr static ZRegister z15(15); constexpr static ZRegister z16(16); constexpr static ZRegister z17(17); constexpr static ZRegister z18(18); constexpr static ZRegister z19(19); constexpr static ZRegister z20(20); constexpr static ZRegister z21(21); constexpr static ZRegister z22(22); constexpr static ZRegister z23(23); constexpr static ZRegister z24(24); constexpr static ZRegister z25(25); constexpr static ZRegister z26(26); constexpr static ZRegister z27(27); constexpr static ZRegister z28(28); constexpr static ZRegister z29(29); constexpr static ZRegister z30(30); constexpr static ZRegister z31(31); } // namespace ZReg // Zero-cost FPR->GPR inline constexpr Register ToReg(HRegister Reg) { return Register(Reg.Idx()); } inline constexpr Register ToReg(SRegister Reg) { return Register(Reg.Idx()); } inline constexpr Register ToReg(DRegister Reg) { return Register(Reg.Idx()); } inline constexpr Register ToReg(VRegister Reg) { return Register(Reg.Idx()); } // Zero-cost GPR->FPR inline constexpr VRegister ToVReg(Register Reg) { return VRegister(Reg.Idx()); } inline constexpr VRegister ToVReg(XRegister Reg) { return VRegister(Reg.Idx()); } inline constexpr VRegister ToVReg(WRegister Reg) { return VRegister(Reg.Idx()); } class PRegisterZero; class PRegisterMerge; /* Unsized predicate register for SVE. * This is unsized because of how SVE operates. 
*/ class PRegister { public: PRegister() = delete; constexpr PRegister(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const PRegister&, const PRegister&) = default; constexpr uint32_t Idx() const { return Index; } constexpr PRegisterZero Zeroing() const; constexpr PRegisterMerge Merging() const; private: uint32_t Index; }; static_assert(sizeof(PRegister) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); // Unsized predicate register for SVE with zeroing semantics. class PRegisterZero { public: PRegisterZero() = delete; constexpr PRegisterZero(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const PRegisterZero&, const PRegisterZero&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator PRegister() const { return PRegister(Index); } constexpr PRegister P() const { return PRegister(Index); } constexpr PRegisterMerge Merging() const; private: uint32_t Index; }; static_assert(sizeof(PRegisterZero) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); // Unsized predicate register for SVE with merging semantics. 
class PRegisterMerge { public: PRegisterMerge() = delete; constexpr PRegisterMerge(uint32_t Idx) : Index {Idx} {} friend constexpr auto operator<=>(const PRegisterMerge&, const PRegisterMerge&) = default; constexpr uint32_t Idx() const { return Index; } constexpr operator PRegister() const { return PRegister(Index); } constexpr PRegister P() const { return PRegister(Index); } constexpr PRegisterZero Zeroing() const; private: uint32_t Index; }; static_assert(sizeof(PRegisterMerge) == sizeof(uint32_t)); static_assert(std::is_trivially_copyable_v); static_assert(std::is_standard_layout_v); // PRegister inline constexpr PRegisterZero PRegister::Zeroing() const { return PRegisterZero(Idx()); } inline constexpr PRegisterMerge PRegister::Merging() const { return PRegisterMerge(Idx()); } // PRegisterZero inline constexpr PRegisterMerge PRegisterZero::Merging() const { return PRegisterMerge(Idx()); } // PRegisterMerge inline constexpr PRegisterZero PRegisterMerge::Zeroing() const { return PRegisterZero(Idx()); } // Namespace containing all unsigned SVE predicate register objects. namespace PReg { constexpr static PRegister p0(0); constexpr static PRegister p1(1); constexpr static PRegister p2(2); constexpr static PRegister p3(3); constexpr static PRegister p4(4); constexpr static PRegister p5(5); constexpr static PRegister p6(6); constexpr static PRegister p7(7); constexpr static PRegister p8(8); constexpr static PRegister p9(9); constexpr static PRegister p10(10); constexpr static PRegister p11(11); constexpr static PRegister p12(12); constexpr static PRegister p13(13); constexpr static PRegister p14(14); constexpr static PRegister p15(15); } // namespace PReg /* `OpType` enum describes how some SVE instructions operate if they support both forms. * Not all SVE instructions support this. 
*/ enum class OpType : uint32_t { Destructive = 0, Constructive, }; } // namespace ARMEmitter ================================================ FILE: CodeEmitter/CodeEmitter/SVEOps.inl ================================================ // SPDX-License-Identifier: MIT /* SVE instruction emitters * These contain instruction emitters for AArch64 SVE and SVE2 operations. * * All of these SVE emitters have a `SubRegSize` as their first argument to set the element size on the instruction. * Since nearly every SVE instruction is unsized they don't need more than `ZRegister` and `PRegister` arguments. * * Most predicated instructions take a `PRegister` argument, not explicitly stating if it is merging or zeroing behaviour. * This is because the instruction only supports one style. * For instructions that take an explicit `PRegisterMerge` or `PRegisterZero`, then this instruction likely * supports both so we support both implementations depending on predicate register type. * * Some instructions take a templated `OpType` to choose between a destructive or constructive version of the instruction. * * Some instructions support the `i128Bit` SubRegSize, mostly around data movement. * * There are some SVE load-store helper functions which take a `SVEMemOperand` argument. * This helper will select the viable SVE load-store that can work with the provided encapsulated arguments. 
*/ #pragma once #ifndef INCLUDED_BY_EMITTER #include namespace ARMEmitter { struct EmitterOps : Emitter { #endif public: // SVE encodings void dup(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Index) { SVEDupIndexed(size, zn, zd, Index); } void sel(SubRegSize size, ZRegister zd, PRegister pv, ZRegister zn, ZRegister zm) { SVESel(size, zm, pv, zn, zd); } void mov(SubRegSize size, ZRegister zd, PRegisterMerge pv, ZRegister zn) { sel(size, zd, pv, zn, zd); } void histcnt(SubRegSize size, ZRegister zd, PRegisterZero pv, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "SubRegSize must be 32-bit or 64-bit"); LOGMAN_THROW_A_FMT(pv <= PReg::p7.Zeroing(), "histcnt can only use p0 to p7"); uint32_t Op = 0b0100'0101'0010'0000'1100'0000'0000'0000; Op |= FEXCore::ToUnderlying(size) << 22; Op |= zm.Idx() << 16; Op |= pv.Idx() << 10; Op |= zn.Idx() << 5; Op |= zd.Idx(); dc32(Op); } void histseg(ZRegister zd, ZRegister zn, ZRegister zm) { uint32_t Op = 0b0100'0101'0010'0000'1010'0000'0000'0000; Op |= zm.Idx() << 16; Op |= zn.Idx() << 5; Op |= zd.Idx(); dc32(Op); } void fcmla(SubRegSize size, ZRegister zda, PRegisterMerge pv, ZRegister zn, ZRegister zm, Rotation rot) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT(pv <= PReg::p7.Merging(), "fcmla can only use p0 to p7"); uint32_t Op = 0b0110'0100'0000'0000'0000'0000'0000'0000; Op |= FEXCore::ToUnderlying(size) << 22; Op |= zm.Idx() << 16; Op |= FEXCore::ToUnderlying(rot) << 13; Op |= pv.Idx() << 10; Op |= zn.Idx() << 5; Op |= zda.Idx(); dc32(Op); } void fcadd(SubRegSize size, ZRegister zd, PRegisterMerge pv, ZRegister zn, ZRegister zm, Rotation rot) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT(pv <= PReg::p7.Merging(), "fcadd can only use p0 to p7"); LOGMAN_THROW_A_FMT(rot == Rotation::ROTATE_90 || rot == Rotation::ROTATE_270, 
"fcadd rotation may only be 90 or 270 degrees"); LOGMAN_THROW_A_FMT(zd == zn, "fcadd zd and zn must be the same register"); const uint32_t ConvertedRotation = rot == Rotation::ROTATE_90 ? 0 : 1; uint32_t Op = 0b0110'0100'0000'0000'1000'0000'0000'0000; Op |= FEXCore::ToUnderlying(size) << 22; Op |= ConvertedRotation << 16; Op |= pv.Idx() << 10; Op |= zm.Idx() << 5; Op |= zd.Idx(); dc32(Op); } // SVE integer add/subtract vectors (unpredicated) void add(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b000, size, zm, zn, zd); } void sub(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b001, size, zm, zn, zd); } void sqadd(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b100, size, zm, zn, zd); } void uqadd(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b101, size, zm, zn, zd); } void sqsub(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b110, size, zm, zn, zd); } void uqsub(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEIntegerAddSubUnpredicated(0b111, size, zm, zn, zd); } // SVE address generation void adr(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, SVEModType mod = SVEModType::MOD_NONE, uint32_t scale = 0) { SVEAddressGeneration(size, zd, zn, zm, mod, scale); } // SVE table lookup (three sources) void tbl(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVETableLookup(0b100, size, zm, zn, zd); } void tbl(SubRegSize size, ZRegister zd, ZRegister zn1, ZRegister zn2, ZRegister zm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zn1, zn2), "TBL zn1 and zn2 must be sequential"); SVETableLookup(0b010, size, zm, zn1, zd); } void tbx(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVETableLookup(0b011, size, zm, zn, zd); } // SVE permute vector elements void zip1(SubRegSize size, ZRegister zd, ZRegister zn, 
ZRegister zm) { SVEPermute(0b000, size, zm, zn, zd); } void zip2(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEPermute(0b001, size, zm, zn, zd); } void uzp1(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEPermute(0b010, size, zm, zn, zd); } void uzp2(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEPermute(0b011, size, zm, zn, zd); } void trn1(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEPermute(0b100, size, zm, zn, zd); } void trn2(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEPermute(0b101, size, zm, zn, zd); } // SVE integer compare with unsigned immediate void cmphi(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, uint32_t imm) { SVEIntegerCompareImm(0, 1, imm, size, pg, zn, pd); } void cmphs(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, uint32_t imm) { SVEIntegerCompareImm(0, 0, imm, size, pg, zn, pd); } void cmplo(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, uint32_t imm) { SVEIntegerCompareImm(1, 0, imm, size, pg, zn, pd); } void cmpls(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, uint32_t imm) { SVEIntegerCompareImm(1, 1, imm, size, pg, zn, pd); } // SVE integer compare with signed immediate void cmpeq(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) { SVEIntegerCompareSignedImm(1, 0, 0, imm, size, pg, zn, pd); } void cmpgt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) { SVEIntegerCompareSignedImm(0, 0, 1, imm, size, pg, zn, pd); } void cmpge(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) { SVEIntegerCompareSignedImm(0, 0, 0, imm, size, pg, zn, pd); } void cmplt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) { SVEIntegerCompareSignedImm(0, 1, 0, imm, size, pg, zn, pd); } void cmple(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) { SVEIntegerCompareSignedImm(0, 1, 1, 
imm, size, pg, zn, pd);
  }
  void cmpne(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, int32_t imm) {
    SVEIntegerCompareSignedImm(1, 0, 1, imm, size, pg, zn, pd);
  }

  // SVE predicate logical operations
  // Every wrapper passes four single-bit selectors followed by the registers
  // in (pm, pg, pn, pd) order. The mov/movs/not_ forms have no pm operand of
  // their own and reuse one of their other registers in that slot.
  void and_(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 0, 0, 0, pm, pg, pn, pd);
  }
  void ands(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 1, 0, 0, pm, pg, pn, pd);
  }
  // Merging move: same selectors as sel, with pd reused in the pm slot.
  void mov(PRegister pd, PRegisterMerge pg, PRegister pn) {
    SVEPredicateLogical(0, 0, 1, 1, pd, pg, pn, pd);
  }
  // Zeroing move: same selectors as and_, with pn reused in the pm slot.
  void mov(PRegister pd, PRegisterZero pg, PRegister pn) {
    SVEPredicateLogical(0, 0, 0, 0, pn, pg, pn, pd);
  }
  // Same selectors as ands, with pn reused in the pm slot.
  void movs(PRegister pd, PRegisterZero pg, PRegister pn) {
    SVEPredicateLogical(0, 1, 0, 0, pn, pg, pn, pd);
  }
  void bic(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 0, 0, 1, pm, pg, pn, pd);
  }
  void bics(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 1, 0, 1, pm, pg, pn, pd);
  }
  void eor(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 0, 1, 0, pm, pg, pn, pd);
  }
  void eors(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 1, 1, 0, pm, pg, pn, pd);
  }
  // Same selectors as eor, with the governing predicate pg reused in the pm slot.
  void not_(PRegister pd, PRegisterZero pg, PRegister pn) {
    SVEPredicateLogical(0, 0, 1, 0, pg, pg, pn, pd);
  }
  void sel(PRegister pd, PRegister pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(0, 0, 1, 1, pm, pg, pn, pd);
  }
  void orr(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 0, 0, 0, pm, pg, pn, pd);
  }
  // Unpredicated move: same selectors as orr, with pn used for pm, pg and pn.
  void mov(PRegister pd, PRegister pn) {
    SVEPredicateLogical(1, 0, 0, 0, pn, pn, pn, pd);
  }
  void orn(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 0, 0, 1, pm, pg, pn, pd);
  }
  void nor(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 0, 1, 0, pm, pg, pn, pd);
  }
  void nand(PRegister pd, PRegisterZero pg,
PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 0, 1, 1, pm, pg, pn, pd);
  }
  void orrs(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 1, 0, 0, pm, pg, pn, pd);
  }
  // Unpredicated form: same selectors as orrs, with pn used for pm, pg and pn.
  void movs(PRegister pd, PRegister pn) {
    SVEPredicateLogical(1, 1, 0, 0, pn, pn, pn, pd);
  }
  void orns(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 1, 0, 1, pm, pg, pn, pd);
  }
  void nors(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 1, 1, 0, pm, pg, pn, pd);
  }
  void nands(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) {
    SVEPredicateLogical(1, 1, 1, 1, pm, pg, pn, pd);
  }

  // SVE broadcast predicate element
  // XXX:

  // SVE integer clamp
  // XXX:

  // SVE2 character match
  // The leading selector is 0 for match and 1 for nmatch.
  void match(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) {
    SVECharacterMatch(0, size, pd, pg, zn, zm);
  }
  void nmatch(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) {
    SVECharacterMatch(1, size, pd, pg, zn, zm);
  }

  // SVE floating-point convert precision odd elements
  // fcvtxnt takes no size argument; its two selector fields are fixed.
  void fcvtxnt(ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEFloatConvertOdd(0b00, 0b10, pg, zn, zd);
  }
  ///< Size is destination size
  void fcvtnt(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i16Bit, "Unsupported size in {}", __func__);

    // Map the destination size onto the two selector fields the encoder
    // expects. The final 0b00 arm is only reachable for sizes the assertion
    // above rejects.
    const auto ConvertedDestSize = size == SubRegSize::i16Bit ? 0b00 : size == SubRegSize::i32Bit ? 0b10 : 0b00;

    const auto ConvertedSrcSize = size == SubRegSize::i16Bit ? 0b10 : size == SubRegSize::i32Bit ? 0b11 : 0b00;

    SVEFloatConvertOdd(ConvertedSrcSize, ConvertedDestSize, pg, zn, zd);
  }
  ///< Size is destination size
  void fcvtlt(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Unsupported size in {}", __func__);

    // Map the destination size onto the two selector fields the encoder
    // expects; the final 0b00 arm is only reachable for rejected sizes.
    const auto ConvertedDestSize = size == SubRegSize::i32Bit ?
0b01 : size == SubRegSize::i64Bit ? 0b11 : 0b00; const auto ConvertedSrcSize = size == SubRegSize::i32Bit ? 0b10 : size == SubRegSize::i64Bit ? 0b11 : 0b00; SVEFloatConvertOdd(ConvertedSrcSize, ConvertedDestSize, pg, zn, zd); } // XXX: BFCVTNT // SVE2 floating-point pairwise operations void faddp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatPairwiseArithmetic(0b000, size, pg, zd, zn, zm); } void fmaxnmp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatPairwiseArithmetic(0b100, size, pg, zd, zn, zm); } void fminnmp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatPairwiseArithmetic(0b101, size, pg, zd, zn, zm); } void fmaxp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatPairwiseArithmetic(0b110, size, pg, zd, zn, zm); } void fminp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatPairwiseArithmetic(0b111, size, pg, zd, zn, zm); } // SVE floating-point multiply-add (indexed) void fmla(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddIndexed(0, size, zda, zn, zm, index); } void fmls(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddIndexed(1, size, zda, zn, zm, index); } // SVE floating-point complex multiply-add (indexed) void fcmla(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index, Rotation rot) { LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit, "SubRegSize must be 16-bit or 32-bit"); // 16 -> 32, 32 -> 64, since fcmla (indexed)'s restrictions and encodings // are essentially as if 16-bit were 32-bit and 32-bit were 64-bit. 
const auto DoubledSize = static_cast(FEXCore::ToUnderlying(size) + 1); SVEFPMultiplyAddIndexed(0b100 | FEXCore::ToUnderlying(rot), DoubledSize, zda, zn, zm, index); } // SVE floating-point multiply (indexed) void fmul(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddIndexed(0b1000, size, zd, zn, zm, index); } // SVE floating point matrix multiply accumulate // XXX: BFMMLA void fmmla(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMatrixMultiplyAccumulate(size, zda, zn, zm); } // SVE floating-point compare vectors void fcmeq(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(0, 1, 0, size, zm, pg, zn, pd); } void fcmgt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(0, 0, 1, size, zm, pg, zn, pd); } void fcmge(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(0, 0, 0, size, zm, pg, zn, pd); } void fcmne(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(0, 1, 1, size, zm, pg, zn, pd); } void fcmuo(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(1, 0, 0, size, zm, pg, zn, pd); } void facge(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(1, 0, 1, size, zm, pg, zn, pd); } void facgt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEFloatCompareVector(1, 1, 1, size, zm, pg, zn, pd); } void facle(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zm, ZRegister zn) { facge(size, pd, pg, zn, zm); } void faclt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zm, ZRegister zn) { facgt(size, pd, pg, zn, zm); } // SVE floating-point arithmetic (unpredicated) void fadd(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { 
SVEFloatArithmeticUnpredicated(0b000, size, zm, zn, zd);
  }
  void fsub(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEFloatArithmeticUnpredicated(0b001, size, zm, zn, zd);
  }
  void fmul(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEFloatArithmeticUnpredicated(0b010, size, zm, zn, zd);
  }
  void ftsmul(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEFloatArithmeticUnpredicated(0b011, size, zm, zn, zd);
  }
  void frecps(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEFloatArithmeticUnpredicated(0b110, size, zm, zn, zd);
  }
  void frsqrts(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEFloatArithmeticUnpredicated(0b111, size, zm, zn, zd);
  }

  // SVE floating-point recursive reduction
  // Each wrapper picks the reduction with a 3-bit selector; the destination
  // is a VRegister rather than a ZRegister.
  void faddv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
    SVEFPRecursiveReduction(0b000, size, vd, pg, zn);
  }
  void fmaxnmv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
    SVEFPRecursiveReduction(0b100, size, vd, pg, zn);
  }
  void fminnmv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
    SVEFPRecursiveReduction(0b101, size, vd, pg, zn);
  }
  void fmaxv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
    SVEFPRecursiveReduction(0b110, size, vd, pg, zn);
  }
  void fminv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
    SVEFPRecursiveReduction(0b111, size, vd, pg, zn);
  }

  // SVE integer Multiply-Add - Predicated

  // SVE integer multiply-accumulate writing addend (predicated)
  // First selector is 0 for this group; the second is 0 for mla and 1 for mls.
  void mla(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMultiplyAddSubPredicated(0b0, 0b0, size, zda, pg, zn, zm);
  }
  void mls(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMultiplyAddSubPredicated(0b0, 0b1, size, zda, pg, zn, zm);
  }

  // SVE integer multiply-add writing multiplicand (predicated)
  // First selector is 1 for this group. Note the parameter order (zm, za) and
  // that za is passed in the encoder slot that mla/mls use for zn.
  void mad(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
    SVEIntegerMultiplyAddSubPredicated(0b1,
0b0, size, zdn, pg, za, zm);
  }
  // Same argument mapping as mad, with the second selector set to 1.
  void msb(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
    SVEIntegerMultiplyAddSubPredicated(0b1, 0b1, size, zdn, pg, za, zm);
  }

  // SVE Integer Binary Arithmetic - Predicated

  // SVE integer add/subtract vectors (predicated)
  // Each wrapper picks the operation with a 3-bit selector.
  void add(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEAddSubVectorsPredicated(0b000, size, zd, pg, zn, zm);
  }
  void sub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEAddSubVectorsPredicated(0b001, size, zd, pg, zn, zm);
  }
  void subr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEAddSubVectorsPredicated(0b011, size, zd, pg, zn, zm);
  }

  // SVE integer min/max/difference (predicated)
  // A 2-bit selector picks max/min/difference; the following single bit is 0
  // for the s-prefixed wrappers and 1 for the u-prefixed ones.
  void smax(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b00, 0, size, pg, zdn, zm, zd);
  }
  void umax(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b00, 1, size, pg, zdn, zm, zd);
  }
  void smin(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b01, 0, size, pg, zdn, zm, zd);
  }
  void umin(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b01, 1, size, pg, zdn, zm, zd);
  }
  void sabd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b10, 0, size, pg, zdn, zm, zd);
  }
  void uabd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEIntegerMinMaxDifferencePredicated(0b10, 1, size, pg, zdn, zm, zd);
  }

  // SVE integer multiply vectors (predicated)
  // The multiply wrappers pass 0b0 as the first selector; a 2-bit selector
  // picks the specific operation.
  void mul(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b0, 0b00, size, zd, pg, zn, zm);
  }
  void smulh(SubRegSize size, ZRegister
zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b0, 0b10, size, zd, pg, zn, zm);
  }
  void umulh(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b0, 0b11, size, zd, pg, zn, zm);
  }

  // SVE integer divide vectors (predicated)
  // The divide wrappers pass 0b1 as the first selector; a 2-bit selector
  // picks the specific operation.
  void sdiv(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b1, 0b00, size, zd, pg, zn, zm);
  }
  void udiv(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b1, 0b01, size, zd, pg, zn, zm);
  }
  void sdivr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b1, 0b10, size, zd, pg, zn, zm);
  }
  void udivr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEIntegerMulDivVectorsPredicated(0b1, 0b11, size, zd, pg, zn, zm);
  }

  // SVE bitwise logical operations (predicated)
  // Each wrapper picks the operation with a 3-bit selector; registers are
  // passed to the encoder in (pg, zdn, zm, zd) order.
  void orr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEBitwiseLogicalPredicated(0b000, size, pg, zdn, zm, zd);
  }
  void eor(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEBitwiseLogicalPredicated(0b001, size, pg, zdn, zm, zd);
  }
  void and_(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEBitwiseLogicalPredicated(0b010, size, pg, zdn, zm, zd);
  }
  void bic(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, ZRegister zm) {
    SVEBitwiseLogicalPredicated(0b011, size, pg, zdn, zm, zd);
  }

  // SVE Integer Reduction

  // SVE integer add reduction (predicated)
  // saddv asserts that the element size is 8, 16 or 32 bits and writes to a
  // DRegister destination.
  void saddv(SubRegSize size, DRegister vd, PRegister pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit, "saddv may only use 8-bit, "
                                                                                                                   "16-bit, or 32-bit "
                                                                                                                   "elements.");
    constexpr uint32_t Op = 0b0000'0100'0000'0000'0010'0000'0000'0000;
SVEIntegerReductionOperation(Op, 0b00, size, vd, pg, zn); } void uaddv(SubRegSize size, DRegister vd, PRegister pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit || size == SubRegSize::i32Bit, "uaddv may only use 8-bit, " "16-bit, or 32-bit " "elements."); constexpr uint32_t Op = 0b0000'0100'0000'0000'0010'0000'0000'0000; SVEIntegerReductionOperation(Op, 0b01, size, vd, pg, zn); } // SVE integer min/max reduction (predicated) void smaxv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0000'1000'001 << 13; SVEIntegerReductionOperation(Op, 0b00, size, vd, pg, zn); } void umaxv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0000'1000'001 << 13; SVEIntegerReductionOperation(Op, 0b01, size, vd, pg, zn); } void sminv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0000'1000'001 << 13; SVEIntegerReductionOperation(Op, 0b10, size, vd, pg, zn); } void uminv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0000'1000'001 << 13; SVEIntegerReductionOperation(Op, 0b11, size, vd, pg, zn); } // SVE constructive prefix (predicated) template requires (std::is_same_v || std::is_same_v) void movprfx(SubRegSize size, ZRegister zd, T pg, ZRegister zn) { constexpr uint32_t M = std::is_same_v ? 
1 : 0; SVEConstructivePrefixPredicated(0b00, M, size, pg, zn, zd); } // SVE bitwise logical reduction (predicated) void orv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0001'1000'0010'0000'0000'0000; SVEIntegerReductionOperation(Op, 0b00, size, vd, pg, zn); } void eorv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0001'1000'0010'0000'0000'0000; SVEIntegerReductionOperation(Op, 0b01, size, vd, pg, zn); } void andv(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { constexpr uint32_t Op = 0b0000'0100'0001'1000'0010'0000'0000'0000; SVEIntegerReductionOperation(Op, 0b10, size, vd, pg, zn); } // SVE Bitwise Shift - Predicated // SVE bitwise shift by immediate (predicated) void asr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b00, 0, 0, pg, zd, zdn, Shift); } void lsr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b00, 0, 1, pg, zd, zdn, Shift); } void lsl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b00, 1, 1, pg, zd, zdn, Shift); } void asrd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b01, 0, 0, pg, zd, zdn, Shift); } void sqshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b01, 1, 0, pg, zd, zdn, Shift); } void uqshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b01, 1, 1, pg, zd, zdn, Shift); } void srshr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) { SVEBitWiseShiftImmediatePred(size, 0b11, 0, 0, pg, zd, zdn, Shift); } void urshr(SubRegSize size, ZRegister zd, PRegisterMerge pg, 
ZRegister zdn, uint32_t Shift) {
    SVEBitWiseShiftImmediatePred(size, 0b11, 0, 1, pg, zd, zdn, Shift);
  }
  void sqshlu(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zdn, uint32_t Shift) {
    SVEBitWiseShiftImmediatePred(size, 0b11, 1, 1, pg, zd, zdn, Shift);
  }

  // SVE bitwise shift by vector (predicated)
  // Three single-bit selectors choose the shift; the first one is 1 only for
  // the r-suffixed wrappers (asrr/lsrr/lslr).
  void asr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(0, 0, 0, size, pg, zd, zn, zm);
  }
  void lsr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(0, 0, 1, size, pg, zd, zn, zm);
  }
  void lsl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(0, 1, 1, size, pg, zd, zn, zm);
  }
  void asrr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(1, 0, 0, size, pg, zd, zn, zm);
  }
  void lsrr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(1, 0, 1, size, pg, zd, zn, zm);
  }
  void lslr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftbyVector(1, 1, 1, size, pg, zd, zn, zm);
  }

  // SVE bitwise shift by wide elements (predicated)
  // Each wrapper picks the shift with a 3-bit selector.
  void asr_wide(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftByWideElementPredicated(size, 0b000, zd, pg, zn, zm);
  }
  void lsr_wide(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftByWideElementPredicated(size, 0b001, zd, pg, zn, zm);
  }
  void lsl_wide(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    SVEBitwiseShiftByWideElementPredicated(size, 0b011, zd, pg, zn, zm);
  }

  // SVE Integer Unary Arithmetic - Predicated

  // SVE integer unary operations (predicated)
  // This group passes 0b10 as the first selector and a 3-bit selector for the
  // operation. The extension wrappers assert which element sizes they accept.
  void sxtb(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size ==
SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b000, size, pg, zn, zd);
  }
  // Accepts 16-, 32- and 64-bit element sizes.
  void uxtb(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b001, size, pg, zn, zd);
  }
  // Accepts 32- and 64-bit element sizes.
  void sxth(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b010, size, pg, zn, zd);
  }
  // Accepts 32- and 64-bit element sizes.
  void uxth(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b011, size, pg, zn, zd);
  }
  // Accepts only the 64-bit element size.
  void sxtw(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b100, size, pg, zn, zd);
  }
  // Accepts only the 64-bit element size.
  void uxtw(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit, "Invalid subregsize size");
    SVEIntegerUnaryPredicated(0b10, 0b101, size, pg, zn, zd);
  }
  void abs(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b10, 0b110, size, pg, zn, zd);
  }
  void neg(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b10, 0b111, size, pg, zn, zd);
  }

  // SVE bitwise unary operations (predicated)
  // Same encoder as the group above, with 0b11 as the first selector.
  void cls(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b11, 0b000, size, pg, zn, zd);
  }
  void clz(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b11, 0b001, size, pg, zn, zd);
  }
  void cnt(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b11, 0b010, size, pg, zn,
zd);
  }
  void cnot(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b11, 0b011, size, pg, zn, zd);
  }
  // fabs/fneg reject the 8-bit element size.
  void fabs(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size");
    SVEIntegerUnaryPredicated(0b11, 0b100, size, pg, zn, zd);
  }
  void fneg(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Invalid size");
    SVEIntegerUnaryPredicated(0b11, 0b101, size, pg, zn, zd);
  }
  void not_(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVEIntegerUnaryPredicated(0b11, 0b110, size, pg, zn, zd);
  }

  // SVE Bitwise Logical - Unpredicated

  // SVE bitwise logical operations (unpredicated)
  // A 2-bit selector picks the operation; registers are passed to the encoder
  // in (zm, zn, zd) order.
  void and_(ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEBitwiseLogicalUnpredicated(0b00, zm, zn, zd);
  }
  void orr(ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEBitwiseLogicalUnpredicated(0b01, zm, zn, zd);
  }
  // Same selector as orr, with zn used for both source slots.
  void mov(ZRegister zd, ZRegister zn) {
    SVEBitwiseLogicalUnpredicated(0b01, zn, zn, zd);
  }
  void eor(ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEBitwiseLogicalUnpredicated(0b10, zm, zn, zd);
  }
  void bic(ZRegister zd, ZRegister zn, ZRegister zm) {
    SVEBitwiseLogicalUnpredicated(0b11, zm, zn, zd);
  }

  // Emits xar directly rather than through a shared encoder. The element size
  // and rotate amount are combined by EncodeSVEShiftImmediate into the
  // tszh / tszl:imm3 fields; 128-bit elements are rejected.
  void xar(SubRegSize size, ZRegister zd, ZRegister zm, uint32_t rotate) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Element size cannot be 128-bit.");

    const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, rotate);

    uint32_t Inst = 0b0000'0100'0010'0000'0011'0100'0000'0000;
    Inst |= tszh << 22;
    Inst |= tszl_imm3 << 16;
    Inst |= zm.Idx() << 5;
    Inst |= zd.Idx();
    dc32(Inst);
  }

  // SVE2 bitwise ternary operations
  // A 2-bit selector and a single-bit selector pick the operation; registers
  // are passed to the encoder in (zm, zk, zd, zdn) order.
  void eor3(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) {
    SVE2BitwiseTernary(0b00, 0, zm, zk, zd, zdn);
  }
  void bsl(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) {
    SVE2BitwiseTernary(0b00, 1, zm, zk, zd, zdn);
  }
  void bcax(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) {
SVE2BitwiseTernary(0b01, 0, zm, zk, zd, zdn); } void bsl1n(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) { SVE2BitwiseTernary(0b01, 1, zm, zk, zd, zdn); } void bsl2n(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) { SVE2BitwiseTernary(0b10, 1, zm, zk, zd, zdn); } void nbsl(ZRegister zd, ZRegister zdn, ZRegister zm, ZRegister zk) { SVE2BitwiseTernary(0b11, 1, zm, zk, zd, zdn); } // SVE Index Generation void index(SubRegSize size, ZRegister zd, int32_t initial, int32_t increment) { LOGMAN_THROW_A_FMT(initial >= -16 && initial <= 15, "initial value must be within -16-15. initial: {}", initial); LOGMAN_THROW_A_FMT(increment >= -16 && increment <= 15, "increment value must be within -16-15. increment: {}", increment); SVEIndexGeneration(0b00, size, zd, initial, increment); } void index(SubRegSize size, ZRegister zd, Register initial, int32_t increment) { LOGMAN_THROW_A_FMT(increment >= -16 && increment <= 15, "increment value must be within -16-15. increment: {}", increment); SVEIndexGeneration(0b01, size, zd, static_cast(initial.Idx()), increment); } void index(SubRegSize size, ZRegister zd, int32_t initial, Register increment) { LOGMAN_THROW_A_FMT(initial >= -16 && initial <= 15, "initial value must be within -16-15. 
initial: {}", initial); SVEIndexGeneration(0b10, size, zd, initial, static_cast(increment.Idx())); } void index(SubRegSize size, ZRegister zd, Register initial, Register increment) { SVEIndexGeneration(0b11, size, zd, static_cast(initial.Idx()), static_cast(increment.Idx())); } // SVE Stack Allocation // SVE stack frame adjustment void addvl(XRegister rd, XRegister rn, int32_t imm) { SVEStackFrameOperation(0b00, rd, rn, imm); } void addpl(XRegister rd, XRegister rn, int32_t imm) { SVEStackFrameOperation(0b01, rd, rn, imm); } // Streaming SVE stack frame adjustment (SME) // XXX: // SVE stack frame size void rdvl(XRegister rd, int32_t imm) { // Would-be Rn field is just set to all 1's, which is the same // as writing the encoding for the SP into it. SVEStackFrameOperation(0b10, rd, XReg::rsp, imm); } // Streaming SVE stack frame size (SME) // XXX: // SVE2 Integer Multiply - Unpredicated // SVE2 integer multiply vectors (unpredicated) void mul(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b00, size, zm, zn, zd); } void smulh(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b10, size, zm, zn, zd); } void umulh(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b11, size, zm, zn, zd); } void pmul(ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b01, SubRegSize::i8Bit, zm, zn, zd); } // SVE2 signed saturating doubling multiply high (unpredicated) void sqdmulh(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b100, size, zm, zn, zd); } void sqrdmulh(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyVectors(0b101, size, zm, zn, zd); } // SVE Bitwise Shift - Unpredicated // SVE bitwise shift by wide elements (unpredicated) void asr_wide(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVEBitwiseShiftByWideElementsUnpredicated(size, 0b00, zd, zn, zm); } 
// Remaining wide-element shifts: same helper as asr_wide, selected by the second argument.
void lsr_wide(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
  SVEBitwiseShiftByWideElementsUnpredicated(size, 0b01, zd, zn, zm);
}
void lsl_wide(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
  SVEBitwiseShiftByWideElementsUnpredicated(size, 0b11, zd, zn, zm);
}

// SVE bitwise shift by immediate (unpredicated)
void asr(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) {
  SVEBitWiseShiftImmediateUnpred(size, 0b00, zd, zn, shift);
}
void lsr(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) {
  SVEBitWiseShiftImmediateUnpred(size, 0b01, zd, zn, shift);
}
void lsl(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) {
  SVEBitWiseShiftImmediateUnpred(size, 0b11, zd, zn, shift);
}

// SVE Integer Misc - Unpredicated
// SVE floating-point trig select coefficient
// Asserts IsStandardFloatSize(size); zm's register index is passed as the helper's second argument.
void ftssel(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "ftssel may only use 16/32/64-bit element sizes");
  SVEIntegerMiscUnpredicated(0b00, zm.Idx(), FEXCore::ToUnderlying(size), zd, zn);
}

// SVE floating-point exponential accelerator
// Asserts IsStandardFloatSize(size).
void fexpa(SubRegSize size, ZRegister zd, ZRegister zn) {
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "fexpa may only use 16/32/64-bit element sizes");
  SVEIntegerMiscUnpredicated(0b10, 0b00000, FEXCore::ToUnderlying(size), zd, zn);
}

// SVE constructive prefix (unpredicated)
void movprfx(ZRegister zd, ZRegister zn) {
  SVEIntegerMiscUnpredicated(0b11, 0b00000, 0b00, zd, zn);
}

// SVE Element Count
// SVE saturating inc/dec vector by element count
// The element size is fixed by the mnemonic suffix (h/w/d -> i16Bit/i32Bit/i64Bit);
// the second argument selects sqinc/uqinc/sqdec/uqdec.
void sqinch(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0000, SubRegSize::i16Bit, zdn, pattern, imm4);
}
void uqinch(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0001, SubRegSize::i16Bit, zdn, pattern, imm4);
}
void sqdech(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0010, SubRegSize::i16Bit, zdn, pattern, imm4);
}
void uqdech(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0011, SubRegSize::i16Bit, zdn, pattern, imm4);
}
void sqincw(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0000, SubRegSize::i32Bit, zdn, pattern, imm4);
}
void uqincw(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0001, SubRegSize::i32Bit, zdn, pattern, imm4);
}
void sqdecw(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0010, SubRegSize::i32Bit, zdn, pattern, imm4);
}
void uqdecw(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0011, SubRegSize::i32Bit, zdn, pattern, imm4);
}
void sqincd(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0000, SubRegSize::i64Bit, zdn, pattern, imm4);
}
void uqincd(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0001, SubRegSize::i64Bit, zdn, pattern, imm4);
}
void sqdecd(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0010, SubRegSize::i64Bit, zdn, pattern, imm4);
}
void uqdecd(ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
  SVEElementCount(0, 0b0011, SubRegSize::i64Bit, zdn, pattern, imm4);
}

// SVE element count
// The general-purpose destination is wrapped as ZRegister {idx} so that only its
// register index reaches the shared SVEElementCount helper.
void cntb(XRegister rd, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1000, SubRegSize::i8Bit, ZRegister {rd.Idx()}, pattern, imm);
}
void cnth(XRegister rd, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1000, SubRegSize::i16Bit, ZRegister {rd.Idx()}, pattern, imm);
}
void cntw(XRegister rd, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1000, SubRegSize::i32Bit, ZRegister {rd.Idx()}, pattern, imm);
}
void cntd(XRegister rd, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1000, SubRegSize::i64Bit, ZRegister {rd.Idx()}, pattern, imm);
}

// SVE inc/dec vector by element count
void inch(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0000, SubRegSize::i16Bit, zdn, pattern, imm);
}
void dech(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0001, SubRegSize::i16Bit, zdn, pattern, imm);
}
void incw(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0000, SubRegSize::i32Bit, zdn, pattern, imm);
}
void decw(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0001, SubRegSize::i32Bit, zdn, pattern, imm);
}
void incd(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0000, SubRegSize::i64Bit, zdn, pattern, imm);
}
void decd(ZRegister zdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b0001, SubRegSize::i64Bit, zdn, pattern, imm);
}

// SVE inc/dec register by element count
void incb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1000, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void decb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1001, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void inch(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1000, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void dech(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1001, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void incw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1000, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void decw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1001, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void incd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1000, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void decd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1001, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}

// SVE saturating inc/dec register by element count
// Each mnemonic has an XRegister and a WRegister overload; they differ only in the
// first SVEElementCount argument (1 for XRegister, 0 for WRegister).
void sqincb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1100, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqincb(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1100, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1101, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincb(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1101, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1110, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecb(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1110, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecb(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1111, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecb(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1111, SubRegSize::i8Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqinch(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1100, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqinch(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1100, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqinch(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1101, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqinch(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1101, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdech(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1110, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdech(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1110, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdech(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1111, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdech(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1111, SubRegSize::i16Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqincw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1100, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqincw(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1100, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1101, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincw(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1101, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1110, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecw(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1110, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecw(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1111, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecw(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1111, SubRegSize::i32Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqincd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1100, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqincd(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1100, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1101, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqincd(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1101, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1110, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void sqdecd(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1110, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecd(XRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(1, 0b1111, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}
void uqdecd(WRegister rdn, PredicatePattern pattern, uint32_t imm) {
  SVEElementCount(0, 0b1111, SubRegSize::i64Bit, ZRegister {rdn.Idx()}, pattern, imm);
}

// SVE Bitwise Immediate
// XXX: DUPM

// SVE bitwise logical with immediate (unpredicated)
// XXX:

// SVE Integer Wide Immediate - Predicated
void fcpy(SubRegSize size, ZRegister zd, PRegisterMerge pg, float value) {
  SVEBroadcastFloatImmPredicated(size, zd, pg, value);
}
// fmov (predicated) is an alias that forwards to fcpy.
void fmov(SubRegSize size, ZRegister zd, PRegisterMerge pg, float value) {
  fcpy(size, zd, pg, value);
}

// SVE copy integer immediate (predicated)
// The PRegisterZero overload passes 0 and the PRegisterMerge overload passes 1 as the
// helper's first argument.
void cpy(SubRegSize size, ZRegister zd, PRegisterZero pg, int32_t imm) {
  SVEBroadcastIntegerImmPredicated(0, size, zd, pg, imm);
}
void cpy(SubRegSize size, ZRegister zd, PRegisterMerge pg, int32_t imm) {
  SVEBroadcastIntegerImmPredicated(1, size, zd, pg, imm);
}
// mov_imm (predicated) is an alias that forwards to the matching cpy overload.
void mov_imm(SubRegSize size, ZRegister zd, PRegisterZero pg, int32_t imm) {
  cpy(size, zd, pg, imm);
}
void mov_imm(SubRegSize size, ZRegister zd, PRegisterMerge pg, int32_t imm) {
  cpy(size, zd, pg, imm);
}

// SVE Permute Vector - Unpredicated
void dup(SubRegSize size, ZRegister zd, Register rn) { SVEPermuteUnpredicated(size, 0b00000, zd, ZRegister {rn.Idx()}); } void mov(SubRegSize size, ZRegister zd, Register rn) { dup(size, zd, rn); } void insr(SubRegSize size, ZRegister zdn, Register rm) { SVEPermuteUnpredicated(size, 0b00100, zdn, ZRegister {rm.Idx()}); } void insr(SubRegSize size, ZRegister zdn, VRegister vm) { SVEPermuteUnpredicated(size, 0b10100, zdn, vm.Z()); } void rev(SubRegSize size, ZRegister zd, ZRegister zn) { SVEPermuteUnpredicated(size, 0b11000, zd, zn); } // SVE unpack vector elements void sunpklo(SubRegSize size, ZRegister zd, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size"); SVEPermuteUnpredicated(size, 0b10000, zd, zn); } void sunpkhi(SubRegSize size, ZRegister zd, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size"); SVEPermuteUnpredicated(size, 0b10001, zd, zn); } void uunpklo(SubRegSize size, ZRegister zd, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size"); SVEPermuteUnpredicated(size, 0b10010, zd, zn); } void uunpkhi(SubRegSize size, ZRegister zd, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid subregsize size"); SVEPermuteUnpredicated(size, 0b10011, zd, zn); } // SVE Permute Predicate void rev(SubRegSize size, PRegister pd, PRegister pn) { SVEPermutePredicate(size, 0b10100, 0b0000, 0b0, pd, pn); } // SVE unpack predicate elements void punpklo(PRegister pd, PRegister pn) { SVEPermutePredicate(SubRegSize::i8Bit, 0b10000, 0b0000, 0b0, pd, pn); } void punpkhi(PRegister pd, PRegister pn) { SVEPermutePredicate(SubRegSize::i8Bit, 0b10001, 0b0000, 0b0, pd, pn); } // SVE permute predicate elements void 
zip1(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b0000, 0b0, pd, pn); } void zip2(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b0010, 0b0, pd, pn); } void uzp1(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b0100, 0b0, pd, pn); } void uzp2(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b0110, 0b0, pd, pn); } void trn1(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b1000, 0b0, pd, pn); } void trn2(SubRegSize size, PRegister pd, PRegister pn, PRegister pm) { SVEPermutePredicate(size, pm.Idx(), 0b1010, 0b0, pd, pn); } // SVE Permute Vector - Predicated - Base // CPY (SIMD&FP scalar) void cpy(SubRegSize size, ZRegister zd, PRegisterMerge pg, VRegister vn) { SVEPermuteVectorPredicated(0b00000, 0b0, size, zd, pg, ZRegister {vn.Idx()}); } void compact(SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit || size == SubRegSize::i32Bit, "Invalid element size"); SVEPermuteVectorPredicated(0b00001, 0b0, size, zd, pg, zn); } // CPY (scalar) void cpy(SubRegSize size, ZRegister zd, PRegisterMerge pg, Register rn) { SVEPermuteVectorPredicated(0b01000, 0b1, size, zd, pg, ZRegister {rn.Idx()}); } template requires (optype == OpType::Constructive) void splice(SubRegSize size, ZRegister zd, PRegister pv, ZRegister zn, ZRegister zn2) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zn, zn2), "zn and zn2 must be sequential registers"); SVEPermuteVectorPredicated(0b01101, 0b0, size, zd, pv, zn); } template requires (optype == OpType::Destructive) void splice(SubRegSize size, ZRegister zd, PRegister pv, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(zd == zn, "zd needs to equal zn"); SVEPermuteVectorPredicated(0b01100, 0b0, size, zd, pv, zm); } // SVE Permute Vector - Predicated // SVE 
extract element to general register void lasta(SubRegSize size, Register rd, PRegister pg, ZRegister zn) { SVEPermuteVectorPredicated(0b00000, 0b1, size, ZRegister {rd.Idx()}, pg, zn); } void lastb(SubRegSize size, Register rd, PRegister pg, ZRegister zn) { SVEPermuteVectorPredicated(0b00001, 0b1, size, ZRegister {rd.Idx()}, pg, zn); } // SVE extract element to SIMD&FP scalar register void lasta(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { SVEPermuteVectorPredicated(0b00010, 0b0, size, ZRegister {vd.Idx()}, pg, zn); } void lastb(SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) { SVEPermuteVectorPredicated(0b00011, 0b0, size, ZRegister {vd.Idx()}, pg, zn); } // SVE reverse within elements void revb(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Can't use 8-bit element size"); SVEPermuteVectorPredicated(0b00100, 0b0, size, zd, pg, zn); } void revh(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i16Bit, "Can't use 8/16-bit element sizes"); SVEPermuteVectorPredicated(0b00101, 0b0, size, zd, pg, zn); } void revw(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit, "Can't use 8/16/32-bit element sizes"); SVEPermuteVectorPredicated(0b00110, 0b0, size, zd, pg, zn); } void rbit(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEPermuteVectorPredicated(0b00111, 0b0, size, zd, pg, zn); } // SVE conditionally broadcast element to vector void clasta(SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(zd == zn, "zd must be the same as zn"); SVEPermuteVectorPredicated(0b01000, 0b0, size, zd, pg, zm); } void clastb(SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(zd == zn, "zd must be the same as zn"); SVEPermuteVectorPredicated(0b01001, 0b0, 
size, zd, pg, zm); } // SVE conditionally extract element to SIMD&FP scalar void clasta(SubRegSize size, VRegister vd, PRegister pg, VRegister vn, ZRegister zm) { LOGMAN_THROW_A_FMT(vd == vn, "vd must be the same as vn"); SVEPermuteVectorPredicated(0b01010, 0b0, size, ZRegister {vd.Idx()}, pg, zm); } void clastb(SubRegSize size, VRegister vd, PRegister pg, VRegister vn, ZRegister zm) { LOGMAN_THROW_A_FMT(vd == vn, "vd must be the same as vn"); SVEPermuteVectorPredicated(0b01011, 0b0, size, ZRegister {vd.Idx()}, pg, zm); } // SVE reverse doublewords (SME) // XXX: // SVE conditionally extract element to general register void clasta(SubRegSize size, Register rd, PRegister pg, Register rn, ZRegister zm) { LOGMAN_THROW_A_FMT(rd == rn, "rd must be the same as rn"); SVEPermuteVectorPredicated(0b10000, 0b1, size, ZRegister {rd.Idx()}, pg, zm); } void clastb(SubRegSize size, Register rd, PRegister pg, Register rn, ZRegister zm) { LOGMAN_THROW_A_FMT(rd == rn, "rd must be the same as rn"); SVEPermuteVectorPredicated(0b10001, 0b1, size, ZRegister {rd.Idx()}, pg, zm); } // SVE Permute Vector - Extract // Constructive template requires (optype == OpType::Constructive) void ext(ZRegister zd, ZRegister zn, ZRegister zn2, uint8_t Imm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zn, zn2), "zn and zn2 must be sequential registers"); SVEPermuteVector(1, zd, zn, Imm); } // Destructive template requires (optype == OpType::Destructive) void ext(ZRegister zd, ZRegister zdn, ZRegister zm, uint8_t Imm) { LOGMAN_THROW_A_FMT(zd == zdn, "Dest needs to equal zdn"); SVEPermuteVector(0, zd, zm, Imm); } // SVE Permute Vector - Segments // SVE permute vector segments // XXX: // SVE Integer Compare - Vectors // SVE integer compare vectors void cmpeq(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(1, 1, 0, size, zm, pg, zn, pd); } void cmpge(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(1, 0, 
0, size, zm, pg, zn, pd); } void cmpgt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(1, 0, 1, size, zm, pg, zn, pd); } void cmphi(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(0, 0, 1, size, zm, pg, zn, pd); } void cmphs(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(0, 0, 0, size, zm, pg, zn, pd); } void cmpne(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVector(1, 1, 1, size, zm, pg, zn, pd); } // SVE integer compare with wide elements void cmpeq_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b01, 0, size, pd, pg, zn, zm); } void cmpgt_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b10, 1, size, pd, pg, zn, zm); } void cmpge_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b10, 0, size, pd, pg, zn, zm); } void cmphi_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(1, 0b10, 1, size, pd, pg, zn, zm); } void cmphs_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(1, 0b10, 0, size, pd, pg, zn, zm); } void cmplt_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b11, 0, size, pd, pg, zn, zm); } void cmple_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b11, 1, size, pd, pg, zn, zm); } void cmplo_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(1, 0b11, 0, size, pd, pg, zn, zm); } void cmpls_wide(SubRegSize size, PRegister pd, 
PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(1, 0b11, 1, size, pd, pg, zn, zm); } void cmpne_wide(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) { SVEIntegerCompareVectorWide(0, 0b01, 1, size, pd, pg, zn, zm); } // SVE Propagate Break // SVE propagate break from previous partition void brkpa(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { SVEPropagateBreak(0b0000, 0b11, 0, pd, pg, pn, pm); } void brkpb(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { SVEPropagateBreak(0b0000, 0b11, 1, pd, pg, pn, pm); } void brkpas(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { SVEPropagateBreak(0b0100, 0b11, 0, pd, pg, pn, pm); } void brkpbs(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { SVEPropagateBreak(0b0100, 0b11, 1, pd, pg, pn, pm); } // SVE Partition Break // SVE propagate break to next partition void brkn(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { LOGMAN_THROW_A_FMT(pd == pm, "pd and pm need to be the same"); SVEPropagateBreak(0b0001, 0b01, 0, pd, pg, pn, PReg::p8); } void brkns(PRegister pd, PRegisterZero pg, PRegister pn, PRegister pm) { LOGMAN_THROW_A_FMT(pd == pm, "pd and pm need to be the same"); SVEPropagateBreak(0b0101, 0b01, 0, pd, pg, pn, PReg::p8); } // SVE partition break condition void brka(PRegister pd, PRegisterZero pg, PRegister pn) { SVEPropagateBreak(0b0001, 0b01, 0, pd, pg, pn, PReg::p0); } void brka(PRegister pd, PRegisterMerge pg, PRegister pn) { SVEPropagateBreak(0b0001, 0b01, 1, pd, pg, pn, PReg::p0); } void brkas(PRegister pd, PRegisterZero pg, PRegister pn) { SVEPropagateBreak(0b0101, 0b01, 0, pd, pg, pn, PReg::p0); } void brkb(PRegister pd, PRegisterZero pg, PRegister pn) { SVEPropagateBreak(0b1001, 0b01, 0, pd, pg, pn, PReg::p0); } void brkb(PRegister pd, PRegisterMerge pg, PRegister pn) { SVEPropagateBreak(0b1001, 0b01, 1, pd, pg, pn, PReg::p0); } void brkbs(PRegister pd, PRegisterZero pg, PRegister pn) { 
SVEPropagateBreak(0b1101, 0b01, 0, pd, pg, pn, PReg::p0); } // SVE Predicate Misc void pnext(SubRegSize size, PRegister pd, PRegister pv, PRegister pn) { LOGMAN_THROW_A_FMT(pd == pn, "pd and pn need to be the same"); SVEPredicateMisc(0b1001, 0b00010, pv.Idx(), size, pd); } // SVE predicate test void ptest(PRegister pg, PRegister pn) { SVEPredicateMisc(0b0000, pg.Idx() << 1, pn.Idx(), SubRegSize::i16Bit, PReg::p0); } // SVE predicate first active void pfirst(PRegister pd, PRegister pg, PRegister pn) { LOGMAN_THROW_A_FMT(pd == pn, "pd and pn need to be the same"); SVEPredicateMisc(0b1000, 0b00000, pg.Idx(), SubRegSize::i16Bit, pd); } // SVE predicate zero void pfalse(PRegister pd) { SVEPredicateMisc(0b1000, 0b10010, 0b0000, SubRegSize::i8Bit, pd); } // SVE predicate read from FFR (predicated) void rdffr(PRegister pd, PRegisterZero pg) { SVEPredicateMisc(0b1000, 0b11000, pg.Idx(), SubRegSize::i8Bit, pd); } void rdffrs(PRegister pd, PRegisterZero pg) { SVEPredicateMisc(0b1000, 0b11000, pg.Idx(), SubRegSize::i16Bit, pd); } // SVE predicate read from FFR (unpredicated) void rdffr(PRegister pd) { SVEPredicateMisc(0b1001, 0b11000, 0b0000, SubRegSize::i8Bit, pd); } // SVE predicate initialize void ptrue(SubRegSize size, PRegister pd, PredicatePattern pattern) { SVEPredicateMisc(0b1000, 0b10000, FEXCore::ToUnderlying(pattern), size, pd); } void ptrues(SubRegSize size, PRegister pd, PredicatePattern pattern) { SVEPredicateMisc(0b1001, 0b10000, FEXCore::ToUnderlying(pattern), size, pd); } // SVE Integer Compare - Scalars // SVE integer compare scalar count and limit template void whilege(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar(IsXRegister << 2, 0, pd.Idx(), size, rn, rm); } template void whilegt(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar(IsXRegister << 2, 1, pd.Idx(), size, rn, rm); } template void whilelt(SubRegSize 
size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b001, 0, pd.Idx(), size, rn, rm); } template void whilele(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b001, 1, pd.Idx(), size, rn, rm); } template void whilehs(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b010, 0, pd.Idx(), size, rn, rm); } template void whilehi(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b010, 1, pd.Idx(), size, rn, rm); } template void whilelo(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b011, 0, pd.Idx(), size, rn, rm); } template void whilels(SubRegSize size, PRegister pd, T rn, T rm) { constexpr auto IsXRegister = static_cast(std::is_same_v); SVEIntCompareScalar((IsXRegister << 2) | 0b011, 1, pd.Idx(), size, rn, rm); } // SVE conditionally terminate scalars template void ctermeq(T rn, T rm) { constexpr auto size = std::is_same_v ? SubRegSize::i64Bit : SubRegSize::i32Bit; SVEIntCompareScalar(0b1000, 0, 0b0000, size, rn, rm); } template void ctermne(T rn, T rm) { constexpr auto size = std::is_same_v ? 
SubRegSize::i64Bit : SubRegSize::i32Bit; SVEIntCompareScalar(0b1000, 1, 0b0000, size, rn, rm); } // SVE pointer conflict compare void whilewr(SubRegSize size, PRegister pd, XRegister rn, XRegister rm) { SVEIntCompareScalar(0b1100, 0, pd.Idx(), size, rn, rm); } void whilerw(SubRegSize size, PRegister pd, XRegister rn, XRegister rm) { SVEIntCompareScalar(0b1100, 1, pd.Idx(), size, rn, rm); } // SVE Integer Wide Immediate - Unpredicated // SVE integer add/subtract immediate (unpredicated) void add(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b000, size, zd, zn, imm); } void sub(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b001, size, zd, zn, imm); } void subr(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b011, size, zd, zn, imm); } void sqadd(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b100, size, zd, zn, imm); } void uqadd(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b101, size, zd, zn, imm); } void sqsub(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b110, size, zd, zn, imm); } void uqsub(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { SVEAddSubImmediateUnpred(0b111, size, zd, zn, imm); } // SVE integer min/max immediate (unpredicated) void smax(SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { SVEMinMaxImmediateUnpred(0b000, size, zd, zn, imm); } void umax(SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { SVEMinMaxImmediateUnpred(0b001, size, zd, zn, imm); } void smin(SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { SVEMinMaxImmediateUnpred(0b010, size, zd, zn, imm); } void umin(SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { SVEMinMaxImmediateUnpred(0b011, size, zd, zn, imm); } // SVE integer multiply immediate (unpredicated) void mul(SubRegSize size, ZRegister 
zd, ZRegister zn, int32_t imm) { SVEMultiplyImmediateUnpred(0b000, size, zd, zn, imm); } // SVE broadcast integer immediate (unpredicated) void dup_imm(SubRegSize size, ZRegister zd, int32_t Value) { SVEBroadcastImm(0b00, Value, size, zd); } void mov_imm(SubRegSize size, ZRegister zd, int32_t Value) { dup_imm(size, zd, Value); } // SVE broadcast floating-point immediate (unpredicated) void fdup(SubRegSize size, ZRegister zd, float Value) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Unsupported fmov size"); uint32_t Imm {}; if (size == SubRegSize::i16Bit) { LOGMAN_MSG_A_FMT("Unsupported"); FEX_UNREACHABLE; } else if (size == SubRegSize::i32Bit) { Imm = FP32ToImm8(Value); } else if (size == SubRegSize::i64Bit) { Imm = FP64ToImm8(Value); } SVEBroadcastFloatImmUnpredicated(0b00, 0, Imm, size, zd); } void fmov(SubRegSize size, ZRegister zd, float Value) { fdup(size, zd, Value); } // SVE Predicate Count // SVE predicate count void cntp(SubRegSize size, XRegister rd, PRegister pg, PRegister pn) { SVEPredicateCount(0b000, size, rd, pg, pn); } // SVE Inc/Dec by Predicate Count // SVE saturating inc/dec vector by predicate count void sqincp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(0, 0, 0b00, 0b00, size, zdn, pm); } void uqincp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(0, 0, 0b00, 0b01, size, zdn, pm); } void sqdecp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(0, 0, 0b00, 0b10, size, zdn, pm); } void uqdecp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(0, 0, 0b00, 0b11, size, zdn, pm); } // SVE saturating inc/dec register by predicate count void sqincp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b10, 0b00, size, rdn, pm); } void sqincp(SubRegSize size, XRegister rdn, PRegister pm, WRegister wn) { LOGMAN_THROW_A_FMT(rdn.Idx() == wn.Idx(), "rdn and wn must be the same"); 
SVEIncDecPredicateCountScalar(0, 1, 0b00, 0b00, size, rdn, pm); } void uqincp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b10, 0b01, size, rdn, pm); } void uqincp(SubRegSize size, WRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b00, 0b01, size, rdn, pm); } void sqdecp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b10, 0b10, size, rdn, pm); } void sqdecp(SubRegSize size, XRegister rdn, PRegister pm, WRegister wn) { LOGMAN_THROW_A_FMT(rdn.Idx() == wn.Idx(), "rdn and wn must be the same"); SVEIncDecPredicateCountScalar(0, 1, 0b00, 0b10, size, rdn, pm); } void uqdecp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b10, 0b11, size, rdn, pm); } void uqdecp(SubRegSize size, WRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(0, 1, 0b00, 0b11, size, rdn, pm); } // SVE inc/dec vector by predicate count void incp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(1, 0, 0b00, 0b00, size, zdn, pm); } void decp(SubRegSize size, ZRegister zdn, PRegister pm) { SVEIncDecPredicateCountVector(1, 0, 0b00, 0b01, size, zdn, pm); } // SVE inc/dec register by predicate count void incp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(1, 1, 0b00, 0b00, size, rdn, pm); } void decp(SubRegSize size, XRegister rdn, PRegister pm) { SVEIncDecPredicateCountScalar(1, 1, 0b00, 0b01, size, rdn, pm); } // SVE Write FFR // SVE FFR write from predicate void wrffr(PRegister pn) { SVEWriteFFR(0, 0b00, 0b000, pn.Idx(), 0b00000); } // SVE FFR initialise void setffr() { SVEWriteFFR(1, 0b00, 0b000, 0b0000, 0b00000); } // SVE Integer Multiply-Add - Unpredicated void cdot(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, Rotation rot) { SVEIntegerDotProduct(0b0001, size, zda, zn, zm, rot); } // SVE integer dot product (unpredicated) void sdot(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) 
{ SVEIntegerDotProduct(0b0000, size, zda, zn, zm, Rotation::ROTATE_0); } void udot(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerDotProduct(0b0000, size, zda, zn, zm, Rotation::ROTATE_90); } // SVE2 saturating multiply-add interleaved long void sqdmlalbt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2SaturatingMulAddInterleaved(0b000010, size, zda, zn, zm); } void sqdmlslbt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2SaturatingMulAddInterleaved(0b000011, size, zda, zn, zm); } // SVE2 complex integer multiply-add void cmla(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, Rotation rot) { SVEIntegerComplexMulAdd(0b0010, size, zda, zn, zm, rot); } void sqrdcmlah(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, Rotation rot) { SVEIntegerComplexMulAdd(0b0011, size, zda, zn, zm, rot); } // SVE2 integer multiply-add long void smlalb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'000, size, zda, zn, zm); } void smlalt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'001, size, zda, zn, zm); } void umlalb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'010, size, zda, zn, zm); } void umlalt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'011, size, zda, zn, zm); } void smlslb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'100, size, zda, zn, zm); } void smlslt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'101, size, zda, zn, zm); } void umlslb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'110, size, zda, zn, zm); } void umlslt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b010'111, size, zda, zn, zm); } // SVE2 saturating multiply-add long void 
sqdmlalb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b0110'00, size, zda, zn, zm); } void sqdmlalt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b0110'01, size, zda, zn, zm); } void sqdmlslb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b0110'10, size, zda, zn, zm); } void sqdmlslt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerMulAddLong(0b0110'11, size, zda, zn, zm); } // SVE2 saturating multiply-add high void sqrdmlah(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerMultiplyAddUnpredicated(0b011'100, size, zda, zn, zm); } void sqrdmlsh(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerMultiplyAddUnpredicated(0b011'101, size, zda, zn, zm); } // SVE mixed sign dot product void usdot(ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerDotProduct(0b0111, SubRegSize::i32Bit, zda, zn, zm, Rotation::ROTATE_180); } // SVE2 Integer - Predicated // SVE2 integer pairwise add and accumulate long void sadalp(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn) { SVE2IntegerPairwiseAddAccumulateLong(0, size, zda, pg, zn); } void uadalp(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn) { SVE2IntegerPairwiseAddAccumulateLong(1, size, zda, pg, zn); } // SVE2 integer unary operations (predicated) void urecpe(ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVE2IntegerUnaryOpsPredicated(0b00000, SubRegSize::i32Bit, zd, pg, zn); } void ursqrte(ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVE2IntegerUnaryOpsPredicated(0b00001, SubRegSize::i32Bit, zd, pg, zn); } void sqabs(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVE2IntegerUnaryOpsPredicated(0b01000, size, zd, pg, zn); } void sqneg(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVE2IntegerUnaryOpsPredicated(0b01001, size, zd, pg, zn); } // SVE2 saturating/rounding 
bitwise shift left (predicated) void srshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b00010, size, zd, pg, zn, zm); } void urshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b00011, size, zd, pg, zn, zm); } void srshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b00110, size, zd, pg, zn, zm); } void urshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b00111, size, zd, pg, zn, zm); } void sqshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01000, size, zd, pg, zn, zm); } void uqshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01001, size, zd, pg, zn, zm); } void sqrshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01010, size, zd, pg, zn, zm); } void uqrshl(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01011, size, zd, pg, zn, zm); } void sqshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01100, size, zd, pg, zn, zm); } void uqshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01101, size, zd, pg, zn, zm); } void sqrshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01110, size, zd, pg, zn, zm); } void uqrshlr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2SaturatingRoundingBitwiseShiftLeft(0b01111, size, zd, pg, 
zn, zm); } // SVE2 integer halving add/subtract (predicated) void shadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b000, size, pg, zd, zn, zm); } void uhadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b001, size, pg, zd, zn, zm); } void shsub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b010, size, pg, zd, zn, zm); } void uhsub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b011, size, pg, zd, zn, zm); } void srhadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b100, size, pg, zd, zn, zm); } void urhadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b101, size, pg, zd, zn, zm); } void shsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b110, size, pg, zd, zn, zm); } void uhsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerHalvingPredicated(0b111, size, pg, zd, zn, zm); } // SVE2 integer pairwise arithmetic void addp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEIntegerPairwiseArithmetic(0b00, 1, size, pg, zd, zn, zm); } void smaxp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEIntegerPairwiseArithmetic(0b10, 0, size, pg, zd, zn, zm); } void umaxp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEIntegerPairwiseArithmetic(0b10, 1, size, pg, zd, zn, zm); } void sminp(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEIntegerPairwiseArithmetic(0b11, 0, size, pg, zd, zn, zm); } void uminp(SubRegSize size, ZRegister zd, PRegisterMerge pg, 
ZRegister zn, ZRegister zm) { SVEIntegerPairwiseArithmetic(0b11, 1, size, pg, zd, zn, zm); } // SVE2 saturating add/subtract void sqadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b000, size, zd, pg, zn, zm); } void uqadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b001, size, zd, pg, zn, zm); } void sqsub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b010, size, zd, pg, zn, zm); } void uqsub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b011, size, zd, pg, zn, zm); } void suqadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b100, size, zd, pg, zn, zm); } void usqadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b101, size, zd, pg, zn, zm); } void sqsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b110, size, zd, pg, zn, zm); } void uqsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVE2IntegerSaturatingAddSub(0b111, size, zd, pg, zn, zm); } // SVE2 Widening Integer Arithmetic // SVE2 integer add/subtract long void saddlb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b000, size, zd, zn, zm); } void saddlt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b001, size, zd, zn, zm); } void uaddlb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b010, size, zd, zn, zm); } void uaddlt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b011, size, zd, zn, zm); } void ssublb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { 
SVE2IntegerAddSubLong(0, 0b100, size, zd, zn, zm); } void ssublt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b101, size, zd, zn, zm); } void usublb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b110, size, zd, zn, zm); } void usublt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(0, 0b111, size, zd, zn, zm); } void sabdlb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(1, 0b100, size, zd, zn, zm); } void sabdlt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(1, 0b101, size, zd, zn, zm); } void uabdlb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(1, 0b110, size, zd, zn, zm); } void uabdlt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLong(1, 0b111, size, zd, zn, zm); } // SVE2 integer add/subtract wide void saddwb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b000, size, zd, zn, zm); } void saddwt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b001, size, zd, zn, zm); } void uaddwb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b010, size, zd, zn, zm); } void uaddwt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b011, size, zd, zn, zm); } void ssubwb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b100, size, zd, zn, zm); } void ssubwt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b101, size, zd, zn, zm); } void usubwb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b110, size, zd, zn, zm); } void usubwt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubWide(0b111, size, zd, zn, zm); } // SVE2 integer multiply long void 
sqdmullb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b000, size, zd, zn, zm); } void sqdmullt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b001, size, zd, zn, zm); } void pmullb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b010, size, zd, zn, zm); } void pmullt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b011, size, zd, zn, zm); } void smullb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b100, size, zd, zn, zm); } void smullt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b101, size, zd, zn, zm); } void umullb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b110, size, zd, zn, zm); } void umullt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerMultiplyLong(0b111, size, zd, zn, zm); } // // SVE Misc // SVE2 bitwise shift left long void sshllb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftLeftLong(size, 0b00, zd, zn, shift); } void sshllt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftLeftLong(size, 0b01, zd, zn, shift); } void ushllb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftLeftLong(size, 0b10, zd, zn, shift); } void ushllt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftLeftLong(size, 0b11, zd, zn, shift); } // SVE2 integer add/subtract interleaved long void saddlbt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b00, zd, zn, zm); } void ssublbt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b10, zd, zn, zm); } void ssubltb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b11, 
zd, zn, zm); } // SVE2 bitwise exclusive-or interleaved void eorbt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2BitwiseXorInterleaved(size, 0b0, zd, zn, zm); } void eortb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2BitwiseXorInterleaved(size, 0b1, zd, zn, zm); } // SVE integer matrix multiply accumulate void smmla(ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerMatrixMulAccumulate(0b00, zda, zn, zm); } void usmmla(ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerMatrixMulAccumulate(0b10, zda, zn, zm); } void ummla(ZRegister zda, ZRegister zn, ZRegister zm) { SVEIntegerMatrixMulAccumulate(0b11, zda, zn, zm); } // SVE2 bitwise permute void bext(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2BitwisePermute(size, 0b00, zd, zn, zm); } void bdep(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2BitwisePermute(size, 0b01, zd, zn, zm); } void bgrp(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2BitwisePermute(size, 0b10, zd, zn, zm); } // SVE2 Accumulate // SVE2 complex integer add void cadd(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, Rotation rot) { SVE2ComplexIntAdd(size, 0b0, rot, zd, zn, zm); } void sqcadd(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, Rotation rot) { SVE2ComplexIntAdd(size, 0b1, rot, zd, zn, zm); } // SVE2 integer absolute difference and accumulate long void sabalb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b10000, zda, zn, zm); } void sabalt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b10001, zda, zn, zm); } void uabalb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b10010, zda, zn, zm); } void uabalt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubInterleavedLong(size, 0b10011, zda, zn, zm); } // SVE2 integer add/subtract 
long with carry void adclb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLongWithCarry(size, 0, 0, zda, zn, zm); } void adclt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLongWithCarry(size, 0, 1, zda, zn, zm); } void sbclb(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLongWithCarry(size, 1, 0, zda, zn, zm); } void sbclt(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubLongWithCarry(size, 1, 1, zda, zn, zm); } // SVE2 bitwise shift right and accumulate void ssra(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftRightAndAccumulate(size, 0b00, zda, zn, shift); } void usra(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftRightAndAccumulate(size, 0b01, zda, zn, shift); } void srsra(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftRightAndAccumulate(size, 0b10, zda, zn, shift); } void ursra(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftRightAndAccumulate(size, 0b11, zda, zn, shift); } // SVE2 bitwise shift and insert void sri(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftAndInsert(size, 0b0, zda, zn, shift); } void sli(SubRegSize size, ZRegister zda, ZRegister zn, uint32_t shift) { SVE2BitwiseShiftAndInsert(size, 0b1, zda, zn, shift); } // SVE2 integer absolute difference and accumulate void saba(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAbsDiffAndAccumulate(size, 0b0, zda, zn, zm); } void uaba(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) { SVE2IntegerAbsDiffAndAccumulate(size, 0b1, zda, zn, zm); } // SVE2 Narrowing // SVE2 saturating extract narrow void sqxtnb(SubRegSize size, ZRegister zd, ZRegister zn) { SVE2SaturatingExtractNarrow(size, 0b00, 0, zn, zd); } void sqxtnt(SubRegSize size, ZRegister zd, ZRegister zn) { 
SVE2SaturatingExtractNarrow(size, 0b00, 1, zn, zd); } void uqxtnb(SubRegSize size, ZRegister zd, ZRegister zn) { SVE2SaturatingExtractNarrow(size, 0b01, 0, zn, zd); } void uqxtnt(SubRegSize size, ZRegister zd, ZRegister zn) { SVE2SaturatingExtractNarrow(size, 0b01, 1, zn, zd); } void sqxtunb(SubRegSize size, ZRegister zd, ZRegister zn) { SVE2SaturatingExtractNarrow(size, 0b10, 0, zn, zd); } void sqxtunt(SubRegSize size, ZRegister zd, ZRegister zn) { SVE2SaturatingExtractNarrow(size, 0b10, 1, zn, zd); } // SVE2 bitwise shift right narrow void sqshrunb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 0, 0, 0, zn, zd); } void sqshrunt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 0, 0, 1, zn, zd); } void sqrshrunb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 0, 1, 0, zn, zd); } void sqrshrunt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 0, 1, 1, zn, zd); } void shrnb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 1, 0, 0, zn, zd); } void shrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 1, 0, 1, zn, zd); } void rshrnb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 1, 1, 0, zn, zd); } void rshrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 0, 1, 1, 1, zn, zd); } void sqshrnb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 0, 0, 0, zn, zd); } void sqshrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 0, 0, 1, zn, zd); } void sqrshrnb(SubRegSize size, ZRegister zd, ZRegister 
zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 0, 1, 0, zn, zd); } void sqrshrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 0, 1, 1, zn, zd); } void uqshrnb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 1, 0, 0, zn, zd); } void uqshrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 1, 0, 1, zn, zd); } void uqrshrnb(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 1, 1, 0, zn, zd); } void uqrshrnt(SubRegSize size, ZRegister zd, ZRegister zn, uint32_t Shift) { SVE2BitwiseShiftRightNarrow(size, Shift, 1, 1, 1, 1, zn, zd); } // SVE2 integer add/subtract narrow high part void addhnb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b000, zd, zn, zm); } void addhnt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b001, zd, zn, zm); } void raddhnb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b010, zd, zn, zm); } void raddhnt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b011, zd, zn, zm); } void subhnb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b100, zd, zn, zm); } void subhnt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b101, zd, zn, zm); } void rsubhnb(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b110, zd, zn, zm); } void rsubhnt(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) { SVE2IntegerAddSubNarrowHighPart(size, 0b111, zd, zn, zm); } // SVE2 Crypto Extensions // SVE2 crypto unary operations void aesimc(ZRegister zdn, ZRegister 
zn) { SVE2CryptoUnaryOperation(1, zdn, zn); } void aesmc(ZRegister zdn, ZRegister zn) { SVE2CryptoUnaryOperation(0, zdn, zn); } // SVE2 crypto destructive binary operations void aese(ZRegister zdn, ZRegister zn, ZRegister zm) { SVE2CryptoDestructiveBinaryOperation(0, 0, zdn, zn, zm); } void aesd(ZRegister zdn, ZRegister zn, ZRegister zm) { SVE2CryptoDestructiveBinaryOperation(0, 1, zdn, zn, zm); } void sm4e(ZRegister zdn, ZRegister zn, ZRegister zm) { SVE2CryptoDestructiveBinaryOperation(1, 0, zdn, zn, zm); } // SVE2 crypto constructive binary operations void sm4ekey(ZRegister zd, ZRegister zn, ZRegister zm) { SVE2CryptoConstructiveBinaryOperation(0, zd, zn, zm); } void rax1(ZRegister zd, ZRegister zn, ZRegister zm) { SVE2CryptoConstructiveBinaryOperation(1, zd, zn, zm); } // SVE Floating Point Widening Multiply-Add - Indexed // SVE BFloat16 floating-point dot product (indexed) // XXX: // SVE floating-point multiply-add long (indexed) void fmlalb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(0, 0, 0, dstsize, zda, zn, zm, index); } void fmlalt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(0, 0, 1, dstsize, zda, zn, zm, index); } void fmlslb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(0, 1, 0, dstsize, zda, zn, zm, index); } void fmlslt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(0, 1, 1, dstsize, zda, zn, zm, index); } void bfmlalb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(1, 0, 0, dstsize, zda, zn, zm, index); } void bfmlalt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(1, 0, 1, dstsize, zda, zn, zm, index); } void bfmlslb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, 
uint32_t index) { SVEFPMultiplyAddLongIndexed(1, 1, 0, dstsize, zda, zn, zm, index); } void bfmlslt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { SVEFPMultiplyAddLongIndexed(1, 1, 1, dstsize, zda, zn, zm, index); } // SVE Floating Point Widening Multiply-Add // SVE BFloat16 floating-point dot product // XXX: // SVE floating-point multiply-add long void fmlalb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(0, 0, 0, dstsize, zda, zn, zm); } void fmlalt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(0, 0, 1, dstsize, zda, zn, zm); } void fmlslb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(0, 1, 0, dstsize, zda, zn, zm); } void fmlslt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(0, 1, 1, dstsize, zda, zn, zm); } void bfmlalb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(1, 0, 0, dstsize, zda, zn, zm); } void bfmlalt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(1, 0, 1, dstsize, zda, zn, zm); } void bfmlslb(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(1, 1, 0, dstsize, zda, zn, zm); } void bfmlslt(SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) { SVEFPMultiplyAddLong(1, 1, 1, dstsize, zda, zn, zm); } // SVE Floating Point Arithmetic - Predicated void ftmad(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, uint32_t imm) { LOGMAN_THROW_A_FMT(imm <= 7, "ftmad immediate must be within 0-7"); SVEFloatArithmeticPredicated(0b10000 | imm, size, PReg::p0, zd, zn, zm); } // SVE floating-point arithmetic (predicated) void fadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0000, size, pg, zd, zn, zm); } void fsub(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, 
ZRegister zm) { SVEFloatArithmeticPredicated(0b0001, size, pg, zd, zn, zm); } void fmul(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0010, size, pg, zd, zn, zm); } void fsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0011, size, pg, zd, zn, zm); } void fmaxnm(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0100, size, pg, zd, zn, zm); } void fminnm(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0101, size, pg, zd, zn, zm); } void fmax(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0110, size, pg, zd, zn, zm); } void fmin(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b0111, size, pg, zd, zn, zm); } void fabd(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b1000, size, pg, zd, zn, zm); } void fscale(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b1001, size, pg, zd, zn, zm); } void fmulx(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b1010, size, pg, zd, zn, zm); } void fdivr(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b1100, size, pg, zd, zn, zm); } void fdiv(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { SVEFloatArithmeticPredicated(0b1101, size, pg, zd, zn, zm); } // SVE floating-point arithmetic with immediate (predicated) void fadd(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFAddSubImm imm) { SVEFPArithWithImmediate(0b000, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fsub(SubRegSize 
size, ZRegister zd, PRegisterMerge pg, SVEFAddSubImm imm) { SVEFPArithWithImmediate(0b001, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fmul(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFMulImm imm) { SVEFPArithWithImmediate(0b010, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fsubr(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFAddSubImm imm) { SVEFPArithWithImmediate(0b011, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fmaxnm(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFMaxMinImm imm) { SVEFPArithWithImmediate(0b100, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fminnm(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFMaxMinImm imm) { SVEFPArithWithImmediate(0b101, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fmax(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFMaxMinImm imm) { SVEFPArithWithImmediate(0b110, size, zd, pg, FEXCore::ToUnderlying(imm)); } void fmin(SubRegSize size, ZRegister zd, PRegisterMerge pg, SVEFMaxMinImm imm) { SVEFPArithWithImmediate(0b111, size, zd, pg, FEXCore::ToUnderlying(imm)); } // SVE Floating Point Unary Operations - Predicated // SVE floating-point round to integral value void frinti(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b111, size, zd, pg, zn); } void frintx(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b110, size, zd, pg, zn); } void frinta(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b100, size, zd, pg, zn); } void frintn(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b000, size, zd, pg, zn); } void frintz(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b011, size, zd, pg, zn); } void frintm(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b010, size, zd, pg, zn); } void frintp(SubRegSize size, ZRegister zd, 
PRegisterMerge pg, ZRegister zn) { SVEFloatRoundIntegral(0b001, size, zd, pg, zn); }

// SVE floating-point convert precision
// fcvt hands the destination (`to`) and source (`from`) element sizes straight to SVEFPConvertPrecision.
void fcvt(SubRegSize to, SubRegSize from, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
  SVEFPConvertPrecision(to, from, zd, pg, zn);
}
// fcvtx takes no size arguments: it ORs the register indices into a fixed instruction word
// (pg shifted to bit 10, zn to bit 5, zd at bit 0) and emits it with dc32.
void fcvtx(ZRegister zd, PRegisterMerge pg, ZRegister zn) {
  // Only p0-p7 are accepted as the governing predicate.
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
  uint32_t Instr = 0b0110'0101'0000'1010'1010'0000'0000'0000;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE floating-point unary operations
// Both wrappers call SVEFloatUnary and differ only in the leading opcode value (0b00 / 0b01).
// Note the helper receives its operands as (pg, zn, zd), not in the wrapper's parameter order.
void frecpx(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
  SVEFloatUnary(0b00, size, pg, zn, zd);
}
void fsqrt(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
  SVEFloatUnary(0b01, size, pg, zn, zd);
}

// SVE integer convert to floating-point
// scvtf/ucvtf pick the (opc1, opc2) pair from the source and destination element sizes and
// forward it to SVEIntegerConvertToFloat. The two functions are identical apart from the
// flag passed after opc2 (0 for scvtf, 1 for ucvtf; presumably the unsigned selector - confirm
// against SVEIntegerConvertToFloat).
// A srcsize other than 16/32/64-bit hits FEX_UNREACHABLE. In the 32/64-bit source branches a
// dstsize that is not 16/32/64-bit is not asserted on; both fields simply become 0b00.
void scvtf(ZRegister zd, SubRegSize dstsize, PRegisterMerge pg, ZRegister zn, SubRegSize srcsize) {
  uint32_t opc1, opc2;
  if (srcsize == SubRegSize::i16Bit) {
    // 16-bit source elements: only a 16-bit destination is accepted, with a fixed opc pair.
    LOGMAN_THROW_A_FMT(dstsize == SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    opc1 = 0b01;
    opc2 = 0b01;
  } else if (srcsize == SubRegSize::i32Bit) {
    // 32-bit source elements: both opc1 and opc2 are selected by the destination size.
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b00 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b10 : 0b00;
  } else if (srcsize == SubRegSize::i64Bit) {
    // 64-bit source elements: both opc1 and opc2 are selected by the destination size.
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b11 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b11 : 0b00;
  } else {
    FEX_UNREACHABLE;
  }
  SVEIntegerConvertToFloat(dstsize, srcsize, opc1, opc2, 0, pg, zn, zd);
}
// Same selection table as scvtf; only the flag after opc2 differs (1 here).
void ucvtf(ZRegister zd, SubRegSize dstsize, PRegisterMerge pg, ZRegister zn, SubRegSize srcsize) {
  uint32_t opc1, opc2;
  if (srcsize == SubRegSize::i16Bit) {
    // 16-bit source elements: only a 16-bit destination is accepted, with a fixed opc pair.
    LOGMAN_THROW_A_FMT(dstsize == SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    opc1 = 0b01;
    opc2 = 0b01;
  } else if (srcsize == SubRegSize::i32Bit) {
    // 32-bit source elements: both opc1 and opc2 are selected by the destination size.
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b00 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b10 : 0b00;
  } else if (srcsize == SubRegSize::i64Bit) {
    // 64-bit source elements: both opc1 and opc2 are selected by the destination size.
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b11 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b11 : 0b00;
  } else {
    FEX_UNREACHABLE;
  }
  SVEIntegerConvertToFloat(dstsize, srcsize, opc1, opc2, 1, pg, zn, zd);
}

// SVE floating-point convert to integer
// flogb uses one size for both size arguments of SVEFloatConvertToInt and passes 0b00 as opc1;
// opc2 is the size mapped as 16-bit -> 0b01, 32-bit -> 0b10, 64-bit -> 0b11 (anything else -> 0b00).
void flogb(SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
  const auto ConvertedSize = size == SubRegSize::i64Bit ? 0b11 : size == SubRegSize::i32Bit ? 0b10 : size == SubRegSize::i16Bit ? 0b01 : 0b00;
  SVEFloatConvertToInt(size, size, 1, 0b00, ConvertedSize, 0, pg, zn, zd);
}
// fcvtzs/fcvtzu choose (opc1, opc2) from the source (floating-point) and destination (integer)
// element sizes. They are identical apart from the flag passed after opc2 (0 for fcvtzs,
// 1 for fcvtzu). A 16-bit destination is rejected for 32-bit and 64-bit sources, and a
// srcsize other than 16/32/64-bit hits FEX_UNREACHABLE.
void fcvtzs(ZRegister zd, SubRegSize dstsize, PRegisterMerge pg, ZRegister zn, SubRegSize srcsize) {
  uint32_t opc1, opc2;
  if (srcsize == SubRegSize::i16Bit) {
    // Srcsize = fp16, opc2 encodes dst size
    opc1 = 0b01;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
  } else if (srcsize == SubRegSize::i32Bit) {
    // Srcsize = fp32, opc1 encodes dst size
    LOGMAN_THROW_A_FMT(dstsize != SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : 0b10;
    opc2 = 0b10;
  } else if (srcsize == SubRegSize::i64Bit) {
    LOGMAN_THROW_A_FMT(dstsize != SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    // SrcSize = fp64, opc2 encodes dst size
    opc1 = 0b11;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : 0b00;
  } else {
    FEX_UNREACHABLE;
  }
  SVEFloatConvertToInt(dstsize, srcsize, 1, opc1, opc2, 0, pg, zn, zd);
}
// Same selection table as fcvtzs; only the flag after opc2 differs (1 here).
void fcvtzu(ZRegister zd, SubRegSize dstsize, PRegisterMerge pg, ZRegister zn, SubRegSize srcsize) {
  uint32_t opc1, opc2;
  if (srcsize == SubRegSize::i16Bit) {
    // Srcsize = fp16, opc2 encodes dst size
    opc1 = 0b01;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : dstsize == SubRegSize::i32Bit ? 0b10 : dstsize == SubRegSize::i16Bit ? 0b01 : 0b00;
  } else if (srcsize == SubRegSize::i32Bit) {
    // Srcsize = fp32, opc1 encodes dst size
    LOGMAN_THROW_A_FMT(dstsize != SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    opc1 = dstsize == SubRegSize::i64Bit ? 0b11 : 0b10;
    opc2 = 0b10;
  } else if (srcsize == SubRegSize::i64Bit) {
    LOGMAN_THROW_A_FMT(dstsize != SubRegSize::i16Bit, "Unsupported size in {}", __func__);
    // SrcSize = fp64, opc2 encodes dst size
    opc1 = 0b11;
    opc2 = dstsize == SubRegSize::i64Bit ? 0b11 : 0b00;
  } else {
    FEX_UNREACHABLE;
  }
  SVEFloatConvertToInt(dstsize, srcsize, 1, opc1, opc2, 1, pg, zn, zd);
}

// SVE Floating Point Unary Operations - Unpredicated
// SVE floating-point reciprocal estimate (unpredicated)
// No predicate operand; the two wrappers differ only in the opcode (0b110 / 0b111).
void frecpe(SubRegSize size, ZRegister zd, ZRegister zn) {
  SVEFPUnaryOpsUnpredicated(0b110, size, zd, zn);
}
void frsqrte(SubRegSize size, ZRegister zd, ZRegister zn) {
  SVEFPUnaryOpsUnpredicated(0b111, size, zd, zn);
}

// SVE Floating Point Compare - with Zero
// SVE floating-point compare with zero
// Each comparison is selected by the first two arguments to SVEFPCompareWithZero:
//   fcmge (0b00, 0)  fcmgt (0b00, 1)  fcmlt (0b01, 0)
//   fcmle (0b01, 1)  fcmeq (0b10, 0)  fcmne (0b11, 0)
// There is no second vector operand; the result goes to predicate pd.
void fcmge(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b00, 0, size, pd, pg, zn);
}
void fcmgt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b00, 1, size, pd, pg, zn);
}
void fcmlt(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b01, 0, size, pd, pg, zn);
}
void fcmle(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b01, 1, size, pd, pg, zn);
}
void fcmeq(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b10, 0, size, pd, pg, zn);
}
void fcmne(SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn) {
  SVEFPCompareWithZero(0b11, 0, size, pd, pg, zn);
}

// SVE Floating Point Accumulating Reduction
// SVE floating-point serial reduction (predicated)
// Destination and first source are V registers (vd, vn); the vector operand is zm.
void fadda(SubRegSize size, VRegister vd, PRegister pg, VRegister vn, ZRegister zm) {
  SVEFPSerialReductionPredicated(0b00, size, vd, pg, vn, zm);
}

// SVE Floating Point Multiply-Add
// All eight wrappers share SVEFPMultiplyAdd and are distinguished only by the 3-bit opcode.
// SVE floating-point multiply-accumulate writing addend
// Opcodes 0b000-0b011; operands are passed as (zda, pg, zn, zm).
void fmla(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
  SVEFPMultiplyAdd(0b000, size, zda, pg, zn, zm);
}
void fmls(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
  SVEFPMultiplyAdd(0b001, size, zda, pg, zn, zm);
}
void fnmla(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
  SVEFPMultiplyAdd(0b010, size, zda, pg, zn, zm);
}
void fnmls(SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
  SVEFPMultiplyAdd(0b011, size, zda, pg, zn, zm);
}

// SVE floating-point multiply-accumulate writing multiplicand
// Opcodes 0b100-0b111; operands are passed as (zdn, pg, zm, za).
void fmad(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
  SVEFPMultiplyAdd(0b100, size, zdn, pg, zm, za);
}
void fmsb(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
  SVEFPMultiplyAdd(0b101, size, zdn, pg, zm, za);
}
void fnmad(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
  SVEFPMultiplyAdd(0b110, size, zdn, pg, zm, za);
}
void fnmsb(SubRegSize size, ZRegister zdn, PRegisterMerge pg, ZRegister zm, ZRegister za) {
  SVEFPMultiplyAdd(0b111, size, zdn, pg, zm, za);
}

// SVE Memory - 32-bit Gather and Unsized Contiguous
// Unsized register loads. Both overloads share SVEUnsizedLoadStoreContiguous: the first
// argument is 0b0 for the predicate form and 0b1 for the vector form, and the trailing
// `false` is passed by both (presumably the is-store flag - confirm against the helper).
// The predicate overload wraps the predicate index in a ZRegister so the same helper
// signature can be used.
void ldr(PRegister pt, XRegister rn, int32_t imm = 0) {
  SVEUnsizedLoadStoreContiguous(0b0, imm, ZRegister {pt.Idx()}, rn, false);
}
void ldr(ZRegister zt, XRegister rn, int32_t imm = 0) {
  SVEUnsizedLoadStoreContiguous(0b1, imm, zt, rn, false);
}

// SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
// XXX:
// SVE contiguous prefetch (scalar plus immediate)
// XXX:
// SVE2 32-bit gather non-temporal load (vector plus scalar)
// XXX:
// SVE contiguous prefetch (scalar plus scalar)
// XXX:
// SVE 32-bit gather prefetch (vector plus immediate)
// XXX:

// SVE load and broadcast element
// Arguments to SVELoadAndBroadcastElement: a bool that is true only for the "s"-named
// variants (presumably "sign-extend" - confirm), the caller's element size, and the
// size named by the mnemonic suffix (8-bit for b, 16-bit for h).
void ld1rb(SubRegSize esize, ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) {
  SVELoadAndBroadcastElement(false, esize, SubRegSize::i8Bit, zt, pg, rn, imm);
}
void ld1rsb(SubRegSize esize, ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) {
  SVELoadAndBroadcastElement(true, esize, SubRegSize::i8Bit, zt, pg, rn, imm);
}
void ld1rh(SubRegSize esize, ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) {
  SVELoadAndBroadcastElement(false, esize, SubRegSize::i16Bit, zt, pg, rn, imm);
}
void ld1rsh(SubRegSize esize, ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) { SVELoadAndBroadcastElement(true, esize, SubRegSize::i16Bit, zt, pg, rn, imm); } void ld1rw(SubRegSize esize, ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) { SVELoadAndBroadcastElement(false, esize, SubRegSize::i32Bit, zt, pg, rn, imm); } void ld1rsw(ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) { SVELoadAndBroadcastElement(true, SubRegSize::i64Bit, SubRegSize::i32Bit, zt, pg, rn, imm); } void ld1rd(ZRegister zt, PRegisterZero pg, Register rn, uint32_t imm = 0) { SVELoadAndBroadcastElement(false, SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, rn, imm); } // SVE contiguous non-temporal load (scalar plus immediate) void ldnt1b(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousNontemporalLoad(0b00, zt, pg, rn, Imm); } void ldnt1h(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousNontemporalLoad(0b01, zt, pg, rn, Imm); } void ldnt1w(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousNontemporalLoad(0b10, zt, pg, rn, Imm); } void ldnt1d(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousNontemporalLoad(0b11, zt, pg, rn, Imm); } // SVE contiguous non-temporal load (scalar plus scalar) // XXX: // SVE load multiple structures (scalar plus immediate) void ld2b(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, false, 0b00, Imm, zt1, pg, rn); } void ld3b(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, false, 0b00, Imm, zt1, pg, rn); } void ld4b(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, int32_t 
Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, false, 0b00, Imm, zt1, pg, rn); } void ld2h(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, false, 0b01, Imm, zt1, pg, rn); } void ld3h(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, false, 0b01, Imm, zt1, pg, rn); } void ld4h(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, false, 0b01, Imm, zt1, pg, rn); } void ld2w(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, false, 0b10, Imm, zt1, pg, rn); } void ld3w(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, false, 0b10, Imm, zt1, pg, rn); } void ld4w(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, false, 0b10, Imm, zt1, pg, rn); } void ld2d(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, false, 0b11, 
Imm, zt1, pg, rn); } void ld3d(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, false, 0b11, Imm, zt1, pg, rn); } void ld4d(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, false, 0b11, Imm, zt1, pg, rn); } // SVE helper implementations template void ld1b(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1b(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1b(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i8Bit, zt, pg, Src, true, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i8Bit, zt, pg, Src, true, false); } else { FEX_UNREACHABLE; } } template void ldff1b(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1b(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1b doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i8Bit, zt, pg, Src, true, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i8Bit, zt, pg, Src, true, true); } else { FEX_UNREACHABLE; } } void ld1sw(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1sw(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1sw(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(SubRegSize::i64Bit, 
SubRegSize::i32Bit, zt, pg, Src, false, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(SubRegSize::i64Bit, SubRegSize::i32Bit, zt, pg, Src, false, false); } else { FEX_UNREACHABLE; } } template void ld1h(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1h(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1h(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i16Bit, zt, pg, Src, true, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i16Bit, zt, pg, Src, true, false); } else { FEX_UNREACHABLE; } } template void ld1sh(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1sh(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1sh(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i16Bit, zt, pg, Src, false, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i16Bit, zt, pg, Src, false, false); } else { FEX_UNREACHABLE; } } template void ldff1h(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1h(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1h doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i16Bit, zt, pg, Src, true, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i16Bit, zt, pg, Src, true, true); } else { FEX_UNREACHABLE; } } template void ldff1sh(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1sh(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { 
LOGMAN_THROW_A_FMT(false, "ldff1sh doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i16Bit, zt, pg, Src, false, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i16Bit, zt, pg, Src, false, true); } else { FEX_UNREACHABLE; } } template void ld1w(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1w(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1w(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i32Bit, zt, pg, Src, true, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i32Bit, zt, pg, Src, true, false); } else { FEX_UNREACHABLE; } } template void ldff1w(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1w(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1w doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i32Bit, zt, pg, Src, true, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i32Bit, zt, pg, Src, true, true); } else { FEX_UNREACHABLE; } } void ldff1sw(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1sw(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1sw doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(SubRegSize::i64Bit, SubRegSize::i32Bit, zt, pg, Src, false, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(SubRegSize::i64Bit, SubRegSize::i32Bit, zt, pg, Src, false, true); } else { FEX_UNREACHABLE; } } template 
void ld1sb(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1sb(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1sb(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i8Bit, zt, pg, Src, false, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i8Bit, zt, pg, Src, false, false); } else { FEX_UNREACHABLE; } } template void ldff1sb(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1sb(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1sb doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(size, SubRegSize::i8Bit, zt, pg, Src, false, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(size, SubRegSize::i8Bit, zt, pg, Src, false, true); } else { FEX_UNREACHABLE; } } void ld1d(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ld1d(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { ld1d(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, Src, true, false); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, Src, true, false); } else { FEX_UNREACHABLE; } } void ldff1d(ZRegister zt, PRegisterZero pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { ldff1d(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { LOGMAN_THROW_A_FMT(false, "ldff1d doesn't have a scalar plus immediate variant"); } else if (Src.IsScalarPlusVector()) { SVEGatherLoadScalarPlusVector(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, Src, 
true, true); } else if (Src.IsVectorPlusImm()) { SVEGatherLoadVectorPlusImm(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, Src, true, true); } else { FEX_UNREACHABLE; } } template void st1b(ZRegister zt, PRegister pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { st1b(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { st1b(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEScatterStoreScalarPlusVector(size, SubRegSize::i8Bit, zt, pg, Src); } else if (Src.IsVectorPlusImm()) { SVEScatterStoreVectorPlusImm(size, SubRegSize::i8Bit, zt, pg, Src); } else { FEX_UNREACHABLE; } } template void st1h(ZRegister zt, PRegister pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { st1h(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { st1h(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEScatterStoreScalarPlusVector(size, SubRegSize::i16Bit, zt, pg, Src); } else if (Src.IsVectorPlusImm()) { SVEScatterStoreVectorPlusImm(size, SubRegSize::i16Bit, zt, pg, Src); } else { FEX_UNREACHABLE; } } template void st1w(ZRegister zt, PRegister pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { st1w(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { st1w(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEScatterStoreScalarPlusVector(size, SubRegSize::i32Bit, zt, pg, Src); } else if (Src.IsVectorPlusImm()) { SVEScatterStoreVectorPlusImm(size, SubRegSize::i32Bit, zt, pg, Src); } else { FEX_UNREACHABLE; } } void st1d(ZRegister zt, PRegister pg, SVEMemOperand Src) { if (Src.IsScalarPlusScalar()) { st1d(zt, pg, Src.rn, Src.MetaType.ScalarScalarType.rm); } else if (Src.IsScalarPlusImm()) { st1d(zt, pg, Src.rn, Src.MetaType.ScalarImmType.Imm); } else if (Src.IsScalarPlusVector()) { SVEScatterStoreScalarPlusVector(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, 
pg, Src); } else if (Src.IsVectorPlusImm()) { SVEScatterStoreVectorPlusImm(SubRegSize::i64Bit, SubRegSize::i64Bit, zt, pg, Src); } else { FEX_UNREACHABLE; } } // SVE load multiple structures (scalar plus scalar) void ld2b(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b01, zt1, pg, rn, rm); } void ld3b(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b10, zt1, pg, rn, rm); } void ld4b(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b11, zt1, pg, rn, rm); } void ld2h(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b01, zt1, pg, rn, rm); } void ld3h(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b10, zt1, pg, rn, rm); } void ld4h(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b11, zt1, pg, rn, rm); } void ld2w(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { 
LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b01, zt1, pg, rn, rm); } void ld3w(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b10, zt1, pg, rn, rm); } void ld4w(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b11, zt1, pg, rn, rm); } void ld2d(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b01, zt1, pg, rn, rm); } void ld3d(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b10, zt1, pg, rn, rm); } void ld4d(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b11, zt1, pg, rn, rm); } // SVE load and broadcast quadword (scalar plus immediate) void ld1rqb(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b00, 0b00, zt, pg, rn, imm); } void ld1rob(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b00, 0b01, zt, pg, rn, imm); } void ld1rqh(ZRegister zt, PRegisterZero pg, Register 
rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b01, 0b00, zt, pg, rn, imm); } void ld1roh(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b01, 0b01, zt, pg, rn, imm); } void ld1rqw(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b10, 0b00, zt, pg, rn, imm); } void ld1row(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b10, 0b01, zt, pg, rn, imm); } void ld1rqd(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b11, 0b00, zt, pg, rn, imm); } void ld1rod(ZRegister zt, PRegisterZero pg, Register rn, int imm = 0) { SVELoadBroadcastQuadScalarPlusImm(0b11, 0b01, zt, pg, rn, imm); } // SVE contiguous load (scalar plus immediate) template void ld1b(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { SVEContiguousLoadImm(false, 0b0000 | FEXCore::ToUnderlying(size), Imm, pg, rn, zt); } void ld1sw(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { SVEContiguousLoadImm(false, 0b0100, Imm, pg, rn, zt); } template void ld1h(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { static_assert(size != SubRegSize::i8Bit, "Invalid size"); SVEContiguousLoadImm(false, 0b0100 | FEXCore::ToUnderlying(size), Imm, pg, rn, zt); } template void ld1sh(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 1 : size == SubRegSize::i64Bit ? 0 : -1; SVEContiguousLoadImm(false, 0b1000 | ConvertedSize, Imm, pg, rn, zt); } template void ld1w(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 0 : size == SubRegSize::i64Bit ? 
1 : -1; SVEContiguousLoadImm(false, 0b1010 | ConvertedSize, Imm, pg, rn, zt); } template void ld1sb(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { static_assert(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i16Bit ? 0b10 : size == SubRegSize::i32Bit ? 0b01 : size == SubRegSize::i64Bit ? 0b00 : -1; SVEContiguousLoadImm(false, 0b1100 | ConvertedSize, Imm, pg, rn, zt); } void ld1d(ZRegister zt, PRegisterZero pg, Register rn, int32_t Imm = 0) { SVEContiguousLoadImm(false, 0b1111, Imm, pg, rn, zt); } // SVE contiguous non-fault load (scalar plus immediate) // XXX: // SVE load and broadcast quadword (scalar plus scalar) void ld1rqb(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b00, 0b00, zt, pg, rn, rm); } void ld1rob(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b00, 0b01, zt, pg, rn, rm); } void ld1rqh(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b01, 0b00, zt, pg, rn, rm); } void ld1roh(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b01, 0b01, zt, pg, rn, rm); } void ld1rqw(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b10, 0b00, zt, pg, rn, rm); } void ld1row(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b10, 0b01, zt, pg, rn, rm); } void ld1rqd(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b11, 0b00, zt, pg, rn, rm); } void ld1rod(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVELoadBroadcastQuadScalarPlusScalar(0b11, 0b01, zt, pg, rn, rm); } // SVE contiguous load (scalar plus scalar) template void ld1b(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { 
SVEContiguousLoadStore(0, 0, 0b0000 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } void ld1sw(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVEContiguousLoadStore(0, 0, 0b0100, rm, pg, rn, zt); } template void ld1h(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size != SubRegSize::i8Bit, "Invalid size"); SVEContiguousLoadStore(0, 0, 0b0100 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } template void ld1sh(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 1 : size == SubRegSize::i64Bit ? 0 : -1; SVEContiguousLoadStore(0, 0, 0b1000 | ConvertedSize, rm, pg, rn, zt); } template void ld1w(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 0 : size == SubRegSize::i64Bit ? 1 : -1; SVEContiguousLoadStore(0, 0, 0b1010 | ConvertedSize, rm, pg, rn, zt); } template void ld1sb(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i16Bit ? 0b10 : size == SubRegSize::i32Bit ? 0b01 : size == SubRegSize::i64Bit ? 
0b00 : -1; SVEContiguousLoadStore(0, 0, 0b1100 | ConvertedSize, rm, pg, rn, zt); } void ld1d(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVEContiguousLoadStore(0, 0, 0b1111, rm, pg, rn, zt); } // SVE contiguous first-fault load (scalar plus scalar) template void ldff1b(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVEContiguousLoadStore(0, 1, 0b0000 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } template void ldff1sb(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i16Bit ? 0b10 : size == SubRegSize::i32Bit ? 0b01 : size == SubRegSize::i64Bit ? 0b00 : -1; SVEContiguousLoadStore(0, 1, 0b1100 | ConvertedSize, rm, pg, rn, zt); } template void ldff1h(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size != SubRegSize::i8Bit, "Invalid size"); SVEContiguousLoadStore(0, 1, 0b0100 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } template void ldff1sh(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 1 : size == SubRegSize::i64Bit ? 0 : -1; SVEContiguousLoadStore(0, 1, 0b1000 | ConvertedSize, rm, pg, rn, zt); } template void ldff1w(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 0 : size == SubRegSize::i64Bit ? 
1 : -1; SVEContiguousLoadStore(0, 1, 0b1010 | ConvertedSize, rm, pg, rn, zt); } void ldff1sw(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVEContiguousLoadStore(0, 1, 0b0100, rm, pg, rn, zt); } void ldff1d(ZRegister zt, PRegisterZero pg, Register rn, Register rm) { SVEContiguousLoadStore(0, 1, 0b1111, rm, pg, rn, zt); } // SVE Memory - 64-bit Gather // SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets) // XXX: // SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets) // XXX: // SVE 64-bit gather prefetch (vector plus immediate) // XXX: // SVE2 64-bit gather non-temporal load (vector plus scalar) // XXX: // SVE Memory - Contiguous Store and Unsized Contiguous void str(PRegister pt, XRegister rn, int32_t imm = 0) { SVEUnsizedLoadStoreContiguous(0b0, imm, ZRegister {pt.Idx()}, rn, true); } void str(ZRegister zt, XRegister rn, int32_t imm = 0) { SVEUnsizedLoadStoreContiguous(0b1, imm, zt, rn, true); } // SVE contiguous store (scalar plus scalar) template void st1b(ZRegister zt, PRegister pg, Register rn, Register rm) { SVEContiguousLoadStore(1, 0, 0b0000 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } template void st1h(ZRegister zt, PRegister pg, Register rn, Register rm) { static_assert(size != SubRegSize::i8Bit, "Invalid size"); SVEContiguousLoadStore(1, 0, 0b0100 | FEXCore::ToUnderlying(size), rm, pg, rn, zt); } template void st1w(ZRegister zt, PRegister pg, Register rn, Register rm) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 0 : size == SubRegSize::i64Bit ? 
1 : -1;
  SVEContiguousLoadStore(1, 0, 0b1010 | ConvertedSize, rm, pg, rn, zt);
}

// st1d (scalar plus scalar): forwards to SVEContiguousLoadStore(1, 0, ...) with the fixed selector 0b1111.
void st1d(ZRegister zt, PRegister pg, Register rn, Register rm) {
  SVEContiguousLoadStore(1, 0, 0b1111, rm, pg, rn, zt);
}

// SVE Memory - Non-temporal and Multi-register Store

// SVE2 64-bit scatter non-temporal store (vector plus scalar)
// XXX:
// SVE contiguous non-temporal store (scalar plus scalar)
// XXX:
// SVE2 32-bit scatter non-temporal store (vector plus scalar)
// XXX:

// SVE store multiple structures (scalar plus scalar)
//
// Each stN{b,h,w,d} helper first checks (via AreVectorsSequential) that the N vector registers are
// sequential, then forwards only the FIRST register to SVEContiguousLoadStoreMultipleScalar together with
// the element size and a 2-bit code: 0b01 for two registers, 0b10 for three, 0b11 for four.

// st2b: two-register store, 8-bit elements.
void st2b(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i8Bit, 0b01, zt1, pg, rn, rm);
}

// st3b: three-register store, 8-bit elements.
void st3b(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i8Bit, 0b10, zt1, pg, rn, rm);
}

// st4b: four-register store, 8-bit elements.
void st4b(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i8Bit, 0b11, zt1, pg, rn, rm);
}

// st2h: two-register store, 16-bit elements.
void st2h(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i16Bit, 0b01, zt1, pg, rn, rm);
}

// st3h: three-register store, 16-bit elements.
void st3h(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i16Bit, 0b10, zt1, pg, rn, rm);
}

// st4h: four-register store, 16-bit elements.
void st4h(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, Register rm) {
LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i16Bit, 0b11, zt1, pg, rn, rm);
}

// The helpers below follow one pattern: check that the vector registers are sequential, then forward the
// first register, the element size and a 2-bit register-count code (0b01 = two, 0b10 = three, 0b11 = four)
// to SVEContiguousLoadStoreMultipleScalar.

// st2w: two-register store, 32-bit elements.
void st2w(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i32Bit, 0b01, zt1, pg, rn, rm);
}

// st3w: three-register store, 32-bit elements.
void st3w(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i32Bit, 0b10, zt1, pg, rn, rm);
}

// st4w: four-register store, 32-bit elements.
void st4w(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i32Bit, 0b11, zt1, pg, rn, rm);
}

// st2d: two-register store, 64-bit elements.
void st2d(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i64Bit, 0b01, zt1, pg, rn, rm);
}

// st3d: three-register store, 64-bit elements.
void st3d(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i64Bit, 0b10, zt1, pg, rn, rm);
}

// st4d: four-register store, 64-bit elements.
void st4d(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, Register rm) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous");
  SVEContiguousLoadStoreMultipleScalar(true, SubRegSize::i64Bit, 0b11, zt1, pg, rn, rm);
}

// SVE Memory - Contiguous Store with Immediate Offset

// SVE contiguous non-temporal store (scalar plus immediate)
// stnt1{b,h,w,d} forward to SVEContiguousNontemporalStore with a 2-bit code that increases with the
// element width named by the mnemonic (b = 0b00 ... d = 0b11). Imm defaults to 0.
void stnt1b(ZRegister zt,
PRegister pg, Register rn, int32_t Imm = 0) {
  SVEContiguousNontemporalStore(0b00, zt, pg, rn, Imm);
}

// stnt1h: non-temporal store, code 0b01.
void stnt1h(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  SVEContiguousNontemporalStore(0b01, zt, pg, rn, Imm);
}

// stnt1w: non-temporal store, code 0b10.
void stnt1w(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  SVEContiguousNontemporalStore(0b10, zt, pg, rn, Imm);
}

// stnt1d: non-temporal store, code 0b11.
void stnt1d(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) {
  SVEContiguousNontemporalStore(0b11, zt, pg, rn, Imm);
}

// SVE store multiple structures (scalar plus immediate)
//
// Each stN{b,h,w,d} helper checks that its N vector registers are sequential, then forwards to
// SVEContiguousMultipleStructures(N, true, <code>, Imm, zt1, pg, rn). The code is 0b00 for the b
// variants and 0b01 for the h variants in this span. Imm defaults to 0.
// NOTE(review): any range/alignment requirement on Imm is enforced (if at all) inside
// SVEContiguousMultipleStructures, which is not visible here.

// st2b: two registers, code 0b00.
void st2b(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousMultipleStructures(2, true, 0b00, Imm, zt1, pg, rn);
}

// st3b: three registers, code 0b00.
void st3b(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousMultipleStructures(3, true, 0b00, Imm, zt1, pg, rn);
}

// st4b: four registers, code 0b00.
void st4b(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous");
  SVEContiguousMultipleStructures(4, true, 0b00, Imm, zt1, pg, rn);
}

// st2h: two registers, code 0b01.
void st2h(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous");
  SVEContiguousMultipleStructures(2, true, 0b01, Imm, zt1, pg, rn);
}

// st3h: three registers, code 0b01.
void st3h(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous");
  SVEContiguousMultipleStructures(3, true, 0b01, Imm, zt1, pg, rn);
}

// st4h: four registers, code 0b01.
void st4h(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, int32_t Imm = 0) {
  LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3,
zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, true, 0b01, Imm, zt1, pg, rn); } void st2w(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, true, 0b10, Imm, zt1, pg, rn); } void st3w(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, true, 0b10, Imm, zt1, pg, rn); } void st4w(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, true, 0b10, Imm, zt1, pg, rn); } void st2d(ZRegister zt1, ZRegister zt2, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); SVEContiguousMultipleStructures(2, true, 0b11, Imm, zt1, pg, rn); } void st3d(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); SVEContiguousMultipleStructures(3, true, 0b11, Imm, zt1, pg, rn); } void st4d(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegister pg, Register rn, int32_t Imm = 0) { LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); SVEContiguousMultipleStructures(4, true, 0b11, Imm, zt1, pg, rn); } // SVE contiguous store (scalar plus immediate) template void st1b(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousLoadImm(true, 0b0000 | FEXCore::ToUnderlying(size), Imm, pg, rn, zt); } template void st1h(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { static_assert(size != 
SubRegSize::i8Bit, "Invalid size"); SVEContiguousLoadImm(true, 0b0100 | FEXCore::ToUnderlying(size), Imm, pg, rn, zt); } template void st1w(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { static_assert(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Invalid size"); constexpr uint32_t ConvertedSize = size == SubRegSize::i32Bit ? 0 : size == SubRegSize::i64Bit ? 1 : -1; SVEContiguousLoadImm(true, 0b1010 | ConvertedSize, Imm, pg, rn, zt); } void st1d(ZRegister zt, PRegister pg, Register rn, int32_t Imm = 0) { SVEContiguousLoadImm(true, 0b1111, Imm, pg, rn, zt); } private: // SVE encodings void SVEDupIndexed(SubRegSize size, ZRegister zn, ZRegister zd, uint32_t Index) { const auto size_bytes = 1U << FEXCore::ToUnderlying(size); const auto log2_size_bytes = FEXCore::ilog2(size_bytes); // We can index up to 512-bit registers with dup const auto max_index = (64U >> log2_size_bytes) - 1; LOGMAN_THROW_A_FMT(Index <= max_index, "dup index ({}) too large. Must be within [0, {}].", Index, max_index); // imm2:tsz make up a 7 bit wide field, with each increasing element size // restricting the range of those 7 bits (e.g. B: tsz=xxxx1, H: tsz=xxx10, // S: tsz=xx100. etc). So we can just use the log2 of the element size // to construct the overall immediate and form both imm2 and tsz. 
const auto imm7 = (Index << (log2_size_bytes + 1)) | (1U << log2_size_bytes); const auto imm2 = imm7 >> 5; const auto tsz = imm7 & 0b11111; uint32_t Instr = 0b0000'0101'0010'0000'0010'0000'0000'0000; Instr |= imm2 << 22; Instr |= tsz << 16; Instr |= Encode_rn(zn); Instr |= Encode_rd(zd); dc32(Instr); } void SVEAddSubImmediateUnpred(uint32_t opc, SubRegSize size, ZRegister zd, ZRegister zn, uint32_t imm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zn, "zd needs to equal zn"); const bool is_uint8_imm = (imm >> 8) == 0; if (size == SubRegSize::i8Bit) { LOGMAN_THROW_A_FMT(is_uint8_imm, "Can't perform LSL #8 shift on 8-bit elements."); } uint32_t shift = 0; if (!is_uint8_imm) { const bool is_uint16_imm = (imm >> 16) == 0; LOGMAN_THROW_A_FMT(is_uint16_imm, "Immediate ({}) must be a 16-bit value within [256, 65280]", imm); LOGMAN_THROW_A_FMT((imm % 256) == 0, "Immediate ({}) must be a multiple of 256", imm); imm /= 256; shift = 1; } uint32_t Instr = 0b0010'0101'0010'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= shift << 13; Instr |= imm << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEMinMaxImmediateUnpred(uint32_t opc, SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zn, "zd needs to equal zn"); const bool is_signed = (opc & 1) == 0; if (is_signed) { LOGMAN_THROW_A_FMT(imm >= -128 && imm <= 127, "Invalid immediate ({}). Must be within [-127, 128]", imm); } else { LOGMAN_THROW_A_FMT(imm >= 0 && imm <= 255, "Invalid immediate ({}). 
Must be within [0, 255]", imm); } const auto imm8 = static_cast(imm) & 0xFF; uint32_t Instr = 0b0010'0101'0010'1000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= imm8 << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEMultiplyImmediateUnpred(uint32_t opc, SubRegSize size, ZRegister zd, ZRegister zn, int32_t imm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zn, "zd needs to equal zn"); LOGMAN_THROW_A_FMT(imm >= -128 && imm <= 127, "Invalid immediate ({}). Must be within [-127, 128]", imm); const auto imm8 = static_cast(imm) & 0xFF; uint32_t Instr = 0b0010'0101'0011'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= imm8 << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBroadcastImm(uint32_t opc, int32_t imm, SubRegSize size, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); const auto [new_imm, is_shift] = HandleSVESImm8Shift(size, imm); uint32_t Instr = 0b0010'0101'0011'1000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 17; Instr |= is_shift << 13; Instr |= (static_cast(new_imm) & 0xFF) << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBroadcastFloatImmPredicated(SubRegSize size, ZRegister zd, PRegister pg, float value) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Unsupported fcpy/fmov size"); uint32_t imm {}; if (size == SubRegSize::i16Bit) { LOGMAN_MSG_A_FMT("Unsupported"); FEX_UNREACHABLE; } else if (size == SubRegSize::i32Bit) { imm = FP32ToImm8(value); } else if (size == SubRegSize::i64Bit) { imm = FP64ToImm8(value); } uint32_t Instr = 0b0000'0101'0001'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= pg.Idx() << 16; Instr |= imm << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBroadcastFloatImmUnpredicated(uint32_t opc, uint32_t o2, uint32_t imm, SubRegSize size, ZRegister zd) { uint32_t Instr = 
0b0010'0101'0011'1001'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 17; Instr |= o2 << 13; Instr |= imm << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBroadcastIntegerImmPredicated(uint32_t m, SubRegSize size, ZRegister zd, PRegister pg, int32_t imm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); const auto [new_imm, is_shift] = HandleSVESImm8Shift(size, imm); uint32_t Instr = 0b0000'0101'0001'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= pg.Idx() << 16; Instr |= m << 14; Instr |= is_shift << 13; Instr |= (static_cast(new_imm) & 0xFF) << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEAddressGeneration(SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm, SVEModType mod, uint32_t scale) { LOGMAN_THROW_A_FMT(scale <= 3, "Scale ({}) must be within [0, 3]", scale); uint32_t Instr = 0b0000'0100'0010'0000'1010'0000'0000'0000; switch (mod) { case SVEModType::MOD_UXTW: case SVEModType::MOD_SXTW: { LOGMAN_THROW_A_FMT(size == SubRegSize::i64Bit, "Unpacked ADR must be using 64-bit elements"); const auto is_unsigned = mod == SVEModType::MOD_UXTW; if (is_unsigned) { Instr |= 1U << 22; } break; } case SVEModType::MOD_NONE: case SVEModType::MOD_LSL: { if (mod == SVEModType::MOD_NONE) { LOGMAN_THROW_A_FMT(scale == 0, "Cannot scale packed ADR without a modifier"); } LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Packed ADR must be using 32-bit or 64-bit elements"); Instr |= FEXCore::ToUnderlying(size) << 22; break; } } Instr |= zm.Idx() << 16; Instr |= scale << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVESel(SubRegSize size, ZRegister zm, PRegister pv, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); uint32_t Instr = 0b0000'0101'0010'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= pv.Idx() << 10; 
Instr |= Encode_rn(zn);
  Instr |= Encode_rd(zd);
  dc32(Instr);
}

// Encodes a predicated bitwise shift by vector (destructive: zd must equal zn).
// R, L and U are single-bit selectors placed at bits 18, 17 and 16; pg must be p0-p7.
void SVEBitwiseShiftbyVector(uint32_t R, uint32_t L, uint32_t U, SubRegSize size, PRegister pg, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size");
  LOGMAN_THROW_A_FMT(zd == zn, "Dest needs to equal zn");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0000'0100'0001'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= R << 18;
  Instr |= L << 17;
  Instr |= U << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE integer add/subtract vectors (unpredicated)
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVEIntegerAddSubUnpredicated(uint32_t opc, SubRegSize size, ZRegister zm, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size");

  uint32_t Instr = 0b0000'0100'0010'0000'0000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE table lookup (three sources)
// Layout: size at bit 22, zm at bit 16, op at bit 10, zn at bit 5, zd at bit 0.
void SVETableLookup(uint32_t op, SubRegSize size, ZRegister zm, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size");

  uint32_t Instr = 0b0000'0101'0010'0000'0010'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= op << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE permute vector elements
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVEPermute(uint32_t opc, SubRegSize size, ZRegister zm, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size");

  uint32_t Instr = 0b0000'0101'0010'0000'0110'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE predicate logical operations
// The operation is selected by the four single-bit fields op (bit 23), S (bit 22), o2 (bit 9) and
// o3 (bit 4); the predicate operands are pm (bit 16), pg (bit 10), pn (bit 5) and pd (bit 0).
void SVEPredicateLogical(uint32_t op, uint32_t
S, uint32_t o2, uint32_t o3, PRegister pm, PRegister pg, PRegister pn, PRegister pd) {
  uint32_t Instr = 0b0010'0101'0000'0000'0100'0000'0000'0000;
  Instr |= op << 23;
  Instr |= S << 22;
  Instr |= pm.Idx() << 16;
  Instr |= pg.Idx() << 10;
  Instr |= o2 << 9;
  Instr |= pn.Idx() << 5;
  Instr |= o3 << 4;
  Instr |= pd.Idx();
  dc32(Instr);
}

// SVE floating-point convert precision odd elements
// Layout: opc at bit 22, opc2 at bit 16, pg (p0-p7 only) at bit 10, zn at bit 5, zd at bit 0.
void SVEFloatConvertOdd(uint32_t opc, uint32_t opc2, PRegister pg, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0110'0100'0000'1000'1010'0000'0000'0000;
  Instr |= opc << 22;
  Instr |= opc2 << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE2 floating-point pairwise operations
// Destructive form: zd must equal zn, so only zm (bit 5) and zd (bit 0) are encoded.
// Requires a standard float element size and pg within p0-p7.
void SVEFloatPairwiseArithmetic(uint32_t opc, SubRegSize size, PRegister pg, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(zd == zn, "zd needs to equal zn");
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid float size");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0110'0100'0001'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE floating-point arithmetic (unpredicated)
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVEFloatArithmeticUnpredicated(uint32_t opc, SubRegSize size, ZRegister zm, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid float size");

  uint32_t Instr = 0b0110'0101'0000'0000'0000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE bitwise logical operations (predicated)
// Destructive form: zd must equal zdn, so only zm (bit 5) and zd (bit 0) are encoded.
void SVEBitwiseLogicalPredicated(uint32_t opc, SubRegSize size, PRegister pg, ZRegister zdn, ZRegister zm, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != ARMEmitter::SubRegSize::i128Bit, "Can't use 128-bit size");
  LOGMAN_THROW_A_FMT(zd == zdn, "zd needs to equal zdn");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0000'0100'0001'1000'0000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE constructive prefix (predicated)
// Layout: size at bit 22, opc at bit 17, M at bit 16, pg (p0-p7 only) at bit 10, then zn and zd
// through Encode_rn / Encode_rd.
void SVEConstructivePrefixPredicated(uint32_t opc, uint32_t M, SubRegSize size, PRegister pg, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0000'0100'0001'0000'0010'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 17;
  Instr |= M << 16;
  Instr |= pg.Idx() << 10;
  Instr |= Encode_rn(zn);
  Instr |= Encode_rd(zd);
  dc32(Instr);
}

// SVE bitwise unary operations (predicated)
// Layout: size at bit 22, op0 at bit 19, opc at bit 16, pg (p0-p7 only) at bit 10, zn at bit 5,
// zd at bit 0.
void SVEIntegerUnaryPredicated(uint32_t op0, uint32_t opc, SubRegSize size, PRegister pg, ZRegister zn, ZRegister zd) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0000'0100'0000'0000'1010'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= op0 << 19;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE bitwise logical operations (unpredicated)
// No element size is taken; opc occupies the field at bit 22. zm at bit 16, zn at bit 5, zd at bit 0.
void SVEBitwiseLogicalUnpredicated(uint32_t opc, ZRegister zm, ZRegister zn, ZRegister zd) {
  uint32_t Instr = 0b0000'0100'0010'0000'0011'0000'0000'0000;
  Instr |= opc << 22;
  Instr |= zm.Idx() << 16;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE Permute Vector - Unpredicated
// Layout: size at bit 22, opc at bit 16, zm at bit 5, zdn at bit 0.
void SVEPermuteUnpredicated(SubRegSize size, uint32_t opc, ZRegister zdn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");

  uint32_t
Instr = 0b0000'0101'0010'0000'0011'1000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= zm.Idx() << 5;
  Instr |= zdn.Idx();
  dc32(Instr);
}

// SVE Permute Predicate
// Layout: size at bit 22, op1 at bit 16, op2 at bit 9, op3 at bit 4, pn at bit 5, pd at bit 0.
void SVEPermutePredicate(SubRegSize size, uint32_t op1, uint32_t op2, uint32_t op3, PRegister pd, PRegister pn) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");

  uint32_t Instr = 0b0000'0101'0010'0000'0100'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= op1 << 16;
  Instr |= op2 << 9;
  Instr |= op3 << 4;
  Instr |= pn.Idx() << 5;
  Instr |= pd.Idx();
  dc32(Instr);
}

// SVE Integer Misc - Unpredicated
// No element size is taken: opc2 occupies the field at bit 22, opc at bit 16, op0 at bit 10,
// zn at bit 5, zd at bit 0.
void SVEIntegerMiscUnpredicated(uint32_t op0, uint32_t opc, uint32_t opc2, ZRegister zd, ZRegister zn) {
  uint32_t Instr = 0b0000'0100'0010'0000'1011'0000'0000'0000;
  Instr |= opc2 << 22;
  Instr |= opc << 16;
  Instr |= op0 << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// SVE floating-point arithmetic (predicated)
// Destructive form: zd must equal zn, so only zm (bit 5) and zd (bit 0) are encoded.
// Requires a standard float element size and pg within p0-p7.
void SVEFloatArithmeticPredicated(uint32_t opc, SubRegSize size, PRegister pg, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(zd == zn, "zn needs to equal zd");
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Invalid float size");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0110'0101'0000'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes match/nmatch (per the assertion messages): 8- or 16-bit elements only, zeroing
// governing predicate within p0-p7. opc is a single selector placed at bit 4; the result goes to the
// predicate register pd.
void SVECharacterMatch(uint32_t opc, SubRegSize size, PRegister pd, PRegisterZero pg, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i8Bit || size == SubRegSize::i16Bit, "match/nmatch can only use 8-bit or 16-bit element sizes");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7.Zeroing(), "match/nmatch can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0100'0101'0010'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 4;
Instr |= zm.Idx() << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= pd.Idx();
  dc32(Instr);
}

// Encodes a floating-point reduction of zn into the register vd.
// Requires a standard float element size and pg within p0-p7.
// Layout: size at bit 22, opc at bit 16, pg at bit 10, zn at bit 5, vd at bit 0.
void SVEFPRecursiveReduction(uint32_t opc, SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
  LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "FP reduction operation can only use 16/32/64-bit element sizes");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "FP reduction operation can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0110'0101'0000'0000'0010'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= vd.Idx();
  dc32(Instr);
}

// Encodes a predicated integer add/sub of vectors (destructive: zd must equal zn).
// NOTE(review): unlike most neighbouring encoders, the element size is not checked against
// SubRegSize::i128Bit here — presumably callers guarantee it; confirm.
void SVEAddSubVectorsPredicated(uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(zd == zn, "zd and zn must be the same register");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Add/Sub operation can only use p0-p7 as a governing predicate");

  uint32_t Instr = 0b0000'0100'0000'0000'0000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes a predicated integer multiply/divide of vectors (destructive: zd must equal zn).
// b18 is placed at bit 18; a non-zero b18 marks the divide forms, which only accept 32- or 64-bit
// elements.
void SVEIntegerMulDivVectorsPredicated(uint32_t b18, uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(zd == zn, "zd and zn must be the same register");
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Mul/Div operation can only use p0-p7 as a governing predicate");

  // Division instruction
  if (b18 != 0) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Predicated divide only handles 32-bit or 64-bit "
                                                                                 "elements");
  }

  uint32_t Instr = 0b0000'0100'0001'0000'0000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= b18 << 18;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an integer reduction of zn into the register vd.
// Unlike the other encoders, the base instruction word is not a local constant: it is supplied by
// the caller through `op`, and size / opc / pg / zn / vd are OR'd into it.
void SVEIntegerReductionOperation(uint32_t op, uint32_t opc, SubRegSize size, VRegister vd, PRegister pg, ZRegister zn) {
LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size for reduction operation"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Integer reduction operation can only use p0-p7 as a governing predicate"); uint32_t Instr = op; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= vd.Idx(); dc32(Instr); } void SVEIntegerMultiplyAddSubPredicated(uint32_t op0, uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0000'0100'0000'0000'0100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= op0 << 15; Instr |= opc << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEStackFrameOperation(uint32_t opc, XRegister rd, XRegister rn, int32_t imm) { LOGMAN_THROW_A_FMT(imm >= -32 && imm <= 31, "Stack frame operation immediate must be within -32 to 31"); uint32_t Instr = 0b0000'0100'0010'0000'0101'0000'0000'0000; Instr |= opc << 22; Instr |= rn.Idx() << 16; Instr |= (static_cast(imm) & 0b111111) << 5; Instr |= rd.Idx(); dc32(Instr); } void SVEBitwiseShiftByWideElementPredicated(SubRegSize size, uint32_t opc, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit && size != SubRegSize::i128Bit, "Can't use 64-bit or 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zn, "zd and zn must be the same register"); LOGMAN_THROW_A_FMT(pg <= PReg::p7.Merging(), "Wide shift can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0000'0100'0001'1000'1000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= pg.Idx() << 10; Instr |= zm.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void 
SVEBitwiseShiftByWideElementsUnpredicated(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit && size != SubRegSize::i128Bit, "Can't use 64-bit or 128-bit element size");

  // Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
  uint32_t Instr = 0b0000'0100'0010'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 10;
  Instr |= zm.Idx() << 16;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes a predicated floating-point arithmetic operation with an immediate operand.
// i1 is placed at bit 5 unmodified (no range check is performed here); 8-bit and 128-bit element
// sizes are rejected and pg must be p0-p7.
void SVEFPArithWithImmediate(uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, uint32_t i1) {
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");

  uint32_t Instr = 0b0110'0101'0001'1000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= opc << 16;
  Instr |= pg.Idx() << 10;
  Instr |= i1 << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes a predicated floating-point precision conversion from element size `from` to `to`.
// The two sizes must differ, neither may be 8-bit or 128-bit, and pg must be p0-p7.
void SVEFPConvertPrecision(SubRegSize to, SubRegSize from, ZRegister zd, PRegister pg, ZRegister zn) {
  LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");
  LOGMAN_THROW_A_FMT(to != from, "to and from sizes cannot be the same.");
  LOGMAN_THROW_A_FMT(to != SubRegSize::i8Bit && to != SubRegSize::i128Bit && from != SubRegSize::i8Bit && from != SubRegSize::i128Bit,
                     "Can't use 8-bit or 128-bit element size");

  // Encodings for the to and from sizes can get a little funky
  // depending on what is being converted to/from.
const uint32_t op = [&] {
    // Maps each supported (from, to) pair to the pre-shifted opcode bits that are OR'd into the base
    // word below; UINT32_MAX marks an unsupported pair.
    switch (from) {
    case SubRegSize::i16Bit: {
      switch (to) {
      case SubRegSize::i32Bit: return 0x00810000U;
      case SubRegSize::i64Bit: return 0x00C10000U;
      default: return UINT32_MAX;
      }
    }
    case SubRegSize::i32Bit: {
      switch (to) {
      case SubRegSize::i16Bit: return 0x00800000U;
      case SubRegSize::i64Bit: return 0x00C30000U;
      default: return UINT32_MAX;
      }
    }
    case SubRegSize::i64Bit: {
      switch (to) {
      case SubRegSize::i16Bit: return 0x00C00000U;
      case SubRegSize::i32Bit: return 0x00C20000U;
      default: return UINT32_MAX;
      }
    }
    default: return UINT32_MAX;
    }
  }();

  LOGMAN_THROW_A_FMT(op != UINT32_MAX, "Invalid conversion op value: {}", op);

  uint32_t Instr = 0b0110'0101'0000'1000'1010'0000'0000'0000;
  Instr |= op;
  Instr |= pg.Idx() << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 integer add/sub narrow (high part).
// 64-bit and 128-bit sizes are rejected, and the size field at bit 22 is written as the underlying
// value of `size` plus one.
void SVE2IntegerAddSubNarrowHighPart(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit && size != SubRegSize::i128Bit, "Can't use 64-bit or 128-bit element size");

  uint32_t Instr = 0b0100'0101'0010'0000'0110'0000'0000'0000;
  Instr |= (FEXCore::ToUnderlying(size) + 1) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 crypto unary operation. zdn and zn must be the same register, so only zdn is
// encoded (bit 0); op is placed at bit 10.
void SVE2CryptoUnaryOperation(uint32_t op, ZRegister zdn, ZRegister zn) {
  LOGMAN_THROW_A_FMT(zdn == zn, "zdn and zn must be the same register");

  uint32_t Instr = 0b0100'0101'0010'0000'1110'0000'0000'0000;
  Instr |= op << 10;
  Instr |= zdn.Idx();
  dc32(Instr);
}

// Encodes an SVE2 crypto destructive binary operation. zdn and zn must be the same register.
// Layout: op at bit 16, o2 at bit 10, zm at bit 5, zdn at bit 0.
void SVE2CryptoDestructiveBinaryOperation(uint32_t op, uint32_t o2, ZRegister zdn, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(zdn == zn, "zdn and zn must be the same register");

  uint32_t Instr = 0b0100'0101'0010'0010'1110'0000'0000'0000;
  Instr |= op << 16;
  Instr |= o2 << 10;
  Instr |= zm.Idx() << 5;
  Instr |= zdn.Idx();
  dc32(Instr);
}

// Encodes an SVE2 crypto constructive binary operation.
// Layout: zm at bit 16, op at bit 10, zn at bit 5, zd at bit 0.
void SVE2CryptoConstructiveBinaryOperation(uint32_t op, ZRegister zd, ZRegister zn, ZRegister zm) {
  uint32_t Instr =
0b0100'0101'0010'0000'1111'0000'0000'0000;
  Instr |= zm.Idx() << 16;
  Instr |= op << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 bitwise permute operation.
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVE2BitwisePermute(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size");

  uint32_t Instr = 0b0100'0101'0000'0000'1011'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 bitwise exclusive-or interleaved operation.
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVE2BitwiseXorInterleaved(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size");

  uint32_t Instr = 0b0100'0101'0000'0000'1001'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an integer matrix multiply-accumulate into zda.
// No element size is taken: opc occupies the field at bit 22; zm at bit 16, zn at bit 5, zda at bit 0.
void SVEIntegerMatrixMulAccumulate(uint32_t opc, ZRegister zda, ZRegister zn, ZRegister zm) {
  uint32_t Instr = 0b0100'0101'0000'0000'1001'1000'0000'0000;
  Instr |= opc << 22;
  Instr |= zm.Idx() << 16;
  Instr |= zn.Idx() << 5;
  Instr |= zda.Idx();
  dc32(Instr);
}

// Encodes an SVE2 integer add/sub interleaved long operation (8-bit and 128-bit sizes rejected).
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zd at bit 0.
void SVE2IntegerAddSubInterleavedLong(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");

  uint32_t Instr = 0b0100'0101'0000'0000'1000'0000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 integer absolute-difference-and-accumulate into zda.
// Layout: size at bit 22, zm at bit 16, opc at bit 10, zn at bit 5, zda at bit 0.
void SVE2IntegerAbsDiffAndAccumulate(SubRegSize size, uint32_t opc, ZRegister zda, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size");

  uint32_t Instr = 0b0100'0101'0000'0000'1111'1000'0000'0000;
  Instr |= FEXCore::ToUnderlying(size) << 22;
  Instr |= zm.Idx() << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
Instr |= zda.Idx();
  dc32(Instr);
}

// Encodes an SVE2 integer add/sub long with carry into zda.
// Only 32- and 64-bit elements are accepted; they are encoded as a single bit at bit 22
// (0 = 32-bit, 1 = 64-bit). sizep1 is placed at bit 23 and T at bit 10.
void SVE2IntegerAddSubLongWithCarry(SubRegSize size, uint32_t sizep1, uint32_t T, ZRegister zda, ZRegister zn, ZRegister zm) {
  LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "Element size must be 32-bit or 64-bit");

  const uint32_t NewSize = size == SubRegSize::i32Bit ? 0 : 1;

  uint32_t Instr = 0b0100'0101'0000'0000'1101'0000'0000'0000;
  Instr |= sizep1 << 23;
  Instr |= NewSize << 22;
  Instr |= zm.Idx() << 16;
  Instr |= T << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zda.Idx();
  dc32(Instr);
}

// Encodes an SVE2 bitwise shift-right-and-accumulate by immediate into zda.
// EncodeSVEShiftImmediate(size, shift) yields the tszh / tszl:imm3 fields, placed at bits 22 and 16.
void SVE2BitwiseShiftRightAndAccumulate(SubRegSize size, uint32_t opc, ZRegister zda, ZRegister zn, uint32_t shift) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Element size cannot be 128-bit");

  const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, shift);

  uint32_t Instr = 0b0100'0101'0000'0000'1110'0000'0000'0000;
  Instr |= tszh << 22;
  Instr |= tszl_imm3 << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zda.Idx();
  dc32(Instr);
}

// Encodes an SVE2 bitwise shift-and-insert by immediate.
// A non-zero opc is treated as the left-shift form, which is passed to EncodeSVEShiftImmediate as
// its third argument.
void SVE2BitwiseShiftAndInsert(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, uint32_t shift) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Element size cannot be 128-bit");

  const bool IsLeftShift = opc != 0;
  const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, shift, IsLeftShift);

  uint32_t Instr = 0b0100'0101'0000'0000'1111'0000'0000'0000;
  Instr |= tszh << 22;
  Instr |= tszl_imm3 << 16;
  Instr |= opc << 10;
  Instr |= zn.Idx() << 5;
  Instr |= zd.Idx();
  dc32(Instr);
}

// Encodes an SVE2 bitwise shift-left-long by immediate (8-bit and 128-bit destination sizes rejected).
void SVE2BitwiseShiftLeftLong(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, uint32_t shift) {
  LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");

  // The size passed in is the size to expand to (e.g. 16-bit means a long shift
  // expanding from 8-bit), so we just need to subtract 1 from the size so that our
  // encoding helper will perform the proper encoding.
const auto size_minus_1 = SubRegSize {FEXCore::ToUnderlying(size) - 1}; const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size_minus_1, shift, true); uint32_t Instr = 0b0100'0101'0000'0000'1010'0000'0000'0000; Instr |= tszh << 22; Instr |= tszl_imm3 << 16; Instr |= opc << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVE2ComplexIntAdd(SubRegSize size, uint32_t opc, Rotation rot, ZRegister zd, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Complex add cannot use 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zn, "zd and zn must be the same register"); LOGMAN_THROW_A_FMT(rot == Rotation::ROTATE_90 || rot == Rotation::ROTATE_270, "Rotation must be 90 or 270 degrees"); const uint32_t SanitizedRot = rot == Rotation::ROTATE_90 ? 0 : 1; uint32_t Instr = 0b0100'0101'0000'0000'1101'1000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= SanitizedRot << 10; Instr |= zm.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVE2AbsDiffAccLong(SubRegSize size, uint32_t opc, ZRegister zda, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Cannot use 8-bit or 128-bit element size"); uint32_t Instr = 0b0100'0101'0000'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= opc << 10; Instr |= zn.Idx() << 5; Instr |= zda.Idx(); dc32(Instr); } void SVEPermuteVectorUnpredicated(SubRegSize size, uint32_t opc, ZRegister zdn, VRegister vm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size"); uint32_t Instr = 0b0000'0101'0010'0000'0011'1000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= vm.Idx() << 5; Instr |= zdn.Idx(); dc32(Instr); } // SVE floating-point round to integral value void SVEFloatRoundIntegral(uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn) { // opc = round mode // 0b000 - N - Neaest ties 
to even // 0b001 - P - Towards +inf // 0b010 - M - Towards -inf // 0b011 - Z - Towards zero // 0b100 - A - Nearest away from zero // 0b101 - Unallocated // 0b110 - X - Current signalling inexact // 0b111 - I - Current LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Unsupported size in {}", __func__); uint32_t Instr = 0b0110'0101'0000'0000'1010'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } // SVE floating-point convert to integer void SVEFloatConvertToInt(SubRegSize dstsize, SubRegSize srcsize, uint32_t b19, uint32_t opc, uint32_t opc2, uint32_t U, PRegister pg, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(srcsize == SubRegSize::i16Bit || srcsize == SubRegSize::i32Bit || srcsize == SubRegSize::i64Bit, "Unsupported src size in {}", __func__); LOGMAN_THROW_A_FMT(dstsize == SubRegSize::i16Bit || dstsize == SubRegSize::i32Bit || dstsize == SubRegSize::i64Bit, "Unsupported dst size in {}", __func__); uint32_t Instr = 0b0110'0101'0001'0000'1010'0000'0000'0000; Instr |= opc << 22; Instr |= b19 << 19; Instr |= opc2 << 17; Instr |= U << 16; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } // SVE integer convert to floating-point // We can implement this in terms of the floating-point to int version above, // since the only difference in encoding is setting bit 19 to 0. void SVEIntegerConvertToFloat(SubRegSize dstsize, SubRegSize srcsize, uint32_t opc, uint32_t opc2, uint32_t U, PRegister pg, ZRegister zn, ZRegister zd) { SVEFloatConvertToInt(dstsize, srcsize, 0, opc, opc2, U, pg, zn, zd); } // SVE Memory - 32-bit Gather and Unsized Contiguous // Note: This also handles 64-bit variants to keep overall handling code // compact and in the same place. 
void SVEGatherLoadScalarPlusVector(SubRegSize esize, SubRegSize msize, ZRegister zt, PRegisterZero pg, SVEMemOperand mem_op, bool is_unsigned, bool is_fault_first) { LOGMAN_THROW_A_FMT(esize == SubRegSize::i32Bit || esize == SubRegSize::i64Bit, "Gather load element size must be 32-bit or 64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); const auto& op_data = mem_op.MetaType.ScalarVectorType; const bool is_scaled = op_data.scale != 0; const auto msize_value = FEXCore::ToUnderlying(msize); LOGMAN_THROW_A_FMT(op_data.scale == 0 || op_data.scale == msize_value, "scale may only be 0 or {}", msize_value); uint32_t mod_value = FEXCore::ToUnderlying(op_data.mod); uint32_t Instr = 0b1000'0100'0000'0000'0000'0000'0000'0000; if (esize == SubRegSize::i64Bit) { Instr |= 1U << 30; const auto mod = op_data.mod; const bool is_lsl = mod == SVEModType::MOD_LSL; const bool is_none = mod == SVEModType::MOD_NONE; // LSL and no modifier encodings should be setting bit 22 to 1. 
if (is_lsl || is_none) { if (is_lsl) { LOGMAN_THROW_A_FMT(op_data.scale == msize_value, "mod type of LSL must have a scale of {}", msize_value); } else { LOGMAN_THROW_A_FMT(op_data.scale == 0, "mod type of none must have a scale of 0"); } Instr |= 1U << 15; mod_value = 1; } } else { LOGMAN_THROW_A_FMT(op_data.mod == SVEModType::MOD_UXTW || op_data.mod == SVEModType::MOD_SXTW, "mod type for 32-bit lane size may " "only be UXTW or SXTW"); } Instr |= FEXCore::ToUnderlying(msize) << 23; Instr |= static_cast(mod_value) << 22; Instr |= static_cast(is_scaled) << 21; Instr |= op_data.zm.Idx() << 16; Instr |= static_cast(is_unsigned) << 14; Instr |= static_cast(is_fault_first) << 13; Instr |= pg.Idx() << 10; Instr |= mem_op.rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVEScatterStoreScalarPlusVector(SubRegSize esize, SubRegSize msize, ZRegister zt, PRegister pg, SVEMemOperand mem_op) { LOGMAN_THROW_A_FMT(esize == SubRegSize::i32Bit || esize == SubRegSize::i64Bit, "Gather load element size must be 32-bit or 64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); const auto& op_data = mem_op.MetaType.ScalarVectorType; const bool is_scaled = op_data.scale != 0; const auto msize_value = FEXCore::ToUnderlying(msize); uint32_t mod_value = FEXCore::ToUnderlying(op_data.mod); LOGMAN_THROW_A_FMT(op_data.scale == 0 || op_data.scale == msize_value, "scale may only be 0 or {}", msize_value); uint32_t Instr = 0b1110'0100'0000'0000'1000'0000'0000'0000; if (esize == SubRegSize::i64Bit) { const auto mod = op_data.mod; const bool is_lsl = mod == SVEModType::MOD_LSL; const bool is_none = mod == SVEModType::MOD_NONE; if (is_lsl || is_none) { if (is_lsl) { LOGMAN_THROW_A_FMT(op_data.scale == msize_value, "mod type of LSL must have a scale of {}", msize_value); } else { LOGMAN_THROW_A_FMT(op_data.scale == 0, "mod type of none must have a scale of 0"); } if (is_lsl || is_scaled) { LOGMAN_THROW_A_FMT(msize != SubRegSize::i8Bit, "Cannot use 8-bit store 
elements with unpacked 32-bit scaled offset and " "64-bit scaled offset variants. Instructions not allocated."); } // 64-bit scaled/unscaled scatters need to set bit 13 Instr |= 1U << 13; mod_value = 0; } } else { if (is_scaled) { LOGMAN_THROW_A_FMT(msize != SubRegSize::i8Bit && msize != SubRegSize::i64Bit, "Cannot use 8-bit or 64-bit store elements with " "32-bit scaled offset variant. " "Instructions not allocated"); } else { LOGMAN_THROW_A_FMT(msize != SubRegSize::i64Bit, "Cannot use 64-bit store elements with 32-bit unscaled offset variant. " "Instruction not allocated."); } LOGMAN_THROW_A_FMT(op_data.mod == SVEModType::MOD_UXTW || op_data.mod == SVEModType::MOD_SXTW, "mod type for 32-bit lane size may " "only be UXTW or SXTW"); // 32-bit scatters need to set bit 22. Instr |= 1U << 22; } Instr |= msize_value << 23; Instr |= static_cast(is_scaled) << 21; Instr |= op_data.zm.Idx() << 16; Instr |= static_cast(mod_value) << 14; Instr |= pg.Idx() << 10; Instr |= mem_op.rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVEGatherScatterVectorPlusImm(SubRegSize esize, SubRegSize msize, ZRegister zt, PRegister pg, SVEMemOperand mem_op, bool is_store, bool is_unsigned, bool is_fault_first) { LOGMAN_THROW_A_FMT(esize == SubRegSize::i32Bit || esize == SubRegSize::i64Bit, "Gather load/store element size must be 32-bit or " "64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); const auto msize_value = FEXCore::ToUnderlying(msize); const auto msize_bytes = 1U << msize_value; const auto imm_limit = (32U << msize_value) - msize_bytes; const auto imm = mem_op.MetaType.VectorImmType.Imm; const auto imm_to_encode = imm >> msize_value; LOGMAN_THROW_A_FMT(imm <= imm_limit, "Immediate must be within [0, {}]", imm_limit); LOGMAN_THROW_A_FMT(imm == 0 || (imm % msize_bytes) == 0, "Immediate must be cleanly divisible by {}", msize_bytes); uint32_t Instr = 0b1000'0100'0000'0000'1000'0000'0000'0000; if (is_store) { Instr |= 0x60402000U; if (esize 
== SubRegSize::i32Bit) { Instr |= 1U << 21; } } else { Instr |= 0x00200000U; if (esize == SubRegSize::i64Bit) { Instr |= 1U << 30; } } Instr |= msize_value << 23; Instr |= imm_to_encode << 16; Instr |= static_cast(is_unsigned) << 14; Instr |= static_cast(is_fault_first) << 13; Instr |= pg.Idx() << 10; Instr |= mem_op.rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVEGatherLoadVectorPlusImm(SubRegSize esize, SubRegSize msize, ZRegister zt, PRegisterZero pg, SVEMemOperand mem_op, bool is_unsigned, bool is_fault_first) { SVEGatherScatterVectorPlusImm(esize, msize, zt, pg, mem_op, false, is_unsigned, is_fault_first); } void SVEScatterStoreVectorPlusImm(SubRegSize esize, SubRegSize msize, ZRegister zt, PRegister pg, SVEMemOperand mem_op) { SVEGatherScatterVectorPlusImm(esize, msize, zt, pg, mem_op, true, false, true); } void SVEUnsizedLoadStoreContiguous(uint32_t op2, int32_t imm, ZRegister zt, Register rn, bool is_store) { LOGMAN_THROW_A_FMT(imm >= -256 && imm <= 255, "Immediate offset ({}) too large. Must be within [-256, 255].", imm); const auto imm9 = static_cast(imm) & 0b1'1111'1111; uint32_t Instr = 0b1000'0101'1000'0000'0000'0000'0000'0000; if (is_store) { Instr |= 0x60000000U; } Instr |= (imm9 >> 3) << 16; Instr |= op2 << 14; Instr |= (imm9 & 0b111) << 10; Instr |= rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } // SVE load/store multiple structures (scalar plus immediate) void SVEContiguousMultipleStructures(int32_t num_regs, bool is_store, uint32_t msz, int32_t imm, ZRegister zt, PRegister pg, Register rn) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT((imm % num_regs) == 0, "Offset must be a multiple of {}", num_regs); const auto min_offset = -8 * num_regs; const auto max_offset = 7 * num_regs; LOGMAN_THROW_A_FMT(imm >= min_offset && imm <= max_offset, "Invalid load/store offset ({}). 
Offset must be a multiple of {} and be within [{}, {}]", imm, num_regs, min_offset, max_offset); const auto imm4 = static_cast(imm / num_regs) & 0xF; const auto opc = static_cast(num_regs - 1); uint32_t Instr = 0b1010'0100'0000'0000'1110'0000'0000'0000; Instr |= msz << 23; Instr |= opc << 21; Instr |= imm4 << 16; Instr |= pg.Idx() << 10; Instr |= Encode_rn(rn); Instr |= zt.Idx(); if (is_store) { Instr |= 0x40100000U; } dc32(Instr); } // SVE contiguous non-temporal load (scalar plus immediate) void SVEContiguousNontemporalLoad(uint32_t msz, ZRegister zt, PRegister pg, Register rn, int32_t imm) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(imm >= -8 && imm <= 7, "Invalid loadstore offset ({}). Must be between [-8, 7]", imm); const auto imm4 = static_cast(imm) & 0xF; uint32_t Instr = 0b1010'0100'0000'0000'1110'0000'0000'0000; Instr |= msz << 23; Instr |= imm4 << 16; Instr |= pg.Idx() << 10; Instr |= Encode_rn(rn); Instr |= zt.Idx(); dc32(Instr); } // SVE contiguous non-temporal store (scalar plus immediate) void SVEContiguousNontemporalStore(uint32_t msz, ZRegister zt, PRegister pg, Register rn, int32_t imm) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(imm >= -8 && imm <= 7, "Invalid loadstore offset ({}). Must be between [-8, 7]", imm); const auto imm4 = static_cast(imm) & 0xF; uint32_t Instr = 0b1110'0100'0001'0000'1110'0000'0000'0000; Instr |= msz << 23; Instr |= imm4 << 16; Instr |= pg.Idx() << 10; Instr |= Encode_rn(rn); Instr |= zt.Idx(); dc32(Instr); } void SVEContiguousLoadImm(bool is_store, uint32_t dtype, int32_t imm, PRegister pg, Register rn, ZRegister zt) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(imm >= -8 && imm <= 7, "Invalid loadstore offset ({}). 
Must be between [-8, 7]", imm); const auto imm4 = static_cast(imm) & 0xF; uint32_t Instr = 0b1010'0100'0000'0000'1010'0000'0000'0000; Instr |= dtype << 21; Instr |= imm4 << 16; Instr |= pg.Idx() << 10; Instr |= Encode_rn(rn); Instr |= zt.Idx(); if (is_store) { Instr |= 0x40004000U; } dc32(Instr); } // zt.b, pg/z, xn, xm void SVEContiguousLoadStore(uint32_t b30, uint32_t b13, uint32_t dtype, Register rm, PRegister pg, Register rn, ZRegister zt) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b1010'0100'0000'0000'0100'0000'0000'0000; Instr |= b30 << 30; Instr |= dtype << 21; Instr |= Encode_rm(rm); Instr |= b13 << 13; Instr |= pg.Idx() << 10; Instr |= Encode_rn(rn); Instr |= zt.Idx(); dc32(Instr); } void SVEContiguousLoadStoreMultipleScalar(bool is_store, SubRegSize msz, uint32_t opc, ZRegister zt, PRegister pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(rm != Reg::rsp, "rm cannot be the stack pointer"); uint32_t Instr = 0b1010'0100'0000'0000'0000'0000'0000'0000; if (is_store) { Instr |= 0x40006000U; } else { Instr |= 0x0000C000U; } Instr |= FEXCore::ToUnderlying(msz) << 23; Instr |= opc << 21; Instr |= rm.Idx() << 16; Instr |= pg.Idx() << 10; Instr |= rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVELoadBroadcastQuadScalarPlusImm(uint32_t msz, uint32_t ssz, ZRegister zt, PRegister pg, Register rn, int imm) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); const auto esize = static_cast(16 << ssz); const auto max_imm = (esize << 3) - esize; const auto min_imm = -(max_imm + esize); LOGMAN_THROW_A_FMT((imm % esize) == 0, "imm ({}) must be a multiple of {}", imm, esize); LOGMAN_THROW_A_FMT(imm >= min_imm && imm <= max_imm, "imm ({}) must be within [{}, {}]", imm, min_imm, max_imm); const auto sanitized_imm = static_cast(imm / esize) & 0b1111; uint32_t Instr = 
0b1010'0100'0000'0000'0010'0000'0000'0000; Instr |= msz << 23; Instr |= ssz << 21; Instr |= sanitized_imm << 16; Instr |= pg.Idx() << 10; Instr |= rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVELoadBroadcastQuadScalarPlusScalar(uint32_t msz, uint32_t ssz, ZRegister zt, PRegister pg, Register rn, Register rm) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(rm != Reg::rsp, "rm may not be the stack pointer"); uint32_t Instr = 0b1010'0100'0000'0000'0000'0000'0000'0000; Instr |= msz << 23; Instr |= ssz << 21; Instr |= rm.Idx() << 16; Instr |= pg.Idx() << 10; Instr |= rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVELoadAndBroadcastElement(bool is_signed, SubRegSize esize, SubRegSize msize, ZRegister zt, PRegister pg, Register rn, uint32_t imm) { LOGMAN_THROW_A_FMT(esize != SubRegSize::i128Bit, "Cannot use 128-bit elements."); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); if (is_signed) { // The element size needs to be larger than memory size, otherwise you tell // me how we're gonna sign extend this bad boy in memory. LOGMAN_THROW_A_FMT(esize > msize, "Signed broadcast element size must be greater than memory size."); } const auto esize_value = FEXCore::ToUnderlying(esize); const auto msize_value = FEXCore::ToUnderlying(msize); const auto data_size_bytes = 1U << msize_value; const auto max_imm = (64U << msize_value) - data_size_bytes; LOGMAN_THROW_A_FMT((imm % data_size_bytes) == 0 && imm <= max_imm, "imm must be a multiple of {} and be within [0, {}]", data_size_bytes, max_imm); const auto sanitized_imm = imm / data_size_bytes; auto dtypeh = msize_value; auto dtypel = esize_value; if (is_signed) { // Signed forms of the broadcast instructions are encoded in such a way // that msize will always be greater than esize, which, conveniently, // works out by just XORing the would-be unsigned dtype values by 3. 
dtypeh ^= 0b11; dtypel ^= 0b11; } // Guards against bogus combinations of element size and memory size values // being passed in. Unsigned variants will always have dtypeh be less than // or equal to dtypel. The only time this isn't the case is with signed variants. LOGMAN_THROW_A_FMT(is_signed == (dtypeh > dtypel), "Invalid element size used with load broadcast instruction " "(esize: {}, msize: {})", esize_value, msize_value); uint32_t Instr = 0b1000'0100'0100'0000'1000'0000'0000'0000; Instr |= dtypeh << 23; Instr |= sanitized_imm << 16; Instr |= dtypel << 13; Instr |= pg.Idx() << 10; Instr |= rn.Idx() << 5; Instr |= zt.Idx(); dc32(Instr); } void SVEIndexGeneration(uint32_t op, SubRegSize size, ZRegister zd, int32_t imm5, int32_t imm5b) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "INDEX cannot use 128-bit element sizes"); uint32_t Instr = 0b0000'0100'0010'0000'0100'0000'0000'0000; Instr |= op << 10; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= (static_cast(imm5b) & 0b11111) << 16; Instr |= (static_cast(imm5) & 0b11111) << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEIntegerCompareImm(uint32_t lt, uint32_t ne, uint32_t imm7, SubRegSize size, PRegister pg, ZRegister zn, PRegister pd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LOGMAN_THROW_A_FMT(imm7 < 128, "Invalid imm ({}). Must be within [0, 128]", imm7); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0010'0100'0010'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= imm7 << 14; Instr |= lt << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= ne << 4; Instr |= pd.Idx(); dc32(Instr); } void SVEIntegerCompareSignedImm(uint32_t op, uint32_t o2, uint32_t ne, int32_t imm5, SubRegSize size, PRegister pg, ZRegister zn, PRegister pd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LOGMAN_THROW_A_FMT(imm5 >= -16 && imm5 <= 15, "Invalid imm ({}). 
Must be within [-16, 15].", imm5); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0010'0101'0000'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= (static_cast(imm5) & 0b1'1111) << 16; Instr |= op << 15; Instr |= o2 << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= ne << 4; Instr |= pd.Idx(); dc32(Instr); } void SVEFloatCompareVector(uint32_t op, uint32_t o2, uint32_t o3, SubRegSize size, ZRegister zm, PRegister pg, ZRegister zn, PRegister pd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Can't use 8-bit size"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0110'0101'0000'0000'0100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= op << 15; Instr |= o2 << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= o3 << 4; Instr |= pd.Idx(); dc32(Instr); } void SVEIntegerMinMaxDifferencePredicated(uint32_t opc, uint32_t U, SubRegSize size, PRegister pg, ZRegister zdn, ZRegister zm, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LOGMAN_THROW_A_FMT(zd == zdn, "zd needs to equal zdn"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0000'0100'0000'1000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 17; Instr |= U << 16; Instr |= pg.Idx() << 10; Instr |= zm.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBitWiseShiftImmediatePred(SubRegSize size, uint32_t opc, uint32_t L, uint32_t U, PRegister pg, ZRegister zd, ZRegister zdn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(zd == zdn, "zd needs to equal zdn"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing 
predicate"); const bool IsLeftShift = L != 0; const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, Shift, IsLeftShift); uint32_t Instr = 0b0000'0100'0000'0000'1000'0000'0000'0000; Instr |= tszh << 22; Instr |= opc << 18; Instr |= L << 17; Instr |= U << 16; Instr |= pg.Idx() << 10; Instr |= tszl_imm3 << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEBitWiseShiftImmediateUnpred(SubRegSize size, uint32_t opc, ZRegister zd, ZRegister zn, uint32_t Shift) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); const bool IsLeftShift = opc == 0b11; const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, Shift, IsLeftShift); uint32_t Instr = 0b0000'0100'0010'0000'1001'0000'0000'0000; Instr |= tszh << 22; Instr |= tszl_imm3 << 16; Instr |= opc << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVE2BitwiseTernary(uint32_t opc, uint32_t o2, ZRegister zm, ZRegister zk, ZRegister zd, ZRegister zdn) { LOGMAN_THROW_A_FMT(zd == zdn, "zd needs to equal zdn"); uint32_t Instr = 0b0000'0100'0010'0000'0011'1000'0000'0000; Instr |= opc << 22; Instr |= zm.Idx() << 16; Instr |= o2 << 10; Instr |= zk.Idx() << 5; Instr |= zdn.Idx(); dc32(Instr); } void SVEPermuteVector(uint32_t op0, ARMEmitter::ZRegister zd, ARMEmitter::ZRegister zm, uint32_t Imm) { constexpr uint32_t Op = 0b0000'0101'0010'0000'000 << 13; uint32_t Instr = Op; Instr |= op0 << 22; Instr |= (Imm >> 3) << 16; Instr |= (Imm & 0b111) << 10; Instr |= zm.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEIntegerCompareVector(uint32_t op, uint32_t o2, uint32_t ne, SubRegSize size, ZRegister zm, PRegister pg, ZRegister zn, PRegister pd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit element size"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); constexpr uint32_t Op = 0b0010'0100'0000'0000'000 << 13; uint32_t Instr = Op; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= op << 15; Instr |= 
o2 << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= ne << 4; Instr |= pd.Idx(); dc32(Instr); } void SVEIntegerCompareVectorWide(uint32_t op, uint32_t o2, uint32_t ne, SubRegSize size, PRegister pd, PRegister pg, ZRegister zn, ZRegister zm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i64Bit, "Can't use 64-bit element size"); SVEIntegerCompareVector(op, o2, ne, size, zm, pg, zn, pd); } void SVE2SaturatingExtractNarrow(SubRegSize size, uint32_t opc, uint32_t T, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit && size != SubRegSize::i64Bit, "Can't use 64/128-bit size"); // While not necessarily a left shift, we can piggyback off its // encoding behavior to encode the tszh and tszl bits. const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, 0, true); uint32_t Instr = 0b0100'0101'0010'0000'0100'0000'0000'0000; Instr |= tszh << 22; Instr |= tszl_imm3 << 16; Instr |= opc << 11; Instr |= T << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVE2BitwiseShiftRightNarrow(SubRegSize size, uint32_t shift, uint32_t opc, uint32_t U, uint32_t R, uint32_t T, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit && size != SubRegSize::i64Bit, "Can't use 64/128-bit element size"); const auto [tszh, tszl_imm3] = EncodeSVEShiftImmediate(size, shift); uint32_t Instr = 0b0100'0101'0010'0000'0000'0000'0000'0000; Instr |= tszh << 22; Instr |= tszl_imm3 << 16; Instr |= opc << 13; Instr |= U << 12; Instr |= R << 11; Instr |= T << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEFloatUnary(uint32_t opc, SubRegSize size, PRegister pg, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "Unsupported size in {}", __func__); uint32_t Instr = 0b0110'0101'0000'1100'1010'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= pg.Idx() << 10; Instr |= 
zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVE2IntegerMultiplyVectors(uint32_t opc, SubRegSize size, ZRegister zm, ZRegister zn, ZRegister zd) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); constexpr uint32_t Op = 0b0000'0100'0010'0000'0110 << 12; uint32_t Instr = Op; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= opc << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEPermuteVectorPredicated(uint32_t opc1, uint32_t opc2, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0000'0101'0010'0000'1000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc1 << 16; Instr |= opc2 << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEPropagateBreak(uint32_t opc, uint32_t op2, uint32_t op3, PRegister pd, PRegister pg, PRegister pn, PRegister pm) { uint32_t Instr = 0b0010'0101'0000'0000'0000'0000'0000'0000; Instr |= opc << 20; Instr |= op2 << 14; Instr |= op3 << 4; Instr |= pm.Idx() << 16; Instr |= pg.Idx() << 10; Instr |= pn.Idx() << 5; Instr |= pd.Idx(); dc32(Instr); } void SVEPredicateMisc(uint32_t op0, uint32_t op2, uint32_t op3, SubRegSize size, PRegister pd) { // Note: op2 combines op1 like [op1:op2], since they're adjacent. 
LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); uint32_t Instr = 0b0010'0101'0001'0000'1100'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= op0 << 16; Instr |= op2 << 9; Instr |= op3 << 5; Instr |= pd.Idx(); dc32(Instr); } void SVEIntCompareScalar(uint32_t op1, uint32_t b4, uint32_t op2, SubRegSize size, Register rn, Register rm) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Can't use 128-bit size"); uint32_t Instr = 0b0010'0101'0010'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= rm.Idx() << 16; Instr |= op1 << 10; Instr |= rn.Idx() << 5; Instr |= b4 << 4; Instr |= op2; dc32(Instr); } void SVEWriteFFR(uint32_t op0, uint32_t op1, uint32_t op2, uint32_t op3, uint32_t op4) { uint32_t Instr = 0b0010'0101'0010'1000'1001'0000'0000'0000; Instr |= op0 << 18; Instr |= op1 << 16; Instr |= op2 << 9; Instr |= op3 << 5; Instr |= op4; dc32(Instr); } void SVEFPUnaryOpsUnpredicated(uint32_t opc, SubRegSize size, ZRegister zd, ZRegister zn) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); uint32_t Instr = 0b0110'0101'0000'1000'0011'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEFPSerialReductionPredicated(uint32_t opc, SubRegSize size, VRegister vd, PRegister pg, VRegister vn, ZRegister zm) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); LOGMAN_THROW_A_FMT(vd == vn, "vn must be the same as vd"); uint32_t Instr = 0b0110'0101'0001'1000'0010'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= opc << 16; Instr |= pg.Idx() << 10; Instr |= zm.Idx() << 5; Instr |= vd.Idx(); dc32(Instr); } void SVEFPCompareWithZero(uint32_t eqlt, uint32_t ne, SubRegSize size, PRegister pd, PRegister pg, ZRegister zn) { 
LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0110'0101'0001'0000'0010'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= eqlt << 16; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= ne << 4; Instr |= pd.Idx(); dc32(Instr); } void SVEFPMultiplyAdd(uint32_t opc, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn, ZRegister zm) { // NOTE: opc also includes the op0 bit (bit 15) like op0:opc, since the fields are adjacent LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); uint32_t Instr = 0b0110'0101'0010'0000'0000'0000'0000'0000; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= zm.Idx() << 16; Instr |= opc << 13; Instr |= pg.Idx() << 10; Instr |= zn.Idx() << 5; Instr |= zd.Idx(); dc32(Instr); } void SVEFPMultiplyAddIndexed(uint32_t op, SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) { LOGMAN_THROW_A_FMT(IsStandardFloatSize(size), "SubRegSize must be 16-bit, 32-bit, or 64-bit"); LOGMAN_THROW_A_FMT((size <= SubRegSize::i32Bit && zm <= ZReg::z7) || (size == SubRegSize::i64Bit && zm <= ZReg::z15), "16-bit and 32-bit indexed variants may only use Zm between z0-z7\n" "64-bit variants may only use Zm between z0-z15"); const auto Underlying = FEXCore::ToUnderlying(size); const uint32_t IndexMax = (16 / (1U << Underlying)) - 1; LOGMAN_THROW_A_FMT(index <= IndexMax, "Index must be within 0-{}", IndexMax); // Can be bit 20 or 19 depending on whether or not the element size is 64-bit. 
const auto IndexShift = 19 + static_cast(size == SubRegSize::i64Bit);

    uint32_t Instr = 0b0110'0100'0010'0000'0000'0000'0000'0000;
    Instr |= Underlying << 22;
    // NOTE(review): index was asserted to be <= IndexMax above, and IndexMax appears to be
    // at most 7 for the accepted element sizes (assuming ToUnderlying(i16Bit) == 1), so this
    // mask should never contribute a set bit. For 16-bit elements the upper index bit
    // presumably belongs in bit 22, which `Underlying << 22` also writes - please confirm
    // the 16-bit encoding against the architecture reference.
    Instr |= (index & 0b1000) << 19;
    Instr |= (index & 0b0111) << IndexShift;
    Instr |= zm.Idx() << 16;
    Instr |= op << 10;
    Instr |= zn.Idx() << 5;
    Instr |= zda.Idx();
    dc32(Instr);
  }

  // Widening floating-point multiply-add with an indexed zm operand.
  // Only a 32-bit destination is accepted. The 3-bit index is split: its upper two bits
  // are placed at bits 20:19 and its lowest bit at bit 11. zm is limited to z0-z7.
  // Other fields: o2 at bit 22, zm at bit 16, op at bit 13, T at bit 10, zn at bit 5, zda low.
  void SVEFPMultiplyAddLongIndexed(uint32_t o2, uint32_t op, uint32_t T, SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm, uint32_t index) {
    LOGMAN_THROW_A_FMT(dstsize == SubRegSize::i32Bit, "Destination size must be 32-bit.");
    LOGMAN_THROW_A_FMT(index <= 7, "Index ({}) must be within [0, 7]", index);
    LOGMAN_THROW_A_FMT(zm <= ZReg::z7, "zm (z{}) must be within [z0, z7]", zm.Idx());

    uint32_t Inst = 0b0110'0100'1010'0000'0100'0000'0000'0000;
    Inst |= o2 << 22;
    Inst |= (index & 0b110) << 18;
    Inst |= zm.Idx() << 16;
    Inst |= op << 13;
    Inst |= (index & 0b001) << 11;
    Inst |= T << 10;
    Inst |= zn.Idx() << 5;
    Inst |= zda.Idx();
    dc32(Inst);
  }

  // Widening floating-point multiply-add (vector operands, no index).
  // Only a 32-bit destination is accepted.
  // Field placement: o2 at bit 22, zm at bit 16, op at bit 13, T at bit 10, zn at bit 5, zda low.
  void SVEFPMultiplyAddLong(uint32_t o2, uint32_t op, uint32_t T, SubRegSize dstsize, ZRegister zda, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(dstsize == SubRegSize::i32Bit, "Destination size must be 32-bit.");

    uint32_t Instr = 0b0110'0100'1010'0000'1000'0000'0000'0000;
    Instr |= o2 << 22;
    Instr |= zm.Idx() << 16;
    Instr |= op << 13;
    Instr |= T << 10;
    Instr |= zn.Idx() << 5;
    Instr |= zda.Idx();
    dc32(Instr);
  }

  // Floating-point matrix multiply-accumulate. Only 32-bit and 64-bit elements are accepted.
  // Field placement: size at bit 22, zm at bit 16, zn at bit 5, zda in the lowest bits.
  void SVEFPMatrixMultiplyAccumulate(SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit, "SubRegSize must be 32-bit or 64-bit");

    uint32_t Instr = 0b0110'0100'0010'0000'1110'0100'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= zm.Idx() << 16;
    Instr |= zn.Idx() << 5;
    Instr |= zda.Idx();
    dc32(Instr);
  }

  // Predicate count into a general-purpose register.
  // Field placement: size at bit 22, opc at bit 16, pg at bit 10, pn at bit 5, rd low.
  void SVEPredicateCount(uint32_t opc, SubRegSize size, XRegister rd, PRegister pg, PRegister pn) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");

    uint32_t Instr
= 0b0010'0101'0010'0000'1000'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= opc << 16;
    Instr |= pg.Idx() << 10;
    Instr |= pn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Element-count based operation on a vector register using a predicate pattern.
  // imm4 is a multiplier in [1, 16]; it is stored biased by one (imm4 - 1) at bit 16.
  // Other fields: size at bit 22, b20 at bit 20, op1 at bit 10, pattern at bit 5, zdn low.
  void SVEElementCount(uint32_t b20, uint32_t op1, SubRegSize size, ZRegister zdn, PredicatePattern pattern, uint32_t imm4) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");
    LOGMAN_THROW_A_FMT(imm4 >= 1 && imm4 <= 16, "Immediate must be between 1-16 inclusive");

    uint32_t Instr = 0b0000'0100'0010'0000'1100'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= b20 << 20;
    Instr |= (imm4 - 1) << 16;
    Instr |= op1 << 10;
    Instr |= FEXCore::ToUnderlying(pattern) << 5;
    Instr |= zdn.Idx();
    dc32(Instr);
  }

  // Increment/decrement of a scalar register by a predicate count.
  // Field placement: size at bit 22, op0 at bit 18, b16 at bit 16, op1 at bit 11,
  // opc at bit 9, pm at bit 5 and rdn in the lowest bits.
  void SVEIncDecPredicateCountScalar(uint32_t op0, uint32_t op1, uint32_t opc, uint32_t b16, SubRegSize size, Register rdn, PRegister pm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");

    uint32_t Instr = 0b0010'0101'0010'1000'1000'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= op0 << 18;
    Instr |= b16 << 16;
    Instr |= op1 << 11;
    Instr |= opc << 9;
    Instr |= pm.Idx() << 5;
    Instr |= rdn.Idx();
    dc32(Instr);
  }

  // Vector form of the predicate-count increment/decrement.
  // Reuses the scalar encoder by passing the vector's register index as the
  // destination register; additionally rejects 8-bit elements.
  void SVEIncDecPredicateCountVector(uint32_t op0, uint32_t op1, uint32_t opc, uint32_t b16, SubRegSize size, ZRegister zdn, PRegister pm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Cannot use 8-bit element size");
    SVEIncDecPredicateCountScalar(op0, op1, opc, b16, size, Register {zdn.Idx()}, pm);
  }

  // Common encoder for the predicated SVE2 integer operations defined after it.
  // Field placement: size at bit 22, op0 at bit 16, op1 at bit 13, pg at bit 10,
  // zn at bit 5 and zd in the lowest bits.
  void SVE2IntegerPredicated(uint32_t op0, uint32_t op1, SubRegSize size, ZRegister zd, PRegister pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit size");
    LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate");

    uint32_t Instr = 0b0100'0100'0000'0000'0000'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= op0 << 16; // Intentionally 16 instead of 17 to handle bit range nicer
    Instr |= op1 << 13;
    Instr |=
pg.Idx() << 10;
    Instr |= zn.Idx() << 5;
    Instr |= zd.Idx();
    dc32(Instr);
  }

  // Pairwise add-and-accumulate-long. U is merged into the lowest bit of op0.
  // Only 16-bit, 32-bit and 64-bit element sizes are accepted.
  void SVE2IntegerPairwiseAddAccumulateLong(uint32_t U, SubRegSize size, ZRegister zda, PRegisterMerge pg, ZRegister zn) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i16Bit || size == SubRegSize::i32Bit || size == SubRegSize::i64Bit,
                       "SubRegSize must be 16-bit, "
                       "32-bit, or 64-bit");
    SVE2IntegerPredicated((0b0010 << 1) | U, 0b101, size, zda, pg, zn);
  }

  // Predicated unary integer operation; forwards op0 unchanged with op1 fixed to 0b101.
  void SVE2IntegerUnaryOpsPredicated(uint32_t op0, SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn) {
    SVE2IntegerPredicated(op0, 0b101, size, zd, pg, zn);
  }

  // Destructive shift-left family: zd must equal zn, and zm is written into the
  // source-register field of the shared encoder. op1 is fixed to 0b100.
  void SVE2SaturatingRoundingBitwiseShiftLeft(uint32_t op0, SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(zd == zn, "zn needs to equal zd");
    SVE2IntegerPredicated(op0, 0b100, size, zd, pg, zm);
  }

  // Destructive halving add/sub family: zd must equal zn. RSU occupies the low
  // three bits of op0 beneath a fixed 0b10 prefix. op1 is fixed to 0b100.
  void SVE2IntegerHalvingPredicated(uint32_t RSU, SubRegSize size, PRegister pg, ZRegister zd, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(zd == zn, "zn needs to equal zd");
    SVE2IntegerPredicated((0b10 << 3) | RSU, 0b100, size, zd, pg, zm);
  }

  // Destructive pairwise arithmetic: zd must equal zn. op0 is built as
  // 0b10 : opc : U, and op1 is fixed to 0b101.
  void SVEIntegerPairwiseArithmetic(uint32_t opc, uint32_t U, SubRegSize size, PRegister pg, ZRegister zd, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(zd == zn, "zn needs to equal zd");
    SVE2IntegerPredicated((0b10 << 3) | (opc << 1) | U, 0b101, size, zd, pg, zm);
  }

  // Destructive saturating add/sub: zd must equal zn. op0 is built as 0b11 : opc,
  // and op1 is fixed to 0b100.
  void SVE2IntegerSaturatingAddSub(uint32_t opc, SubRegSize size, ZRegister zd, PRegisterMerge pg, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(zd == zn, "zn needs to equal zd");
    SVE2IntegerPredicated((0b11 << 3) | opc, 0b100, size, zd, pg, zm);
  }

  // Common encoder for unpredicated integer multiply-add style operations.
  // Field placement: size at bit 22, zm at bit 16, op0 at bit 10, zn at bit 5, zd low.
  void SVEIntegerMultiplyAddUnpredicated(uint32_t op0, SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "Cannot use 128-bit element size");

    uint32_t Instr = 0b0100'0100'0000'0000'0000'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= zm.Idx() << 16;
    Instr |= op0 << 10;
    Instr |= zn.Idx() << 5;
    Instr |= zd.Idx();
dc32(Instr);
  }

  // Integer dot product with rotation. Restricts the element size to 32-bit or
  // 64-bit and then defers to the complex multiply-add encoder.
  void SVEIntegerDotProduct(uint32_t op, SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, Rotation rot) {
    LOGMAN_THROW_A_FMT(size == SubRegSize::i32Bit || size == SubRegSize::i64Bit,
                       "Dot product must only use 32-bit or 64-bit element "
                       "sizes");
    SVEIntegerComplexMulAdd(op, size, zda, zn, zm, rot);
  }

  // Integer complex multiply-add. The rotation is packed into the low two bits
  // of op0, below `op`, before calling the shared multiply-add encoder.
  void SVEIntegerComplexMulAdd(uint32_t op, SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm, Rotation rot) {
    const auto op0 = op << 2 | FEXCore::ToUnderlying(rot);
    SVEIntegerMultiplyAddUnpredicated(op0, size, zda, zn, zm);
  }

  // Saturating multiply-add (interleaved). 8-bit elements are rejected here;
  // 128-bit elements are rejected by the shared encoder it calls.
  void SVE2SaturatingMulAddInterleaved(uint32_t op0, SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit, "Element size may only be 16-bit, 32-bit, or 64-bit");
    SVEIntegerMultiplyAddUnpredicated(op0, size, zda, zn, zm);
  }

  // Integer multiply-add long. Shares its size restrictions and encoding path
  // with SVE2SaturatingMulAddInterleaved.
  void SVE2IntegerMulAddLong(uint32_t op0, SubRegSize size, ZRegister zda, ZRegister zn, ZRegister zm) {
    SVE2SaturatingMulAddInterleaved(op0, size, zda, zn, zm);
  }

  // Common encoder for the widening integer arithmetic helpers below.
  // Performs no size validation itself; callers are expected to check.
  // Field placement: size at bit 22, zm at bit 16, op at bit 13, SUT at bit 10, zn at bit 5, zd low.
  void SVE2WideningIntegerArithmetic(uint32_t op, uint32_t SUT, SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    uint32_t Instr = 0b0100'0101'0000'0000'0000'0000'0000'0000;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= zm.Idx() << 16;
    Instr |= op << 13;
    Instr |= SUT << 10;
    Instr |= zn.Idx() << 5;
    Instr |= zd.Idx();
    dc32(Instr);
  }

  // Integer add/subtract long. Rejects 8-bit and 128-bit element sizes.
  void SVE2IntegerAddSubLong(uint32_t op, uint32_t SUT, SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");
    SVE2WideningIntegerArithmetic(op, SUT, size, zd, zn, zm);
  }

  // Integer add/subtract wide. Same size restrictions as the long form; op is fixed to 0b10.
  void SVE2IntegerAddSubWide(uint32_t SUT, SubRegSize size, ZRegister zd, ZRegister zn, ZRegister zm) {
    LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");
    SVE2WideningIntegerArithmetic(0b10, SUT, size, zd, zn, zm);
  }

  // Integer multiply long; op is fixed to 0b11. Size validation depends on SUT.
  void SVE2IntegerMultiplyLong(uint32_t SUT, SubRegSize size, ZRegister
zd, ZRegister zn, ZRegister zm) {
    // PMULLB and PMULLT support the use of 128-bit element sizes (with the SVE2PMULL128 extension)
    if (SUT == 0b010 || SUT == 0b011) {
      LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i32Bit, "Can't use 8-bit or 32-bit element size");

      // 128-bit variant is encoded as if it were 8-bit (0b00)
      if (size == SubRegSize::i128Bit) {
        size = SubRegSize::i8Bit;
      }
    } else {
      LOGMAN_THROW_A_FMT(size != SubRegSize::i8Bit && size != SubRegSize::i128Bit, "Can't use 8-bit or 128-bit element size");
    }

    SVE2WideningIntegerArithmetic(0b11, SUT, size, zd, zn, zm);
  }

  // Result of EncodeSVEShiftImmediate: the encoded shift split into its upper
  // part (tszh) and its lower five bits (tszl combined with imm3).
  struct SVEEncodedImmShift {
    uint32_t tszh;
    uint32_t tszl_imm3;
  };

  // Helper for encoding shift immediates that make use of the tszh:tszl and imm3 field.
  // Left shifts must be in [0, element_size - 1]; right shifts must be in [1, element_size].
  static constexpr SVEEncodedImmShift EncodeSVEShiftImmediate(SubRegSize size, uint32_t shift, bool is_left_shift = false) {
    const uint32_t element_size = SubRegSizeInBits(size);
    if (is_left_shift) {
      LOGMAN_THROW_A_FMT(shift < element_size, "Invalid left shift value ({}). Must be within [0, {}]", shift, element_size - 1);
    } else {
      LOGMAN_THROW_A_FMT(shift > 0 && shift <= element_size, "Invalid right shift value ({}). Must be within [1, {}]", shift, element_size);
    }

    // Both left and right shifts encode their shift as if it were
    // expanding the tszh:tszl (tsize) bits to the left in order to accommodate
    // larger shift values. e.g. (B: tsize=0b0001, H: tsize=0b001x, etc)
    //
    // The difference is in how they're encoded. Left shifts are trivial and
    // encode as element_size_in_bits + shift, which works nicely since
    // the size will just occupy the next bit in tsize leaving the previous
    // one for encoding larger shifts.
    //
    // Right shifts instead encode it like a subtraction. e.g. A shift of 1
    // would encode like (S: tsize=0b0111 imm3=0b111, where 64 - 1 = 63, etc).
    // so the lower in value the set bits are, the larger the shift.
    const uint32_t encoded_shift = is_left_shift ?
element_size + shift : (2 * element_size) - shift;
    return {
      // Everything above the low five bits of the encoded value.
      .tszh = encoded_shift >> 5,
      // The low five bits of the encoded value.
      .tszl_imm3 = encoded_shift & 0b11111,
    };
  }

  // Alias that returns the equivalently sized unsigned type for a floating-point type T.
  template requires (std::is_same_v || std::is_same_v)
  using FloatToEquivalentUInt = std::conditional_t, uint32_t, uint64_t>;

#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
  // Determines if a floating-point value is capable of being converted
  // into an 8-bit immediate. See pseudocode definition of VFPExpandImm
  // in ARM A-profile reference manual for a general overview of how this was derived.
  //
  // The value passes when all three checks hold: the low mantissa bits are zero,
  // the selected exponent bits are uniformly set or clear, and the two bits
  // directly below the sign bit differ from each other.
  template requires (std::is_same_v || std::is_same_v)
  [[nodiscard]]
  static bool IsValidFPValueForImm8(T value) {
    const uint64_t bits = std::bit_cast>(value);
    // Row selector for the mask tables below, derived from sizeof(T).
    // Presumably 4-byte and 8-byte types map to the "single" and "double" rows - confirm
    // against FEXCore::ilog2.
    const uint64_t datasize_idx = FEXCore::ilog2(sizeof(T)) - 1;

    static constexpr std::array mantissa_masks {
      0x00000000'0000003FULL, // half   (bits [5:0])
      0x00000000'0007FFFFULL, // single (bits [18:0])
      0x0000FFFF'FFFFFFFFULL, // double (bits [47:0])
    };
    const auto mantissa_mask = mantissa_masks[datasize_idx];

    // Relevant mantissa bits must be set to zero
    if ((bits & mantissa_mask) != 0) {
      return false;
    }

    static constexpr std::array exponent_masks {
      0x00000000'00003000ULL, // half   (bits [13:12])
      0x00000000'3E000000ULL, // single (bits [29:25])
      0x3FC00000'00000000ULL, // double (bits [61:54])
    };
    const auto exponent_mask = exponent_masks[datasize_idx];
    const auto masked_exponent = bits & exponent_mask;

    // Relevant exponent bits must either be all set or all cleared.
    if (masked_exponent != 0 && masked_exponent != exponent_mask) {
      return false;
    }

    // The two bits before the sign bit must be inverses of each other.
const auto datasize = 8ULL * sizeof(T);
    // XOR-ing the value with itself shifted left by one makes each result bit the
    // XOR of a bit and the bit below it, so a single masked bit tests "these two differ".
    const auto inverse = bits ^ (bits << 1);
    const auto inverse_mask = 1ULL << (datasize - 2);
    if ((inverse & inverse_mask) == 0) {
      return false;
    }

    return true;
  }
#endif

protected:
  // Packs a 32-bit float into an 8-bit immediate:
  //   bit 7    <- bit 31 of the float (sign)
  //   bit 6    <- bit 29 of the float
  //   bits 5:0 <- bits 24:19 of the float
  // With assertions enabled, values that fail IsValidFPValueForImm8 are reported.
  static uint32_t FP32ToImm8(float value) {
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
    LOGMAN_THROW_A_FMT(IsValidFPValueForImm8(value), "Value ({}) cannot be encoded into an 8-bit immediate", value);
#endif

    const auto bits = std::bit_cast(value);
    const auto sign = (bits & 0x80000000) >> 24;
    const auto expb2 = (bits & 0x20000000) >> 23;
    const auto b5_to_0 = (bits >> 19) & 0x3F;

    return sign | expb2 | b5_to_0;
  }

  // Packs a 64-bit double into an 8-bit immediate:
  //   bit 7    <- bit 63 of the double (sign)
  //   bit 6    <- bit 61 of the double
  //   bits 5:0 <- bits 53:48 of the double
  // With assertions enabled, values that fail IsValidFPValueForImm8 are reported.
  static uint32_t FP64ToImm8(double value) {
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
    LOGMAN_THROW_A_FMT(IsValidFPValueForImm8(value), "Value ({}) cannot be encoded into an 8-bit immediate", value);
#endif

    const auto bits = std::bit_cast(value);
    const auto sign = (bits & 0x80000000'00000000) >> 56;
    const auto expb2 = (bits & 0x20000000'00000000) >> 55;
    const auto b5_to_0 = (bits >> 48) & 0x3F;

    return static_cast(sign | expb2 | b5_to_0);
  }

private:
  // Handling for signed 8-bit immediate shifts (e.g.
in cpy/dup)
  // `imm` is the value to place in the 8-bit immediate field; `is_shift` is 1 when the
  // original immediate was divided by 256 and must be re-applied as a left shift by 8.
  struct HandledSImm8Shift {
    int32_t imm;
    uint32_t is_shift;
  };

  // Splits a signed immediate into an 8-bit value plus an optional "shifted by 8" flag.
  // - Immediates within [-128, 127] are returned unchanged with is_shift = 0.
  // - Otherwise the immediate must fit in 16 signed bits and be a multiple of 256;
  //   it is divided by 256 and returned with is_shift = 1.
  // 8-bit elements only accept the unshifted form.
  static constexpr HandledSImm8Shift HandleSVESImm8Shift(SubRegSize size, int32_t imm) {
    const int32_t imm8_limit = 128;
    const bool is_int8_imm = -imm8_limit <= imm && imm < imm8_limit;

    if (size == SubRegSize::i8Bit) {
      LOGMAN_THROW_A_FMT(is_int8_imm, "Can't perform LSL #8 shift on 8-bit elements.");
    }

    uint32_t shift = 0;
    if (!is_int8_imm) {
      const int32_t imm16_limit = 32768;
      const bool is_int16_imm = -imm16_limit <= imm && imm < imm16_limit;

      LOGMAN_THROW_A_FMT(is_int16_imm, "Immediate ({}) must be a 16-bit value within [-32768, 32512]", imm);
      LOGMAN_THROW_A_FMT((imm % 256) == 0, "Immediate ({}) must be a multiple of 256", imm);

      imm /= 256;
      shift = 1;
    }

    return {
      .imm = imm,
      .is_shift = shift,
    };
  }

#ifndef INCLUDED_BY_EMITTER
}; // struct LoadstoreEmitterOps
} // namespace ARMEmitter
#endif
================================================ FILE: CodeEmitter/CodeEmitter/ScalarOps.inl ================================================
// SPDX-License-Identifier: MIT
/* Scalar instruction emitters.
 *
 * These contain instruction emitters for scalar ASIMD operations explicitly.
 * Some of these emitter arguments might seem a bit strange at first glance,
 * but that is because ARM's instruction encodings for these instructions are a hot mess.
 *
 * Specifically FP16 was an afterthought for these scalar operations, using a `ScalarRegSize` with
 * 16-bit wouldn't encode an FP16 instruction because they are a different instruction class instead.
 *
 * Most FP16 operations instead have their own freestanding implementation using `HRegister` arguments.
 *
 * Meanwhile other FP32 and FP64 instructions will use `ScalarRegSize`, supporting both those sizes.
 *
 * For Scalar integer operations, these instructions will mostly support all `ScalarRegSize` operations.
 * Exceptions to this rule will have asserts in the emitter implementation when misused.
* */
#pragma once
#ifndef INCLUDED_BY_EMITTER
#include
namespace ARMEmitter {
struct EmitterOps : Emitter {
#endif

public:
  // Advanced SIMD scalar copy

  // Copies element `Index` of rn into the scalar rd.
  // imm5 combines the element index with a one-hot size marker: the element size in
  // bytes occupies the low bits and the index is placed directly above it.
  // Index must be smaller than the number of elements of this size in 128 bits.
  void dup(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Index) {
    const uint32_t SizeImm = FEXCore::ToUnderlying(size);
    const uint32_t IndexShift = SizeImm + 1;
    const uint32_t ElementSize = 1U << SizeImm;
    const uint32_t MaxIndex = 128U / (ElementSize * 8);

    LOGMAN_THROW_A_FMT(Index < MaxIndex, "Index too large. Index={}, Max Index: {}", Index, MaxIndex);

    const uint32_t imm5 = (Index << IndexShift) | ElementSize;

    ASIMDScalarCopy(1, 1, imm5, 0b0000, rd, rn);
  }

  // Alias of dup() with identical arguments.
  void mov(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Index) {
    dup(size, rd, rn, Index);
  }

  // Advanced SIMD scalar three same FP16
  // Each emitter below differs only in the three selector values passed to
  // ASIMDScalarThreeSameFP16; note the helper takes the registers as (rm, rn, rd).
  void fmulx(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(0, 0, 0b011, rm, rn, rd);
  }
  void fcmeq(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(0, 0, 0b100, rm, rn, rd);
  }
  void frecps(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(0, 0, 0b111, rm, rn, rd);
  }
  void frsqrts(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(0, 1, 0b111, rm, rn, rd);
  }
  void fcmge(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(1, 0, 0b100, rm, rn, rd);
  }
  void facge(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(1, 0, 0b101, rm, rn, rd);
  }
  void fabd(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(1, 1, 0b010, rm, rn, rd);
  }
  void fcmgt(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(1, 1, 0b100, rm, rn, rd);
  }
  void facgt(HRegister rd, HRegister rn, HRegister rm) {
    ASIMDScalarThreeSameFP16(1, 1, 0b101, rm, rn, rd);
  }

  // Advanced SIMD scalar two-register miscellaneous FP16
  // Each emitter below differs only in the three selector values passed to
  // ASIMDScalarTwoRegMiscFP16; note the helper takes the registers as (rn, rd).
  void fcvtns(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 0, 0b11010, rn, rd);
  }
  void fcvtms(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 0, 0b11011, rn, rd);
  }
  void
fcvtas(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 0, 0b11100, rn, rd);
  }
  void scvtf(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 0, 0b11101, rn, rd);
  }
  // The single-source fcmgt/fcmeq/fcmlt/fcmge/fcmle overloads take no second operand register.
  void fcmgt(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b01100, rn, rd);
  }
  void fcmeq(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b01101, rn, rd);
  }
  void fcmlt(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b01110, rn, rd);
  }
  void fcvtps(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b11010, rn, rd);
  }
  void fcvtzs(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b11011, rn, rd);
  }
  void frecpe(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b11101, rn, rd);
  }
  void frecpx(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(0, 1, 0b11111, rn, rd);
  }
  void fcvtnu(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 0, 0b11010, rn, rd);
  }
  void fcvtmu(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 0, 0b11011, rn, rd);
  }
  void fcvtau(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 0, 0b11100, rn, rd);
  }
  void ucvtf(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 0, 0b11101, rn, rd);
  }
  void fcmge(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 1, 0b01100, rn, rd);
  }
  void fcmle(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 1, 0b01101, rn, rd);
  }
  void fcvtpu(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 1, 0b11010, rn, rd);
  }
  void fcvtzu(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 1, 0b11011, rn, rd);
  }
  void frsqrte(HRegister rd, HRegister rn) {
    ASIMDScalarTwoRegMiscFP16(1, 1, 0b11101, rn, rd);
  }

  // Advanced SIMD scalar three same extra
  // Both emitters in this group accept only 16-bit and 32-bit sizes and pass the
  // registers to the helper as (rm, rn, rd).
  void sqrdmlah(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i16Bit || size == ScalarRegSize::i32Bit, "Only supports 16/32-bit");
    ASIMDScalarThreeSameExtra(1, size, 0b0000, rm, rn, rd);
  }
  void sqrdmlsh(ScalarRegSize
size, VRegister rd, VRegister rn, VRegister rm) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i16Bit || size == ScalarRegSize::i32Bit, "Only supports 16/32-bit");
    ASIMDScalarThreeSameExtra(1, size, 0b0001, rm, rn, rd);
  }

  // Advanced SIMD scalar two-register miscellaneous
  // The emitters in this group forward to ASIMDScalar2RegMisc(0, <selector>, <size>, <opcode>, rd, rn).
  // suqadd and sqabs place no restriction on `size` here.
  void suqadd(ScalarRegSize size, VRegister rd, VRegister rn) {
    ASIMDScalar2RegMisc(0, 0, size, 0b00011, rd, rn);
  }
  void sqabs(ScalarRegSize size, VRegister rd, VRegister rn) {
    ASIMDScalar2RegMisc(0, 0, size, 0b00111, rd, rn);
  }
  ///< Comparison against 0.0
  void cmgt(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 0, size, 0b01000, rd, rn);
  }
  ///< Comparison against 0.0
  void cmeq(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 0, size, 0b01001, rd, rn);
  }
  ///< Comparison against 0.0
  void cmlt(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 0, size, 0b01010, rd, rn);
  }
  // Only the 64-bit form is accepted.
  void abs(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 0, size, 0b01011, rd, rn);
  }
  ///< size is destination size.
  void sqxtn(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "64-bit destination not supported");
    ASIMDScalar2RegMisc(0, 0, size, 0b10100, rd, rn);
  }
  // Accepts 32-bit or 64-bit. The size handed to the encoder is remapped:
  // 64-bit is passed as ScalarRegSize::i16Bit and 32-bit as ScalarRegSize::i8Bit.
  void fcvtns(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ?
ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 0, ConvertedSize, 0b11010, rd, rn);
  }
  // fcvtms, fcvtas and scvtf accept 32-bit or 64-bit and remap the size handed to the
  // encoder: 64-bit is passed as ScalarRegSize::i16Bit and 32-bit as ScalarRegSize::i8Bit.
  void fcvtms(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 0, ConvertedSize, 0b11011, rd, rn);
  }
  void fcvtas(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 0, ConvertedSize, 0b11100, rd, rn);
  }
  void scvtf(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ?
ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 0, ConvertedSize, 0b11101, rd, rn);
  }
  // The emitters from here through frecpx accept 32-bit or 64-bit and, unlike the
  // conversions above, pass `size` to the encoder unchanged.
  ///< Comparison against 0.0
  void fcmgt(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float compare");
    ASIMDScalar2RegMisc(0, 0, size, 0b01100, rd, rn);
  }
  ///< Comparison against 0.0
  void fcmeq(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float compare");
    ASIMDScalar2RegMisc(0, 0, size, 0b01101, rd, rn);
  }
  ///< Comparison against 0.0
  void fcmlt(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float compare");
    ASIMDScalar2RegMisc(0, 0, size, 0b01110, rd, rn);
  }
  void fcvtps(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 0, size, 0b11010, rd, rn);
  }
  void fcvtzs(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 0, size, 0b11011, rd, rn);
  }
  void frecpe(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 0, size, 0b11101, rd, rn);
  }
  void frecpx(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 0, size, 0b11111, rd, rn);
  }
  // From here on the second argument to ASIMDScalar2RegMisc is 1 instead of 0.
  void usqadd(ScalarRegSize size, VRegister rd, VRegister rn) {
    ASIMDScalar2RegMisc(0, 1, size, 0b00011, rd, rn);
  }
  void
sqneg(ScalarRegSize size, VRegister rd, VRegister rn) {
    ASIMDScalar2RegMisc(0, 1, size, 0b00111, rd, rn);
  }
  ///< Comparison against 0.0
  void cmge(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 1, size, 0b01000, rd, rn);
  }
  ///< Comparison against 0.0
  void cmle(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 1, size, 0b01001, rd, rn);
  }
  // Only the 64-bit form is accepted.
  void neg(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit");
    ASIMDScalar2RegMisc(0, 1, size, 0b01011, rd, rn);
  }
  ///< size is destination.
  void sqxtun(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "64-bit destination not supported");
    ASIMDScalar2RegMisc(0, 1, size, 0b10010, rd, rn);
  }
  ///< size is destination.
  void uqxtn(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "64-bit destination not supported");
    ASIMDScalar2RegMisc(0, 1, size, 0b10100, rd, rn);
  }
  ///< size is destination.
  // Only a 32-bit destination is accepted; the encoder is always given ScalarRegSize::i16Bit.
  void fcvtxn(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, ScalarRegSize::i16Bit, 0b10110, rd, rn);
  }
  // Accepts 32-bit or 64-bit. The size handed to the encoder is remapped:
  // 64-bit is passed as ScalarRegSize::i16Bit and 32-bit as ScalarRegSize::i8Bit.
  void fcvtnu(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ?
ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 1, ConvertedSize, 0b11010, rd, rn);
  }
  // fcvtmu, fcvtau and ucvtf accept 32-bit or 64-bit and remap the size handed to the
  // encoder: 64-bit is passed as ScalarRegSize::i16Bit and 32-bit as ScalarRegSize::i8Bit.
  void fcvtmu(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 1, ConvertedSize, 0b11011, rd, rn);
  }
  void fcvtau(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 1, ConvertedSize, 0b11100, rd, rn);
  }
  void ucvtf(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ?
ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(0, 1, ConvertedSize, 0b11101, rd, rn);
  }
  // The emitters from here through frsqrte accept 32-bit or 64-bit and pass `size`
  // to the encoder unchanged.
  ///< Comparison against 0.0
  void fcmge(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, size, 0b01100, rd, rn);
  }
  ///< Comparison against 0.0
  void fcmle(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, size, 0b01101, rd, rn);
  }
  void fcvtpu(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, size, 0b11010, rd, rn);
  }
  void fcvtzu(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, size, 0b11011, rd, rn);
  }
  void frsqrte(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(0, 1, size, 0b11101, rd, rn);
  }

  // Advanced SIMD scalar pairwise
  // The pairwise emitters pass 1 as the first argument to ASIMDScalar2RegMisc.
  // Only the 64-bit form of addp is accepted.
  void addp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Invalid size selected for addp");
    ASIMDScalar2RegMisc(1, 0, size, 0b11011, rd, rn);
  }
  // HRegister overloads: the size field is a fixed selector chosen per instruction
  // rather than derived from a caller-supplied size.
  void fmaxnmp(HRegister rd, HRegister rn) {
    ASIMDScalar2RegMisc(1, 0, ScalarRegSize::i8Bit, 0b01100, rd.V(), rn.V());
  }
  void faddp(HRegister rd, HRegister rn) {
    ASIMDScalar2RegMisc(1, 0, ScalarRegSize::i8Bit, 0b01101, rd.V(), rn.V());
  }
  void fmaxp(HRegister rd, HRegister rn) {
    ASIMDScalar2RegMisc(1, 0, ScalarRegSize::i8Bit, 0b01111, rd.V(), rn.V());
  }
  void fminnmp(HRegister rd, HRegister rn) {
ASIMDScalar2RegMisc(1, 0, ScalarRegSize::i32Bit, 0b01100, rd.V(), rn.V());
  }
  void fminp(HRegister rd, HRegister rn) {
    ASIMDScalar2RegMisc(1, 0, ScalarRegSize::i32Bit, 0b01111, rd.V(), rn.V());
  }
  // Sized pairwise overloads accept 32-bit or 64-bit.
  // fmaxnmp, faddp and fmaxp remap the size handed to the encoder (64-bit is passed as
  // ScalarRegSize::i16Bit, 32-bit as ScalarRegSize::i8Bit); fminnmp and fminp pass
  // `size` unchanged.
  void fmaxnmp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(1, 1, ConvertedSize, 0b01100, rd, rn);
  }
  void faddp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(1, 1, ConvertedSize, 0b01101, rd, rn);
  }
  void fmaxp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit;
    ASIMDScalar2RegMisc(1, 1, ConvertedSize, 0b01111, rd, rn);
  }
  void fminnmp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(1, 1, size, 0b01100, rd, rn);
  }
  void fminp(ScalarRegSize size, VRegister rd, VRegister rn) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert");
    ASIMDScalar2RegMisc(1, 1, size, 0b01111, rd, rn);
  }

  // Advanced SIMD scalar three different
  ///< size is destination.
void sqdmlal(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i32Bit : ScalarRegSize::i16Bit; ASIMD3RegDifferent(0, ConvertedSize, 0b1001, rd, rn, rm); } ///< size is destination. void sqdmlsl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i32Bit : ScalarRegSize::i16Bit; ASIMD3RegDifferent(0, ConvertedSize, 0b1011, rd, rn, rm); } ///< size is destination. void sqdmull(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? 
ScalarRegSize::i32Bit : ScalarRegSize::i16Bit; ASIMD3RegDifferent(0, ConvertedSize, 0b1101, rd, rn, rm); } // Advanced SIMD scalar three same void sqadd(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(0, size, 0b00001, rd, rn, rm); } void sqsub(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(0, size, 0b00101, rd, rn, rm); } void cmgt(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b00110, rd, rn, rm); } void cmge(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b00111, rd, rn, rm); } void sshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b01000, rd, rn, rm); } void sqshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(0, size, 0b01001, rd, rn, rm); } void srshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b01010, rd, rn, rm); } void sqrshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(0, size, 0b01011, rd, rn, rm); } void add(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b10000, rd, rn, rm); } void cmtst(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(0, size, 0b10001, rd, rn, rm); } void sqdmulh(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i32Bit || size == ScalarRegSize::i16Bit, "Invalid size"); 
ASIMD3RegSame(0, size, 0b10110, rd, rn, rm); } void fmulx(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit; ASIMD3RegSame(0, ConvertedSize, 0b11011, rd, rn, rm); } void fcmeq(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit; ASIMD3RegSame(0, ConvertedSize, 0b11100, rd, rn, rm); } void frecps(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? 
ScalarRegSize::i16Bit : ScalarRegSize::i8Bit; ASIMD3RegSame(0, ConvertedSize, 0b11111, rd, rn, rm); } void frsqrts(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); ASIMD3RegSame(0, size, 0b11111, rd, rn, rm); } void uqadd(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(1, size, 0b00001, rd, rn, rm); } void uqsub(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(1, size, 0b00101, rd, rn, rm); } void cmhi(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 0b00110, rd, rn, rm); } void cmhs(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 0b00111, rd, rn, rm); } void ushl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 0b01000, rd, rn, rm); } void uqshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(1, size, 0b01001, rd, rn, rm); } void urshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 0b01010, rd, rn, rm); } void uqrshl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { ASIMD3RegSame(1, size, 0b01011, rd, rn, rm); } void sub(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 0b10000, rd, rn, rm); } void cmeq(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit, "Only supports 64-bit"); ASIMD3RegSame(1, size, 
0b10001, rd, rn, rm); } void sqrdmulh(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i32Bit || size == ScalarRegSize::i16Bit, "Invalid size"); ASIMD3RegSame(1, size, 0b10110, rd, rn, rm); } void fcmge(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit; ASIMD3RegSame(1, ConvertedSize, 0b11100, rd, rn, rm); } void facge(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); const ScalarRegSize ConvertedSize = size == ScalarRegSize::i64Bit ? ScalarRegSize::i16Bit : ScalarRegSize::i8Bit; ASIMD3RegSame(1, ConvertedSize, 0b11101, rd, rn, rm); } void fabd(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); ASIMD3RegSame(1, size, 0b11010, rd, rn, rm); } void fcmgt(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); ASIMD3RegSame(1, size, 0b11100, rd, rn, rm); } void facgt(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for float convert"); ASIMD3RegSame(1, size, 0b11101, rd, rn, rm); } // Advanced SIMD scalar shift by immediate void sshr(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid 
size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b00000, rd, rn); } void ssra(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b00010, rd, rn); } void srshr(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b00100, rd, rn); } void srsra(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 
2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b00110, rd, rn); } void shl(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); // Shift encoded a bit weirdly. // shift = immh:immb - elementsize but immh is /also/ used for element size. const uint32_t immh = 1 << FEXCore::ToUnderlying(size) | (Shift >> 3); const uint32_t immb = Shift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b01010, rd, rn); } void sqshl(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); // Shift encoded a bit weirdly. // shift = immh:immb - elementsize but immh is /also/ used for element size. 
const uint32_t immh = 1 << FEXCore::ToUnderlying(size) | (Shift >> 3); const uint32_t immb = Shift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b01110, rd, rn); } ///< size is destination void sqshrn(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqshrn"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b10010, rd, rn); } void sqrshrn(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqshrn"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(0, immh, immb, 0b10011, rd, rn); } // TODO: SCVTF, FCVTZS void ushr(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const 
uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b00000, rd, rn); } void usra(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b00010, rd, rn); } void urshr(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b00100, rd, rn); } void ursra(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b00110, rd, rn); } void sri(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { 
LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b01000, rd, rn); } void sli(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < 64, "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size == ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sshr"); // Shift encoded a bit weirdly. // shift = immh:immb - elementsize but immh is /also/ used for element size. const uint32_t immh = 1 << FEXCore::ToUnderlying(size) | (Shift >> 3); const uint32_t immb = Shift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b01010, rd, rn); } void sqshlu(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); // Shift encoded a bit weirdly. // shift = immh:immb - elementsize but immh is /also/ used for element size. const uint32_t immh = 1 << FEXCore::ToUnderlying(size) | (Shift >> 3); const uint32_t immb = Shift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b01100, rd, rn); } void uqshl(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); // Shift encoded a bit weirdly. // shift = immh:immb - elementsize but immh is /also/ used for element size. const uint32_t immh = 1 << FEXCore::ToUnderlying(size) | (Shift >> 3); const uint32_t immb = Shift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b01110, rd, rn); } ///< size is destination. 
void sqshrun(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqshrun"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b10000, rd, rn); } ///< size is destination. void sqrshrun(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqrshrun"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b10001, rd, rn); } ///< size is destination. void uqshrn(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqrshrun"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b10010, rd, rn); } ///< size is destination. 
void uqrshrn(ScalarRegSize size, VRegister rd, VRegister rn, uint32_t Shift) { LOGMAN_THROW_A_FMT(Shift > 0 && Shift < ScalarRegSizeInBits(size), "Invalid shift for sshr"); LOGMAN_THROW_A_FMT(size != ARMEmitter::ScalarRegSize::i64Bit, "Invalid size selected for sqrshrun"); const size_t SubregSizeInBits = ScalarRegSizeInBits(size); // Shift encoded in immh:immb, but inverted with 128-bit source // shift = (esize * 2) - immh:immb const uint32_t InvertedShift = (SubregSizeInBits * 2) - Shift; const uint32_t immh = InvertedShift >> 3; const uint32_t immb = InvertedShift & 0b111; ASIMDScalarShiftByImm(1, immh, immb, 0b10011, rd, rn); } // TODO: UCVTF, FCVTZU // Advanced SIMD scalar x indexed element void sqdmlal(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(0, size, 0b0011, rm, rn, rd, index); } void sqdmlsl(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(0, size, 0b0111, rm, rn, rd, index); } void sqdmull(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(0, size, 0b1011, rm, rn, rd, index); } void sqdmulh(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(0, size, 0b1100, rm, rn, rd, index); } void sqrdmulh(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(0, size, 0b1101, rm, rn, rd, index); } void fmla(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, 
uint32_t index) { ASIMDScalarXIndexedElement(0, size, 0b0001, rm, rn, rd, index); } void fmls(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { ASIMDScalarXIndexedElement(0, size, 0b0101, rm, rn, rd, index); } void fmul(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { ASIMDScalarXIndexedElement(0, size, 0b1001, rm, rn, rd, index); } void sqrdmlah(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(1, size, 0b1101, rm, rn, rd, index); } void sqrdmlsh(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i64Bit, "Scalar size must not be 64-bit"); ASIMDScalarXIndexedElement(1, size, 0b1111, rm, rn, rd, index); } void fmulx(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, uint32_t index) { ASIMDScalarXIndexedElement(1, size, 0b1001, rm, rn, rd, index); } // Floating-point data-processing (1 source) void fmov(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b000000, rd, rn); } void fabs(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b000001, rd, rn); } void fneg(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b000010, rd, rn); } void fsqrt(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b000011, rd, rn); } void frintn(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001000, rd, rn); } void frintp(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001001, rd, rn); } void frintm(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001010, rd, rn); } void frintz(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001011, rd, rn); } void frinta(ScalarRegSize size, VRegister rd, 
VRegister rn) { Float1Source(size, 0, 0, 0b001100, rd, rn); } void frintx(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001110, rd, rn); } void frinti(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b001111, rd, rn); } void frint32z(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b010000, rd, rn); } void frint32x(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b010001, rd, rn); } void frint64z(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b010010, rd, rn); } void frint64x(ScalarRegSize size, VRegister rd, VRegister rn) { Float1Source(size, 0, 0, 0b010011, rd, rn); } void fmov(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000000, rd.V(), rn.V()); } void fabs(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000001, rd.V(), rn.V()); } void fneg(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000010, rd.V(), rn.V()); } void fsqrt(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000011, rd.V(), rn.V()); } void fcvt(DRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000101, rd.V(), rn.V()); } void fcvt(HRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b000111, rd.V(), rn.V()); } void frintn(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001000, rd.V(), rn.V()); } void frintp(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001001, rd.V(), rn.V()); } void frintm(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001010, rd.V(), rn.V()); } void frintz(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001011, rd.V(), rn.V()); } void frinta(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001100, rd.V(), rn.V()); } void frintx(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001110, rd.V(), rn.V()); } void frinti(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b001111, rd.V(), rn.V()); } void frint32z(SRegister rd, SRegister rn) { 
Float1Source(0, 0, 0b00, 0b010000, rd.V(), rn.V()); } void frint32x(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b010001, rd.V(), rn.V()); } void frint64z(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b010010, rd.V(), rn.V()); } void frint64x(SRegister rd, SRegister rn) { Float1Source(0, 0, 0b00, 0b010011, rd.V(), rn.V()); } void fmov(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000000, rd.V(), rn.V()); } void fabs(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000001, rd.V(), rn.V()); } void fneg(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000010, rd.V(), rn.V()); } void fsqrt(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000011, rd.V(), rn.V()); } void fcvt(SRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000100, rd.V(), rn.V()); } void bfcvt(HRegister rd, SRegister rn) { Float1Source(0, 0, 0b01, 0b000110, rd.V(), rn.V()); } void fcvt(HRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b000111, rd.V(), rn.V()); } void frintn(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001000, rd.V(), rn.V()); } void frintp(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001001, rd.V(), rn.V()); } void frintm(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001010, rd.V(), rn.V()); } void frintz(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001011, rd.V(), rn.V()); } void frinta(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001100, rd.V(), rn.V()); } void frintx(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001110, rd.V(), rn.V()); } void frinti(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b001111, rd.V(), rn.V()); } void frint32z(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b010000, rd.V(), rn.V()); } void frint32x(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b010001, rd.V(), rn.V()); } void frint64z(DRegister rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b010010, rd.V(), rn.V()); } void frint64x(DRegister 
rd, DRegister rn) { Float1Source(0, 0, 0b01, 0b010011, rd.V(), rn.V()); } void fmov(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000000, rd.V(), rn.V()); } void fabs(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000001, rd.V(), rn.V()); } void fneg(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000010, rd.V(), rn.V()); } void fsqrt(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000011, rd.V(), rn.V()); } void fcvt(SRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000100, rd.V(), rn.V()); } void fcvt(DRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b000101, rd.V(), rn.V()); } void frintn(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001000, rd.V(), rn.V()); } void frintp(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001001, rd.V(), rn.V()); } void frintm(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001010, rd.V(), rn.V()); } void frintz(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001011, rd.V(), rn.V()); } void frinta(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001100, rd.V(), rn.V()); } void frintx(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001110, rd.V(), rn.V()); } void frinti(HRegister rd, HRegister rn) { Float1Source(0, 0, 0b11, 0b001111, rd.V(), rn.V()); } // Floating-point compare void fcmp(ScalarRegSize Size, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(Size != ScalarRegSize::i8Bit, "8-bit destination not supported"); const auto ConvertedSize = Size == ARMEmitter::ScalarRegSize::i64Bit ? 0b01 : Size == ARMEmitter::ScalarRegSize::i32Bit ? 0b00 : Size == ARMEmitter::ScalarRegSize::i16Bit ? 
0b11 : 0; FloatCompare(0, 0, ConvertedSize, 0b00, 0b00000, rn, rm); }

  // Register-typed floating-point compares.
  // The third FloatCompare argument is the type field used by each overload below:
  // 0b00 for SRegister, 0b01 for DRegister, 0b11 for HRegister.
  // The fifth argument (opcode2) selects the variant:
  // 0b00000 fcmp, 0b01000 fcmp against zero, 0b10000 fcmpe, 0b11000 fcmpe against zero.
  void fcmp(SRegister rn, SRegister rm) {
    FloatCompare(0, 0, 0b00, 0b00, 0b00000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmp(SRegister rn) {
    // v0 is passed for the rm slot of the zero-compare form.
    FloatCompare(0, 0, 0b00, 0b00, 0b01000, rn.V(), VReg::v0);
  }
  void fcmpe(SRegister rn, SRegister rm) {
    FloatCompare(0, 0, 0b00, 0b00, 0b10000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmpe(SRegister rn) {
    FloatCompare(0, 0, 0b00, 0b00, 0b11000, rn.V(), VReg::v0);
  }
  void fcmp(DRegister rn, DRegister rm) {
    FloatCompare(0, 0, 0b01, 0b00, 0b00000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmp(DRegister rn) {
    FloatCompare(0, 0, 0b01, 0b00, 0b01000, rn.V(), VReg::v0);
  }
  void fcmpe(DRegister rn, DRegister rm) {
    FloatCompare(0, 0, 0b01, 0b00, 0b10000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmpe(DRegister rn) {
    FloatCompare(0, 0, 0b01, 0b00, 0b11000, rn.V(), VReg::v0);
  }
  void fcmp(HRegister rn, HRegister rm) {
    FloatCompare(0, 0, 0b11, 0b00, 0b00000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmp(HRegister rn) {
    FloatCompare(0, 0, 0b11, 0b00, 0b01000, rn.V(), VReg::v0);
  }
  void fcmpe(HRegister rn, HRegister rm) {
    FloatCompare(0, 0, 0b11, 0b00, 0b10000, rn.V(), rm.V());
  }
  ///< Compare to #0.0
  void fcmpe(HRegister rn) {
    FloatCompare(0, 0, 0b11, 0b00, 0b11000, rn.V(), VReg::v0);
  }

  // Floating-point immediate
  // Moves an 8-bit-encodable floating-point constant into rd.
  //  - 16-bit: not implemented; logs "Unsupported" and hits FEX_UNREACHABLE.
  //  - 32-bit: type field 0b00, immediate produced by FP32ToImm8(Value).
  //  - 64-bit: type field 0b01, immediate produced by FP64ToImm8(Value).
  //  - any other size: FEX_UNREACHABLE.
  // Note that Value is a float even for the 64-bit form.
  void fmov(ARMEmitter::ScalarRegSize size, ARMEmitter::VRegister rd, float Value) {
    uint32_t M = 0;
    uint32_t S = 0;
    // ptype and imm8 are assigned by every branch that reaches the encoder call below.
    uint32_t ptype;
    uint32_t imm8;
    uint32_t imm5 = 0b0'0000;
    if (size == ARMEmitter::ScalarRegSize::i16Bit) {
      LOGMAN_MSG_A_FMT("Unsupported");
      FEX_UNREACHABLE;
    } else if (size == ARMEmitter::ScalarRegSize::i32Bit) {
      ptype = 0b00;
      imm8 = FP32ToImm8(Value);
    } else if (size == ARMEmitter::ScalarRegSize::i64Bit) {
      ptype = 0b01;
      imm8 = FP64ToImm8(Value);
    } else {
      FEX_UNREACHABLE;
    }

    FloatScalarImmediate(M, S, ptype, imm8, imm5, rd);
  }

  // Raw encoder for the floating-point immediate group.
  // Field placement: M at bit 31, S at bit 29, ptype at bit 22, imm8 at bit 13,
  // imm5 at bit 5, destination register index in the low bits.
  void FloatScalarImmediate(uint32_t M, uint32_t S, uint32_t ptype, uint32_t imm8, uint32_t imm5, ARMEmitter::VRegister rd) {
    constexpr uint32_t Op = 0b0001'1110'0010'0000'0001'00 << 10;
    uint32_t Instr = Op;

    Instr |= M << 31;
    Instr |= S << 29;
    Instr |= ptype << 22;
    Instr |= imm8 << 13;
    Instr |= imm5 << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Floating-point conditional compare
  // Third argument is the type field (0b00 S, 0b01 D, 0b11 H); fourth is the op bit
  // (0 for fccmp, 1 for fccmpe). flags and Cond are forwarded to the encoder unchanged.
  void fccmp(SRegister rn, SRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b00, 0b0, rn.V(), rm.V(), flags, Cond);
  }
  void fccmpe(SRegister rn, SRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b00, 0b1, rn.V(), rm.V(), flags, Cond);
  }
  void fccmp(DRegister rn, DRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b01, 0b0, rn.V(), rm.V(), flags, Cond);
  }
  void fccmpe(DRegister rn, DRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b01, 0b1, rn.V(), rm.V(), flags, Cond);
  }
  void fccmp(HRegister rn, HRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b11, 0b0, rn.V(), rm.V(), flags, Cond);
  }
  void fccmpe(HRegister rn, HRegister rm, StatusFlags flags, Condition Cond) {
    FloatConditionalCompare(0, 0, 0b11, 0b1, rn.V(), rm.V(), flags, Cond);
  }

  // Floating-point data-processing (2 source)
  // Size-parameterised forms. The opcode literal selects the operation:
  // 0b0000 fmul, 0b0001 fdiv, 0b0010 fadd, 0b0011 fsub, 0b0100 fmax,
  // 0b0101 fmin, 0b0110 fmaxnm, 0b0111 fminnm, 0b1000 fnmul.
  void fmul(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0000, rd, rn, rm);
  }
  void fdiv(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0001, rd, rn, rm);
  }
  void fadd(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0010, rd, rn, rm);
  }
  void fsub(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0011, rd, rn, rm);
  }
  void fmax(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0100, rd, rn, rm);
  }
  void fmin(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0101, rd, rn, rm);
  }
  void fmaxnm(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0110, rd, rn, rm);
  }
  void fminnm(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b0111, rd, rn, rm);
  }
  void fnmul(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm) {
    Float2Source(size, 0, 0, 0b1000, rd, rn, rm);
  }

  // Single-precision (SRegister) forms: type field 0b00.
  void fmul(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0000, rd.V(), rn.V(), rm.V());
  }
  void fdiv(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0001, rd.V(), rn.V(), rm.V());
  }
  void fadd(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0010, rd.V(), rn.V(), rm.V());
  }
  void fsub(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0011, rd.V(), rn.V(), rm.V());
  }
  void fmax(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0100, rd.V(), rn.V(), rm.V());
  }
  void fmin(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0101, rd.V(), rn.V(), rm.V());
  }
  void fmaxnm(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0110, rd.V(), rn.V(), rm.V());
  }
  void fminnm(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b0111, rd.V(), rn.V(), rm.V());
  }
  void fnmul(SRegister rd, SRegister rn, SRegister rm) {
    Float2Source(0, 0, 0b00, 0b1000, rd.V(), rn.V(), rm.V());
  }

  // Double-precision (DRegister) forms: type field 0b01.
  void fmul(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0000, rd.V(), rn.V(), rm.V());
  }
  void fdiv(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0001, rd.V(), rn.V(), rm.V());
  }
  void fadd(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0010, rd.V(), rn.V(), rm.V());
  }
  void fsub(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0011, rd.V(), rn.V(), rm.V());
  }
  void fmax(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0100, rd.V(), rn.V(), rm.V());
  }
  void fmin(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0101, rd.V(), rn.V(), rm.V());
  }
  void fmaxnm(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0110, rd.V(), rn.V(), rm.V());
  }
  void fminnm(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b0111, rd.V(), rn.V(), rm.V());
  }
  void fnmul(DRegister rd, DRegister rn, DRegister rm) {
    Float2Source(0, 0, 0b01, 0b1000, rd.V(), rn.V(), rm.V());
  }

  // Half-precision (HRegister) forms: type field 0b11.
  void fmul(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0000, rd.V(), rn.V(), rm.V());
  }
  void fdiv(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0001, rd.V(), rn.V(), rm.V());
  }
  void fadd(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0010, rd.V(), rn.V(), rm.V());
  }
  void fsub(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0011, rd.V(), rn.V(), rm.V());
  }
  void fmax(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0100, rd.V(), rn.V(), rm.V());
  }
  void fmin(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0101, rd.V(), rn.V(), rm.V());
  }
  void fmaxnm(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0110, rd.V(), rn.V(), rm.V());
  }
  void fminnm(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b0111, rd.V(), rn.V(), rm.V());
  }
  void fnmul(HRegister rd, HRegister rn, HRegister rm) {
    Float2Source(0, 0, 0b11, 0b1000, rd.V(), rn.V(), rm.V());
  }

  // Floating-point conditional select
  // Size-parameterised form. Asserts that size is 16, 32 or 64-bit, then maps it to the
  // type field: 64-bit -> 0b01, 32-bit -> 0b00, otherwise (16-bit) -> 0b11.
  void fcsel(ScalarRegSize size, VRegister rd, VRegister rn, VRegister rm, Condition Cond) {
    LOGMAN_THROW_A_FMT(size == ScalarRegSize::i16Bit || size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit,
                       "Invalid size selected for {}", __func__);

    const uint32_t ConvertedSize = size == ScalarRegSize::i64Bit ? 0b01 :
                                   size == ScalarRegSize::i32Bit ? 0b00 :
                                                                   0b11;

    FloatConditionalSelect(0, 0, ConvertedSize, rd, rn, rm, Cond);
  }
  void fcsel(SRegister rd, SRegister rn, SRegister rm, Condition Cond) {
    FloatConditionalSelect(0, 0, 0b00, rd.V(), rn.V(), rm.V(), Cond);
  }
  void fcsel(DRegister rd, DRegister rn, DRegister rm, Condition Cond) {
    FloatConditionalSelect(0, 0, 0b01, rd.V(), rn.V(), rm.V(), Cond);
  }
  void fcsel(HRegister rd, HRegister rn, HRegister rm, Condition Cond) {
    FloatConditionalSelect(0, 0, 0b11, rd.V(), rn.V(), rm.V(), Cond);
  }

  // Floating-point data-processing (3 source)
  // Third argument is the type field (0b00 S, 0b01 D, 0b11 H). The following two
  // arguments are the o1/o0 bits of the encoder: (0,0) fmadd, (0,1) fmsub,
  // (1,0) fnmadd, (1,1) fnmsub.
  void fmadd(SRegister rd, SRegister rn, SRegister rm, SRegister ra) {
    Float3Source(0, 0, 0b00, 0, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fmsub(SRegister rd, SRegister rn, SRegister rm, SRegister ra) {
    Float3Source(0, 0, 0b00, 0, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmadd(SRegister rd, SRegister rn, SRegister rm, SRegister ra) {
    Float3Source(0, 0, 0b00, 1, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmsub(SRegister rd, SRegister rn, SRegister rm, SRegister ra) {
    Float3Source(0, 0, 0b00, 1, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fmadd(DRegister rd, DRegister rn, DRegister rm, DRegister ra) {
    Float3Source(0, 0, 0b01, 0, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fmsub(DRegister rd, DRegister rn, DRegister rm, DRegister ra) {
    Float3Source(0, 0, 0b01, 0, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmadd(DRegister rd, DRegister rn, DRegister rm, DRegister ra) {
    Float3Source(0, 0, 0b01, 1, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmsub(DRegister rd, DRegister rn, DRegister rm, DRegister ra) {
    Float3Source(0, 0, 0b01, 1, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fmadd(HRegister rd, HRegister rn, HRegister rm, HRegister ra) {
    Float3Source(0, 0, 0b11, 0, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fmsub(HRegister rd, HRegister rn, HRegister rm, HRegister ra) {
    Float3Source(0, 0, 0b11, 0, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmadd(HRegister rd, HRegister rn, HRegister rm, HRegister ra) {
    Float3Source(0, 0, 0b11, 1, 0, rd.V(), rn.V(), rm.V(), ra.V());
  }
  void fnmsub(HRegister rd, HRegister rn, HRegister rm, HRegister ra) {
    Float3Source(0, 0, 0b11, 1, 1, rd.V(), rn.V(), rm.V(), ra.V());
  }

private:
  // Advanced SIMD scalar copy
  // Field placement: Q at bit 30, b28 at bit 28, imm5 at bit 16, imm4 at bit 11;
  // registers are placed by Encode_rn/Encode_rd.
  void ASIMDScalarCopy(uint32_t Q, uint32_t b28, uint32_t imm5, uint32_t imm4, VRegister rd, VRegister rn) {
    uint32_t Instr = 0b0000'1110'0000'0000'0000'01U << 10;

    Instr |= Q << 30;
    Instr |= b28 << 28;
    Instr |= imm5 << 16;
    Instr |= imm4 << 11;
    Instr |= Encode_rn(rn);
    Instr |= Encode_rd(rd);

    dc32(Instr);
  }

  // Advanced SIMD scalar three same FP16
  // Field placement: U at bit 29, a at bit 23, rm index at bit 16, opcode at bit 11,
  // rn index at bit 5, rd index in the low bits.
  // Note the register parameter order is rm, rn, rd.
  void ASIMDScalarThreeSameFP16(uint32_t U, uint32_t a, uint32_t opcode, HRegister rm, HRegister rn, HRegister rd) {
    uint32_t Instr = 0b0101'1110'0100'0000'0000'0100'0000'0000;
    Instr |= U << 29;
    Instr |= a << 23;
    Instr |= rm.Idx() << 16;
    Instr |= opcode << 11;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Advanced SIMD scalar two-register miscellaneous FP16
  // Field placement: U at bit 29, a at bit 23, opcode at bit 12, rn index at bit 5,
  // rd index in the low bits. Register parameter order is rn, rd.
  void ASIMDScalarTwoRegMiscFP16(uint32_t U, uint32_t a, uint32_t opcode, HRegister rn, HRegister rd) {
    uint32_t Instr = 0b0101'1110'0111'1000'0000'1000'0000'0000;
    Instr |= U << 29;
    Instr |= a << 23;
    Instr |= opcode << 12;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Advanced SIMD scalar three same extra
  // Field placement: U at bit 29, underlying value of size at bit 22, rm index at bit 16,
  // opcode at bit 11, rn index at bit 5, rd index in the low bits.
  // Register parameter order is rm, rn, rd.
  void ASIMDScalarThreeSameExtra(uint32_t U, ScalarRegSize size, uint32_t opcode, VRegister rm, VRegister rn, VRegister rd) {
    uint32_t Instr = 0b0101'1110'0000'0000'1000'0100'0000'0000;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= rm.Idx() << 16;
    Instr |= opcode << 11;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  // Advanced SIMD scalar two-register miscellaneous
  // Field placement: U at bit 29, underlying value of size at bit 22, b20 at bit 20,
  // opcode at bit 12, rn index at bit 5, rd index in the low bits.
  void ASIMDScalar2RegMisc(uint32_t b20, uint32_t U, ScalarRegSize size, uint32_t opcode, VRegister rd, VRegister rn) {
    uint32_t Instr = 0b0101'1110'0010'0000'0000'1000'0000'0000;
    Instr |= U << 29;
    Instr |= FEXCore::ToUnderlying(size) << 22;
    Instr |= b20 << 20;
    Instr |= opcode << 12;
    Instr |= rn.Idx() << 5;
    Instr |= rd.Idx();
    dc32(Instr);
  }

  //
Advanced SIMD scalar three different void ASIMD3RegDifferent(uint32_t U, ScalarRegSize size, uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) { uint32_t Instr = 0b0101'1110'0010'0000'0000'0000'0000'0000; Instr |= U << 29; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= Encode_rm(rm); Instr |= opcode << 12; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } // Advanced SIMD scalar three same void ASIMD3RegSame(uint32_t U, ScalarRegSize size, uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) { uint32_t Instr = 0b0101'1110'0010'0000'0000'0100'0000'0000; Instr |= U << 29; Instr |= FEXCore::ToUnderlying(size) << 22; Instr |= Encode_rm(rm); Instr |= opcode << 11; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } // Advanced SIMD scalar shift by immediate void ASIMDScalarShiftByImm(uint32_t U, uint32_t immh, uint32_t immb, uint32_t opcode, VRegister rd, VRegister rn) { uint32_t Instr = 0b0101'1111'0000'0000'0000'0100'0000'0000; Instr |= U << 29; Instr |= immh << 19; Instr |= immb << 16; Instr |= opcode << 11; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } // Advanced SIMD scalar x indexed element void ASIMDScalarXIndexedElement(uint32_t U, ScalarRegSize size, uint32_t opcode, VRegister rm, VRegister rn, VRegister rd, uint32_t index) { LOGMAN_THROW_A_FMT(size != ScalarRegSize::i8Bit, "Scalar size must not be 8-bit"); const auto invalid_bound = 16U >> FEXCore::ToUnderlying(size); LOGMAN_THROW_A_FMT(index < invalid_bound, "Index ({}) must be within [0-{}]", index, invalid_bound - 1); uint32_t Instr = 0b0101'1111'0000'0000'0000'0000'0000'0000; // FMUL/FMLA/FMLS indexed variants deal with size differently. if (opcode == 0b0001 || opcode == 0b0101 || opcode == 0b1001) { // Unlike other instructions in the group, 16-bit is encoded as zero // and 32/64-bit are encoded with the top bit always set to one. 
if (size != ScalarRegSize::i16Bit) { Instr |= (0b10 | (FEXCore::ToUnderlying(size) & 1)) << 22; } } else { Instr |= FEXCore::ToUnderlying(size) << 22; } uint32_t H = 0; uint32_t LM = 0; if (size == ScalarRegSize::i16Bit) { LOGMAN_THROW_A_FMT(rm <= VReg::v15, "rm ({}) must be within [v0-v15]", rm.Idx()); H = (index >> 2) & 1; LM = index & 0b11; } else if (size == ScalarRegSize::i32Bit) { H = (index >> 1) & 1; LM = (index & 0b01) << 1; } else { H = index & 1; } Instr |= U << 29; Instr |= LM << 20; Instr |= rm.Idx() << 16; Instr |= opcode << 12; Instr |= H << 11; Instr |= rn.Idx() << 5; Instr |= rd.Idx(); dc32(Instr); } // Floating-point data-processing (1 source) void Float1Source(uint32_t M, uint32_t S, uint32_t ptype, uint32_t opcode, VRegister rd, VRegister rn) { uint32_t Instr = 0b0001'1110'0010'0000'0100'0000'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ptype << 22; Instr |= opcode << 15; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } void Float1Source(ScalarRegSize size, uint32_t M, uint32_t S, uint32_t opcode, VRegister rd, VRegister rn) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i16Bit || size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for {}", __func__); const uint32_t ConvertedSize = size == ScalarRegSize::i64Bit ? 0b01 : size == ScalarRegSize::i32Bit ? 
0b00 : 0b11; Float1Source(M, S, ConvertedSize, opcode, rd, rn); } // Floating-point compare void FloatCompare(uint32_t M, uint32_t S, uint32_t ftype, uint32_t op, uint32_t opcode2, VRegister rn, VRegister rm) { uint32_t Instr = 0b0001'1110'0010'0000'0010'0000'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ftype << 22; Instr |= Encode_rm(rm); Instr |= op << 14; Instr |= Encode_rn(rn); Instr |= opcode2; dc32(Instr); } // Floating-point immediate // XXX: // Floating-point conditional compare void FloatConditionalCompare(uint32_t M, uint32_t S, uint32_t ptype, uint32_t op, VRegister rn, VRegister rm, StatusFlags flags, Condition Cond) { uint32_t Instr = 0b0001'1110'0010'0000'0000'0100'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ptype << 22; Instr |= Encode_rm(rm); Instr |= FEXCore::ToUnderlying(Cond) << 12; Instr |= Encode_rn(rn); Instr |= op << 4; Instr |= FEXCore::ToUnderlying(flags); dc32(Instr); } // Floating-point data-processing (2 source) void Float2Source(uint32_t M, uint32_t S, uint32_t ptype, uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) { uint32_t Instr = 0b0001'1110'0010'0000'0000'1000'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ptype << 22; Instr |= Encode_rm(rm); Instr |= opcode << 12; Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } void Float2Source(ScalarRegSize size, uint32_t M, uint32_t S, uint32_t opcode, VRegister rd, VRegister rn, VRegister rm) { LOGMAN_THROW_A_FMT(size == ScalarRegSize::i16Bit || size == ScalarRegSize::i64Bit || size == ScalarRegSize::i32Bit, "Invalid size selected for {}", __func__); const uint32_t ConvertedSize = size == ScalarRegSize::i64Bit ? 0b01 : size == ScalarRegSize::i32Bit ? 
0b00 : 0b11; Float2Source(M, S, ConvertedSize, opcode, rd, rn, rm); } // Floating-point conditional select void FloatConditionalSelect(uint32_t M, uint32_t S, uint32_t ptype, VRegister rd, VRegister rn, VRegister rm, Condition Cond) { uint32_t Instr = 0b0001'1110'0010'0000'0000'1100'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ptype << 22; Instr |= rm.Idx() << 16; Instr |= FEXCore::ToUnderlying(Cond) << 12; Instr |= rn.Idx() << 5; Instr |= rd.Idx(); dc32(Instr); } // Floating-point data-processing (3 source) void Float3Source(uint32_t M, uint32_t S, uint32_t ptype, uint32_t o1, uint32_t o0, VRegister rd, VRegister rn, VRegister rm, VRegister ra) { uint32_t Instr = 0b0001'1111'0000'0000'0000'0000'0000'0000; Instr |= M << 31; Instr |= S << 29; Instr |= ptype << 22; Instr |= o1 << 21; Instr |= Encode_rm(rm); Instr |= o0 << 15; Instr |= Encode_ra(ra); Instr |= Encode_rn(rn); Instr |= Encode_rd(rd); dc32(Instr); } #ifndef INCLUDED_BY_EMITTER }; // struct LoadstoreEmitterOps } // namespace ARMEmitter #endif ================================================ FILE: CodeEmitter/CodeEmitter/SystemOps.inl ================================================ // SPDX-License-Identifier: MIT /* System instruction emitters. * * This is mostly a mashup of various instruction types. * Nothing follows an explicit pattern since they are mostly different. 
*/ #pragma once #ifndef INCLUDED_BY_EMITTER #include namespace ARMEmitter { struct EmitterOps : Emitter { #endif public: // Reserved void udf(uint32_t Imm) { LOGMAN_THROW_A_FMT(Imm < 0x1'0000, "Immediate needs to be 16-bit"); dc32(Imm); } // System with result // TODO: SYSL // System Instruction // TODO: AT // TODO: CFP // TODO: CPP void dc(ARMEmitter::DataCacheOperation DCOp, ARMEmitter::Register rt) { constexpr uint32_t Op = 0b1101'0101'0000'1000'0111 << 12; SystemInstruction(Op, 0, FEXCore::ToUnderlying(DCOp), rt); } // TODO: DVP // TODO: IC // TODO: TLBI // Exception generation void svc(uint32_t Imm) { ExceptionGeneration(0b000, 0b000, 0b01, Imm); } void hvc(uint32_t Imm) { ExceptionGeneration(0b000, 0b000, 0b10, Imm); } void smc(uint32_t Imm) { ExceptionGeneration(0b000, 0b000, 0b11, Imm); } void brk(uint32_t Imm) { ExceptionGeneration(0b001, 0b000, 0b00, Imm); } void hlt(uint32_t Imm) { ExceptionGeneration(0b010, 0b000, 0b00, Imm); } void tcancel(uint32_t Imm) { ExceptionGeneration(0b011, 0b000, 0b00, Imm); } void dcps1(uint32_t Imm) { ExceptionGeneration(0b101, 0b000, 0b01, Imm); } void dcps2(uint32_t Imm) { ExceptionGeneration(0b101, 0b000, 0b10, Imm); } void dcps3(uint32_t Imm) { ExceptionGeneration(0b101, 0b000, 0b11, Imm); } // System instructions with register argument void wfet(ARMEmitter::Register rt) { SystemInstructionWithReg(0b0000, 0b000, rt); } void wfit(ARMEmitter::Register rt) { SystemInstructionWithReg(0b0000, 0b001, rt); } // Hints void nop() { Hint(ARMEmitter::HintRegister::NOP); } void yield() { Hint(ARMEmitter::HintRegister::YIELD); } void wfe() { Hint(ARMEmitter::HintRegister::WFE); } void wfi() { Hint(ARMEmitter::HintRegister::WFI); } void sev() { Hint(ARMEmitter::HintRegister::SEV); } void sevl() { Hint(ARMEmitter::HintRegister::SEVL); } void dgh() { Hint(ARMEmitter::HintRegister::DGH); } void csdb() { Hint(ARMEmitter::HintRegister::CSDB); } // Barriers void clrex(uint32_t imm = 15) { LOGMAN_THROW_A_FMT(imm < 16, "Immediate out of 
range"); Barrier(ARMEmitter::BarrierRegister::CLREX, imm); } void dsb(ARMEmitter::BarrierScope Scope) { Barrier(ARMEmitter::BarrierRegister::DSB, FEXCore::ToUnderlying(Scope)); } void dmb(ARMEmitter::BarrierScope Scope) { Barrier(ARMEmitter::BarrierRegister::DMB, FEXCore::ToUnderlying(Scope)); } void isb() { Barrier(ARMEmitter::BarrierRegister::ISB, FEXCore::ToUnderlying(ARMEmitter::BarrierScope::SY)); } void sb() { Barrier(ARMEmitter::BarrierRegister::SB, 0); } void tcommit() { Barrier(ARMEmitter::BarrierRegister::TCOMMIT, 0); } // System register move void msr(ARMEmitter::SystemRegister reg, ARMEmitter::Register rt) { constexpr uint32_t Op = 0b1101'0101'0001 << 20; SystemRegisterMove(Op, rt, reg); } void mrs(ARMEmitter::Register rd, ARMEmitter::SystemRegister reg) { constexpr uint32_t Op = 0b1101'0101'0011 << 20; SystemRegisterMove(Op, rd, reg); } private: // Exception Generation void ExceptionGeneration(uint32_t opc, uint32_t op2, uint32_t LL, uint32_t Imm) { LOGMAN_THROW_A_FMT((Imm & 0xFFFF'0000) == 0, "Imm amount too large"); uint32_t Instr = 0b1101'0100 << 24; Instr |= opc << 21; Instr |= Imm << 5; Instr |= op2 << 2; Instr |= LL; dc32(Instr); } // System instructions with register argument void SystemInstructionWithReg(uint32_t CRm, uint32_t op2, ARMEmitter::Register rt) { uint32_t Instr = 0b1101'0101'0000'0011'0001 << 12; Instr |= CRm << 8; Instr |= op2 << 5; Instr |= Encode_rt(rt); dc32(Instr); } // Hints void Hint(ARMEmitter::HintRegister Reg) { uint32_t Instr = 0b1101'0101'0000'0011'0010'0000'0001'1111U; Instr |= FEXCore::ToUnderlying(Reg); dc32(Instr); } // Barriers void Barrier(ARMEmitter::BarrierRegister Reg, uint32_t CRm) { uint32_t Instr = 0b1101'0101'0000'0011'0011'0000'0001'1111U; Instr |= CRm << 8; Instr |= FEXCore::ToUnderlying(Reg); dc32(Instr); } // System Instruction void SystemInstruction(uint32_t Op, uint32_t L, uint32_t SubOp, ARMEmitter::Register rt) { uint32_t Instr = Op; Instr |= L << 21; Instr |= SubOp; Instr |= Encode_rt(rt); 
dc32(Instr); } // System register move void SystemRegisterMove(uint32_t Op, ARMEmitter::Register rt, ARMEmitter::SystemRegister reg) { uint32_t Instr = Op; Instr |= FEXCore::ToUnderlying(reg); Instr |= Encode_rt(rt); dc32(Instr); } #ifndef INCLUDED_BY_EMITTER }; // struct LoadstoreEmitterOps } // namespace ARMEmitter #endif ================================================ FILE: CodeEmitter/CodeEmitter/VixlUtils.inl ================================================ // Collection of utilities from vixl. // Following is the vixl license. // Copyright 2015, VIXL authors // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of ARM Limited nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Test if a given value can be encoded in the immediate field of a logical // instruction. // If it can be encoded, the function returns true, and values pointed to by n, // imm_s and imm_r are updated with immediates encoded in the format required // by the corresponding fields in the logical instruction. // If it can not be encoded, the function returns false, and the values pointed // to by n, imm_s and imm_r are undefined. static bool IsImmLogical(uint64_t value, unsigned width, unsigned* n = nullptr, unsigned* imm_s = nullptr, unsigned* imm_r = nullptr) { [[maybe_unused]] constexpr auto kBRegSize = 8; [[maybe_unused]] constexpr auto kHRegSize = 16; [[maybe_unused]] constexpr auto kSRegSize = 32; [[maybe_unused]] constexpr auto kDRegSize = 64; constexpr auto kWRegSize = 32; LOGMAN_THROW_A_FMT((width == kBRegSize) || (width == kHRegSize) || (width == kSRegSize) || (width == kDRegSize), "Unexpected imm size"); bool negate = false; // Logical immediates are encoded using parameters n, imm_s and imm_r using // the following table: // // N imms immr size S R // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) // 0 11110s xxxxxr 2 UInt(s) UInt(r) // (s bits must not be all set) // // A pattern is constructed of size bits, where the least significant S+1 bits // are set. 
The pattern is rotated right by R, and repeated across a 32 or // 64-bit value, depending on destination register width. // // Put another way: the basic format of a logical immediate is a single // contiguous stretch of 1 bits, repeated across the whole word at intervals // given by a power of 2. To identify them quickly, we first locate the // lowest stretch of 1 bits, then the next 1 bit above that; that combination // is different for every logical immediate, so it gives us all the // information we need to identify the only logical immediate that our input // could be, and then we simply check if that's the value we actually have. // // (The rotation parameter does give the possibility of the stretch of 1 bits // going 'round the end' of the word. To deal with that, we observe that in // any situation where that happens the bitwise NOT of the value is also a // valid logical immediate. So we simply invert the input whenever its low bit // is set, and then we know that the rotated case can't arise.) if (value & 1) { // If the low bit is 1, negate the value, and set a flag to remember that we // did (so that we can adjust the return values appropriately). negate = true; value = ~value; } if (width <= kWRegSize) { // To handle 8/16/32-bit logical immediates, the very easiest thing is to repeat // the input value to fill a 64-bit word. The correct encoding of that as a // logical immediate will also be the correct encoding of the value. // Avoid making the assumption that the most-significant 56/48/32 bits are zero by // shifting the value left and duplicating it. for (unsigned bits = width; bits <= kWRegSize; bits *= 2) { value <<= bits; uint64_t mask = (UINT64_C(1) << bits) - 1; value |= ((value >> bits) & mask); } } // The basic analysis idea: imagine our input word looks like this. 
// // 0011111000111110001111100011111000111110001111100011111000111110 // c b a // |<--d-->| // // We find the lowest set bit (as an actual power-of-2 value, not its index) // and call it a. Then we add a to our original number, which wipes out the // bottommost stretch of set bits and replaces it with a 1 carried into the // next zero bit. Then we look for the new lowest set bit, which is in // position b, and subtract it, so now our number is just like the original // but with the lowest stretch of set bits completely gone. Now we find the // lowest set bit again, which is position c in the diagram above. Then we'll // measure the distance d between bit positions a and c (using CLZ), and that // tells us that the only valid logical immediate that could possibly be equal // to this number is the one in which a stretch of bits running from a to just // below b is replicated every d bits. uint64_t a = LowestSetBit(value); uint64_t value_plus_a = value + a; uint64_t b = LowestSetBit(value_plus_a); uint64_t value_plus_a_minus_b = value_plus_a - b; uint64_t c = LowestSetBit(value_plus_a_minus_b); int d, clz_a, out_n; uint64_t mask; if (c != 0) { // The general case, in which there is more than one stretch of set bits. // Compute the repeat distance d, and set up a bitmask covering the basic // unit of repetition (i.e. a word with the bottom d bits set). Also, in all // of these cases the N bit of the output will be zero. clz_a = std::countl_zero(a); int clz_c = std::countl_zero(c); d = clz_a - clz_c; mask = ((UINT64_C(1) << d) - 1); out_n = 0; } else { // Handle degenerate cases. // // If any of those 'find lowest set bit' operations didn't find a set bit at // all, then the word will have been zero thereafter, so in particular the // last lowest_set_bit operation will have returned zero. So we can test for // all the special case conditions in one go by seeing if c is zero. 
if (a == 0) { // The input was zero (or all 1 bits, which will come to here too after we // inverted it at the start of the function), for which we just return // false. return false; } else { // Otherwise, if c was zero but a was not, then there's just one stretch // of set bits in our word, meaning that we have the trivial case of // d == 64 and only one 'repetition'. Set up all the same variables as in // the general case above, and set the N bit in the output. clz_a = std::countl_zero(a); d = 64; mask = ~UINT64_C(0); out_n = 1; } } // If the repeat period d is not a power of two, it can't be encoded. if (!std::has_single_bit(uint32_t(d))) { return false; } if (((b - a) & ~mask) != 0) { // If the bit stretch (b - a) does not fit within the mask derived from the // repeat period, then fail. return false; } // The only possible option is b - a repeated every d bits. Now we're going to // actually construct the valid logical immediate derived from that // specification, and see if it equals our original input. // // To repeat a value every d bits, we multiply it by a number of the form // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can // be derived using a table lookup on CLZ(d). static const uint64_t multipliers[] = { 0x0000000000000001UL, 0x0000000100000001UL, 0x0001000100010001UL, 0x0101010101010101UL, 0x1111111111111111UL, 0x5555555555555555UL, }; uint64_t multiplier = multipliers[std::countl_zero(uint64_t(d)) - 57]; uint64_t candidate = (b - a) * multiplier; if (value != candidate) { // The candidate pattern doesn't match our input value, so fail. return false; } // We have a match! This is a valid logical immediate, so now we have to // construct the bits and pieces of the instruction encoding that generates // it. // Count the set bits in our basic stretch. The special case of clz(0) == -1 // makes the answer come out right for stretches that reach the very top of // the word (e.g. numbers like 0xffffc00000000000). 
int clz_b = (b == 0) ? -1 : std::countl_zero(b); int s = clz_a - clz_b; // Decide how many bits to rotate right by, to put the low bit of that basic // stretch in position a. int r; if (negate) { // If we inverted the input right at the start of this function, here's // where we compensate: the number of set bits becomes the number of clear // bits, and the rotation count is based on position b rather than position // a (since b is the location of the 'lowest' 1 bit after inversion). s = d - s; r = (clz_b + 1) & (d - 1); } else { r = (clz_a + 1) & (d - 1); } // Now we're done, except for having to encode the S output in such a way that // it gives both the number of set bits and the length of the repeated // segment. The s field is encoded like this: // // imms size S // ssssss 64 UInt(ssssss) // 0sssss 32 UInt(sssss) // 10ssss 16 UInt(ssss) // 110sss 8 UInt(sss) // 1110ss 4 UInt(ss) // 11110s 2 UInt(s) // // So we 'or' (2 * -d) with our computed s to form imms. if (n != nullptr) { *n = out_n; } if (imm_s != nullptr) { *imm_s = ((2 * -d) | (s - 1)) & 0x3f; } if (imm_r != nullptr) { *imm_r = r; } return true; } static inline bool IsIntN(unsigned n, int64_t x) { if (n == 64) { return true; } int64_t limit = INT64_C(1) << (n - 1); return (-limit <= x) && (x < limit); } static inline bool IsUintN(unsigned n, int64_t x) { // Convert to an unsigned integer to avoid implementation-defined behavior. 
return !(static_cast(x) >> n); } // clang-format off #define INT_1_TO_32_LIST(V) \ V(1) V(2) V(3) V(4) V(5) V(6) V(7) V(8) \ V(9) V(10) V(11) V(12) V(13) V(14) V(15) V(16) \ V(17) V(18) V(19) V(20) V(21) V(22) V(23) V(24) \ V(25) V(26) V(27) V(28) V(29) V(30) V(31) V(32) #define INT_33_TO_63_LIST(V) \ V(33) V(34) V(35) V(36) V(37) V(38) V(39) V(40) \ V(41) V(42) V(43) V(44) V(45) V(46) V(47) V(48) \ V(49) V(50) V(51) V(52) V(53) V(54) V(55) V(56) \ V(57) V(58) V(59) V(60) V(61) V(62) V(63) #define INT_1_TO_63_LIST(V) INT_1_TO_32_LIST(V) INT_33_TO_63_LIST(V) // clang-format on #define DECLARE_IS_INT_N(N) \ static inline bool IsInt##N(int64_t x) { \ return IsIntN(N, x); \ } #define DECLARE_IS_UINT_N(N) \ static inline bool IsUint##N(int64_t x) { \ return IsUintN(N, x); \ } INT_1_TO_63_LIST(DECLARE_IS_INT_N) INT_1_TO_63_LIST(DECLARE_IS_UINT_N) #undef DECLARE_IS_INT_N #undef DECLARE_IS_UINT_N private: // Some compilers dislike negating unsigned integers, // so we provide an equivalent. template static inline T UnsignedNegate(T value) { static_assert(std::is_unsigned::value); return ~value + 1; } static inline uint64_t LowestSetBit(uint64_t value) { return value & UnsignedNegate(value); } public: ================================================ FILE: Data/AppConfig/CMakeLists.txt ================================================ file(GLOB CONFIG_SOURCES CONFIGURE_DEPENDS *.json) file(GLOB GEN_CONFIG_SOURCES CONFIGURE_DEPENDS *.json.in) # Any application configuration json file gets installed foreach(CONFIG_SRC ${CONFIG_SOURCES}) install(FILES ${CONFIG_SRC} DESTINATION ${DATA_DIRECTORY}/AppConfig/ COMPONENT Runtime) endforeach() # Any configuration file json file that needs to be generated # First generate then install it foreach(GEN_CONFIG_SRC ${GEN_CONFIG_SOURCES}) # Get the filename only component get_filename_component(CONFIG_NAME ${GEN_CONFIG_SRC} NAME_WLE) # Configure it configure_file(${GEN_CONFIG_SRC} ${CMAKE_BINARY_DIR}/Data/AppConfig/${CONFIG_NAME}) # Then 
install the configured json install(FILES ${CMAKE_BINARY_DIR}/Data/AppConfig/${CONFIG_NAME} DESTINATION ${DATA_DIRECTORY}/AppConfig/ COMPONENT Runtime) endforeach() ================================================ FILE: Data/AppConfig/client.json ================================================ { "Config": { "HideHypervisorBit": "1" } } ================================================ FILE: Data/AppConfig/steamwebhelper.json ================================================ { "Comment": "Bypasses libGL's glX and instead sends GLX requests directly via xcb", "ThunksDB": { "GL": 0 } } ================================================ FILE: Data/CI/FEXLinuxTestsThunks.json ================================================ { "ThunksDB": { "fex_thunk_test": 1 } } ================================================ FILE: Data/CI/GLThunks.json ================================================ { "ThunksDB": { "GL": 1 } } ================================================ FILE: Data/CI/VulkanThunks.json ================================================ { "ThunksDB": { "Vulkan": 1 } } ================================================ FILE: Data/CMake/FindZycore.cmake ================================================ # SPDX-License-Identifier: MIT if (CMAKE_CROSSCOMPILING) return() endif() include(FindPackageHandleStandardArgs) find_package(Zycore QUIET CONFIG) if (Zycore_CONSIDERED_CONFIGS) find_package_handle_standard_args(Zycore CONFIG_MODE) else() find_package(PkgConfig QUIET) pkg_search_module(Zycore QUIET IMPORTED_TARGET zycore) find_package_handle_standard_args(Zycore REQUIRED_VARS zycore_LINK_LIBRARIES VERSION_VAR zycore_VERSION) if (TARGET PkgConfig::zycore) add_library(Zycore::Zycore ALIAS PkgConfig::zycore) endif() endif() ================================================ FILE: Data/CMake/FindZydis.cmake ================================================ # SPDX-License-Identifier: MIT if (CMAKE_CROSSCOMPILING) return() endif() include(FindPackageHandleStandardArgs) 
find_package(Zydis QUIET CONFIG) if (Zydis_CONSIDERED_CONFIGS) find_package_handle_standard_args(Zydis CONFIG_MODE) else() find_package(PkgConfig QUIET) pkg_search_module(Zydis QUIET IMPORTED_TARGET zydis) find_package_handle_standard_args(Zydis REQUIRED_VARS zydis_LINK_LIBRARIES VERSION_VAR zydis_VERSION) if (TARGET PkgConfig::zydis) add_library(Zydis::Zydis ALIAS PkgConfig::zydis) endif() endif() ================================================ FILE: Data/CMake/Findxxhash.cmake ================================================ # SPDX-License-Identifier: MIT include(FindPackageHandleStandardArgs) find_package(PkgConfig QUIET) pkg_search_module(xxhash QUIET IMPORTED_TARGET xxhash libxxhash) find_package_handle_standard_args(xxhash REQUIRED_VARS xxhash_LINK_LIBRARIES VERSION_VAR xxhash_VERSION ) if (xxhash_FOUND AND NOT TARGET xxHash::xxhash) if (TARGET PkgConfig::xxhash) add_library(xxHash::xxhash ALIAS PkgConfig::xxhash) else() add_library(xxHash::xxhash ALIAS xxhash) endif() endif() ================================================ FILE: Data/CMake/LinkerGC.cmake ================================================ # SPDX-License-Identifier: MIT # This applies some common linker options that reduce code size and linking time in Release mode. Namely: # --gc-sections: Linktime garbage collection, discards unused sections from the final output # --strip-all : Similar to running `strip`, discards the symbol table from the final output # --as-needed : Only includes libraries that are actually needed in the final output. 
macro(LinkerGC target) if (CMAKE_BUILD_TYPE MATCHES "RELEASE") target_link_options(${target} PRIVATE "LINKER:--gc-sections" "LINKER:--strip-all" "LINKER:--as-needed") endif() endmacro() ================================================ FILE: Data/CMake/cmake_uninstall.cmake.in ================================================ if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt") message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt") endif() file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files) string(REGEX REPLACE "\n" ";" files "${files}") foreach(file ${files}) message(STATUS "Uninstalling $ENV{DESTDIR}${file}") if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") exec_program( "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" OUTPUT_VARIABLE rm_out RETURN_VALUE rm_retval ) if(NOT "${rm_retval}" STREQUAL 0) message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") endif() else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") message(STATUS "File $ENV{DESTDIR}${file} does not exist.") endif() endforeach() ================================================ FILE: Data/CMake/toolchain_aarch64.cmake ================================================ # This is a reference AArch64 cross compile script # Pass in to cmake when building: # eg: cmake --toolchain ../Data/CMake/toolchain_aarch64.cmake .. 
if (NOT DEFINED ENV{SYSROOT}) message(FATAL_ERROR "Need to have SYSROOT environment variable set") endif() set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR aarch64) set(CMAKE_CROSSCOMPILING TRUE) # Target triple needs to match the binutils exactly set(TARGET_TRIPLE aarch64-linux-gnu) set(CMAKE_C_COMPILER "clang") set(CMAKE_CXX_COMPILER "clang++") set(CMAKE_C_COMPILER_AR "llvm-ar") set(CMAKE_CXX_COMPILER_AR "llvm-ar") set(CMAKE_C_COMPILER_RANLIB "llvm-ranlib") set(CMAKE_CXX_COMPILER_RANLIB "llvm-ranlib") set(CMAKE_LINKER "ld.lld") set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE}) set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE}) # Set the environment variable SYSROOT to the aarch64 rootfs set(CMAKE_FIND_ROOT_PATH "$ENV{SYSROOT}") set(CMAKE_SYSROOT "$ENV{SYSROOT}") list(APPEND CMAKE_PREFIX_PATH "$ENV{SYSROOT}/usr/lib/${TARGET_TRIPLE}/cmake/") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) ================================================ FILE: Data/CMake/toolchain_mingw.cmake ================================================ set(MINGW_TRIPLE "" CACHE STRING "MinGW compiler target architecture triple") set(CMAKE_RC_COMPILER ${MINGW_TRIPLE}-windres) set(CMAKE_C_COMPILER ${MINGW_TRIPLE}-clang) set(CMAKE_CXX_COMPILER ${MINGW_TRIPLE}-clang++) set(CMAKE_DLLTOOL ${MINGW_TRIPLE}-dlltool) set(CMAKE_AR ${MINGW_TRIPLE}-ar) # Compile everything as static to avoid requiring the MinGW runtime libraries, force page aligned sections so that # debug symbols work correctly, and disable loop alignment to workaround an LLVM bug # (https://github.com/llvm/llvm-project/issues/47432) set(CMAKE_SHARED_LINKER_FLAGS_INIT "-static -static-libgcc -static-libstdc++ -Wl,--file-alignment=4096,/mllvm:-align-loops=1") set(CMAKE_EXE_LINKER_FLAGS_INIT "-static -static-libgcc -static-libstdc++ -Wl,--file-alignment=4096,/mllvm:-align-loops=1") set(CMAKE_C_STANDARD_LIBRARIES "" 
CACHE STRING "" FORCE) set(CMAKE_CXX_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) set(CMAKE_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) set(CMAKE_SYSTEM_NAME Windows) set(CMAKE_SYSTEM_PROCESSOR ${MINGW_TRIPLE}) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) ================================================ FILE: Data/CMake/toolchain_x86_32.cmake ================================================ option(ENABLE_CLANG_THUNKS "Enable building thunks with clang" FALSE) set(CMAKE_SYSTEM_PROCESSOR i686) if (ENABLE_CLANG_THUNKS) message(STATUS "Enabling thunk clang building. Force enabling LLD as well") set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_C_COMPILER clang) set(CMAKE_CXX_COMPILER clang++) set(CLANG_FLAGS "-target i686-linux-gnu -msse2 -mfpmath=sse") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CLANG_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CLANG_FLAGS}") else() set(CMAKE_C_COMPILER x86_64-linux-gnu-gcc -m32) set(CMAKE_CXX_COMPILER x86_64-linux-gnu-g++ -m32) endif() ================================================ FILE: Data/CMake/toolchain_x86_64.cmake ================================================ option(ENABLE_CLANG_THUNKS "Enable building thunks with clang" FALSE) set(CMAKE_SYSTEM_PROCESSOR x86_64) if (ENABLE_CLANG_THUNKS) message(STATUS "Enabling thunk clang building. 
Force enabling LLD as well") set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld") set(CMAKE_C_COMPILER clang) set(CMAKE_CXX_COMPILER clang++) set(CLANG_FLAGS "-target x86_64-linux-gnu") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CLANG_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CLANG_FLAGS}") else() set(CMAKE_C_COMPILER x86_64-linux-gnu-gcc) set(CMAKE_CXX_COMPILER x86_64-linux-gnu-g++) endif() ================================================ FILE: Data/CMake/version_to_variables.cmake ================================================ # Extracts a version from the passed in version string in the form of "<major>.<minor>.<patch>". # If a part of the version is missing then it gets set as zero. # Version variables returned in: # ${Package}_VERSION_MAJOR # ${Package}_VERSION_MINOR # ${Package}_VERSION_PATCH function(version_to_variables VERSION _Package) string(REPLACE "." ";" VERSION_LIST "${VERSION}") list (LENGTH VERSION_LIST VERSION_LEN) if (${VERSION_LEN} GREATER 0) list(GET VERSION_LIST 0 VERSION_MAJOR) set(${_Package}_VERSION_MAJOR ${VERSION_MAJOR} PARENT_SCOPE) else() set(${_Package}_VERSION_MAJOR 0 PARENT_SCOPE) endif() if (${VERSION_LEN} GREATER 1) list(GET VERSION_LIST 1 VERSION_MINOR) set(${_Package}_VERSION_MINOR ${VERSION_MINOR} PARENT_SCOPE) else() set(${_Package}_VERSION_MINOR 0 PARENT_SCOPE) endif() if (${VERSION_LEN} GREATER 2) list(GET VERSION_LIST 2 VERSION_PATCH) set(${_Package}_VERSION_PATCH ${VERSION_PATCH} PARENT_SCOPE) else() set(${_Package}_VERSION_PATCH 0 PARENT_SCOPE) endif() endfunction() ================================================ FILE: Data/Dockerfile ================================================ # --- Stage 1: Builder --- FROM ubuntu:22.04 as builder RUN DEBIAN_FRONTEND="noninteractive" apt-get update RUN DEBIAN_FRONTEND="noninteractive" apt install -y cmake \ clang-13 llvm-13 nasm ninja-build pkg-config \ libcap-dev libglfw3-dev libepoxy-dev python3-dev
libsdl2-dev \ python3 linux-headers-generic \ git qtbase5-dev qtdeclarative5-dev lld RUN git clone --recurse-submodules https://github.com/FEX-Emu/FEX.git WORKDIR /FEX RUN mkdir build ARG CC=clang-13 ARG CXX=clang++-13 RUN cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release -DUSE_LINKER=lld -DENABLE_LTO=True -DBUILD_TESTING=False -DENABLE_ASSERTIONS=False -G Ninja . RUN ninja WORKDIR /FEX/build # --- Stage 2: Runner --- FROM builder as runner RUN DEBIAN_FRONTEND="noninteractive" apt-get update RUN DEBIAN_FRONTEND="noninteractive" apt install -y \ libcap-dev libglfw3-dev libepoxy-dev COPY --from=builder /FEX/Bin/* /usr/bin/ WORKDIR / ================================================ FILE: Data/ThunksDB.json ================================================ { "DB": { "GL": { "Library" : "libGL-guest.so", "Overlay": [ "@PREFIX_LIB@/libGL.so", "@PREFIX_LIB@/libGL.so.1", "@PREFIX_LIB@/libGL.so.1.2.0", "@PREFIX_LIB@/libGL.so.1.7.0" ] }, "Vulkan": { "Library": "libvulkan-guest.so", "Overlay": [ "@PREFIX_LIB@/libvulkan.so", "@PREFIX_LIB@/libvulkan.so.1", "@HOME@/.local/share/Steam/ubuntu12_32/steam-runtime/pinned_libs_64/libvulkan.so.1" ] }, "drm": { "Library": "libdrm-guest.so", "Overlay": [ "@PREFIX_LIB@/libdrm.so", "@PREFIX_LIB@/libdrm.so.2", "@PREFIX_LIB@/libdrm.so.2.4.0" ] }, "asound": { "Library": "libasound-guest.so", "Overlay": [ "@PREFIX_LIB@/libasound.so", "@PREFIX_LIB@/libasound.so.2", "@PREFIX_LIB@/libasound.so.2.0.0" ] }, "fex_thunk_test": { "Library": "libfex_thunk_test-guest.so", "Overlay": [ "@PREFIX_LIB@/libfex_thunk_test.so" ] }, "WaylandClient": { "Library" : "libwayland-client-guest.so", "Overlay": [ "@PREFIX_LIB@/libwayland-client.so", "@PREFIX_LIB@/libwayland-client.so.0", "@PREFIX_LIB@/libwayland-client.so.0.20.0" ] } } } ================================================ FILE: Data/binfmts/CMakeLists.txt ================================================ function(GenBinFmt Name) # Get the filename only component get_filename_component(FMT_NAME 
${Name} NAME_WE) # Configure it configure_file(${Name} ${CMAKE_BINARY_DIR}/Data/binfmts/${FMT_NAME}) # Then install the configured binfmt install(FILES ${CMAKE_BINARY_DIR}/Data/binfmts/${FMT_NAME} DESTINATION ${CMAKE_INSTALL_PREFIX}/share/binfmts/ COMPONENT Runtime) endfunction() if (NOT USE_LEGACY_BINFMTMISC) configure_file(FEX-x86.conf.in ${CMAKE_BINARY_DIR}/Data/binfmts/FEX-x86.conf) configure_file(FEX-x86_64.conf.in ${CMAKE_BINARY_DIR}/Data/binfmts/FEX-x86_64.conf) install(FILES ${CMAKE_BINARY_DIR}/Data/binfmts/FEX-x86.conf ${CMAKE_BINARY_DIR}/Data/binfmts/FEX-x86_64.conf DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/binfmt.d/ COMPONENT Runtime) else() GenBinFmt(FEX-x86.in) GenBinFmt(FEX-x86_64.in) endif() ================================================ FILE: Data/binfmts/FEX-x86.conf.in ================================================ :FEX-x86:M:0:\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x03\x00:\xff\xff\xff\xff\xff\xfe\xfe\x00\x00\x00\x00\xff\xff\xff\xff\xff\xfe\xff\xff\xff:@CMAKE_INSTALL_PREFIX@/bin/FEX:POCF ================================================ FILE: Data/binfmts/FEX-x86.in ================================================ package fex interpreter @CMAKE_INSTALL_PREFIX@/bin/FEX magic \x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x03\x00 offset 0 mask \xff\xff\xff\xff\xff\xfe\xfe\x00\x00\x00\x00\xff\xff\xff\xff\xff\xfe\xff\xff\xff credentials yes fix_binary yes preserve yes ================================================ FILE: Data/binfmts/FEX-x86_64.conf.in ================================================ :FEX-x86_64:M:0:\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x3e\x00:\xff\xff\xff\xff\xff\xfe\xfe\x00\x00\x00\x00\xff\xff\xff\xff\xff\xfe\xff\xff\xff:@CMAKE_INSTALL_PREFIX@/bin/FEX:POCF ================================================ FILE: Data/binfmts/FEX-x86_64.in ================================================ package fex interpreter @CMAKE_INSTALL_PREFIX@/bin/FEX magic 
\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x3e\x00
offset 0
mask \xff\xff\xff\xff\xff\xfe\xfe\x00\x00\x00\x00\xff\xff\xff\xff\xff\xfe\xff\xff\xff
credentials yes
fix_binary yes
preserve yes

================================================
FILE: Data/nix/FEXLinuxTests/shell.nix
================================================
# Development shell that generates CMake toolchain files pointing at the
# 32-bit and 64-bit x86 GCC cross-compilers, and exports the matching CMake
# options through $FEX_CMAKE_TOOLCHAINS.
{ pkgs ? import <nixpkgs> { } }:
let
  pkgsCross32 = pkgs.pkgsCross.gnu32;
  pkgsCross64 = pkgs.pkgsCross.gnu64;

  # CMake toolchain file selecting the i686 GCC cross-compiler.
  gcc32 = pkgs.writeText "toolchain_nix_gcc_x86_32.txt" ''
    set(CMAKE_SYSTEM_PROCESSOR i686)
    set(CMAKE_C_COMPILER ${pkgsCross32.buildPackages.gcc}/bin/i686-unknown-linux-gnu-gcc)
    set(CMAKE_CXX_COMPILER ${pkgsCross32.buildPackages.gcc}/bin/i686-unknown-linux-gnu-g++)
  '';

  # CMake toolchain file selecting the x86_64 GCC cross-compiler.
  gcc64 = pkgs.writeText "toolchain_nix_gcc_x86_64.txt" ''
    set(CMAKE_SYSTEM_PROCESSOR x86_64)
    set(CMAKE_C_COMPILER ${pkgsCross64.buildPackages.gcc}/bin/x86_64-unknown-linux-gnu-gcc)
    set(CMAKE_CXX_COMPILER ${pkgsCross64.buildPackages.gcc}/bin/x86_64-unknown-linux-gnu-g++)
  '';
in pkgs.mkShell {
  buildInputs = [
    pkgsCross64.buildPackages.clang
    pkgsCross32.buildPackages.clang
  ];

  # Only print the hints when the shell is interactive.
  shellHook = ''
    if [[ $- == *i* ]]; then
      echo "toolchain32: ${gcc32}"
      echo "toolchain64: ${gcc64}"
      echo ""
      echo "Use \$FEX_CMAKE_TOOLCHAINS to configure CMake."
    fi
  '';

  FEX_CMAKE_TOOLCHAINS = "-DX86_32_TOOLCHAIN_FILE=${gcc32} -DX86_64_TOOLCHAIN_FILE=${gcc64}";
}

================================================
FILE: Data/nix/LibraryForwarding/shell.nix
================================================
{ pkgs ?
import <nixpkgs> { } }:
let
  pkgsCross32 = pkgs.pkgsCross.gnu32;
  pkgsCross64 = pkgs.pkgsCross.gnu64;

  # Merged tree of the /include and /lib directories of the listed packages,
  # with /usr/include symlinked to /include. It is passed to clang below via
  # --sysroot.
  devRootFS = pkgs.buildEnv {
    name = "fex-dev-rootfs";
    paths = [
      pkgsCross64.stdenv.cc.libc_dev
      pkgsCross32.stdenv.cc.libc_dev
      pkgsCross64.stdenv.cc.cc
      pkgsCross32.stdenv.cc.cc
      pkgs.alsa-lib.dev
      pkgs.libdrm.dev
      pkgs.libGL.dev
      pkgs.wayland.dev
      pkgs.xorg.libX11.dev
      pkgs.xorg.libxcb.dev
      pkgs.xorg.libXrandr.dev
      pkgs.xorg.libXrender.dev
      pkgs.xorg.xorgproto
    ];
    ignoreCollisions = true;
    pathsToLink = [ "/include" "/lib" ];
    postBuild = ''
      mkdir -p $out/usr
      ln -s $out/include $out/usr/
    '';
  };

  # CMake toolchain file for the 32-bit guest build (clang + lld).
  # ''${...} is the Nix escape that leaves a literal ${...} for CMake to expand.
  toolchain32 = pkgs.writeText "toolchain_nix_x86_32.txt" ''
    set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_SYSTEM_PROCESSOR i686)
    set(CMAKE_C_COMPILER clang)
    set(CMAKE_CXX_COMPILER clang++)
    set(CMAKE_C_COMPILER ${pkgsCross32.buildPackages.clang}/bin/i686-unknown-linux-gnu-clang)
    set(CMAKE_CXX_COMPILER ${pkgsCross32.buildPackages.clang}/bin/i686-unknown-linux-gnu-clang++)
    set(CLANG_FLAGS "-nodefaultlibs -nostartfiles -lstdc++ -target i686-linux-gnu -msse2 -mfpmath=sse --sysroot=${devRootFS} -iwithsysroot/include")
    set(CMAKE_C_FLAGS "''${CMAKE_C_FLAGS} ''${CLANG_FLAGS}")
    set(CMAKE_CXX_FLAGS "''${CMAKE_CXX_FLAGS} ''${CLANG_FLAGS}")
  '';

  # CMake toolchain file for the 64-bit guest build (clang + lld).
  toolchain64 = pkgs.writeText "toolchain_nix_x86_64.txt" ''
    set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld")
    set(CMAKE_SYSTEM_PROCESSOR x86_64)
    set(CMAKE_C_COMPILER clang)
    set(CMAKE_CXX_COMPILER clang++)
    set(CMAKE_C_COMPILER ${pkgsCross64.buildPackages.clang}/bin/x86_64-unknown-linux-gnu-clang)
    set(CMAKE_CXX_COMPILER ${pkgsCross64.buildPackages.clang}/bin/x86_64-unknown-linux-gnu-clang++)
    set(CLANG_FLAGS "-nodefaultlibs -nostartfiles -lstdc++ -target x86_64-linux-gnu --sysroot=${devRootFS} -iwithsysroot/usr/include")
    set(CMAKE_C_FLAGS "''${CMAKE_C_FLAGS}
''${CLANG_FLAGS}")
    set(CMAKE_CXX_FLAGS "''${CMAKE_CXX_FLAGS} ''${CLANG_FLAGS}")
  '';
in pkgs.mkShell {
  buildInputs = [
    pkgsCross64.buildPackages.clang
    pkgsCross32.buildPackages.clang
  ];

  shellHook = ''
    if [[ $- == *i* ]]; then
      echo "Set up dev RootFS at ${devRootFS}"
      echo "toolchain32: ${toolchain32}"
      echo "toolchain64: ${toolchain64}"
      echo ""
      echo "Use \$FEX_CMAKE_TOOLCHAINS to configure CMake."
    fi
  '';

  FEX_CMAKE_TOOLCHAINS = "-DX86_32_TOOLCHAIN_FILE=${toolchain32} -DX86_64_TOOLCHAIN_FILE=${toolchain64} -DX86_DEV_ROOTFS=${devRootFS}";
  ROOTFS = "${devRootFS}";
}

================================================
FILE: Data/nix/WineOnArm/shell.nix
================================================
# Development shell that fetches a prebuilt llvm-mingw toolchain and derives a
# CMake toolchain file and a Meson cross file from it.
{ pkgs ? import <nixpkgs> { } }:
let
  # Prebuilt llvm-mingw release, pinned by hash.
  toolchain = pkgs.fetchzip {
    url = "https://github.com/bylaws/llvm-mingw/releases/download/20250920/llvm-mingw-20250920-ucrt-ubuntu-22.04-aarch64.tar.xz";
    sha256 = "sha256-LaojKjC8KzY+soW5u6eoDoXE3qtYk9Ejr7M3enTqRAE=";
  };

  cmakeToolchainFile = pkgs.substitute {
    # Use absolute paths that are discoverable outside of the nix shell
    src = ../../CMake/toolchain_mingw.cmake;
    substitutions = ["--replace-fail" "\${MINGW_TRIPLE}-" "${toolchain}/bin/\${MINGW_TRIPLE}-"];
  };

  # Meson cross file pointing at the arm64ec tools of the fetched toolchain.
  mesonCrossFile = pkgs.writeText "crossfile_llvm_mingw.txt" ''
    [binaries]
    ar = '${toolchain}/bin/arm64ec-w64-mingw32-ar'
    c = '${toolchain}/bin/arm64ec-w64-mingw32-gcc'
    cpp = '${toolchain}/bin/arm64ec-w64-mingw32-g++'
    ld = '${toolchain}/bin/arm64ec-w64-mingw32-ld'
    windres = '${toolchain}/bin/arm64ec-w64-mingw32-windres'
    strip = '${toolchain}/bin/strip'
    widl = '${toolchain}/bin/arm64ec-w64-mingw32-widl'
    pkgconfig = 'aarch64-linux-gnu-pkg-config'

    [host_machine]
    system = 'windows'
    cpu_family = 'aarch64'
    cpu = 'aarch64'
    endian = 'little'
  '';
in pkgs.mkShell {
  buildInputs = [ toolchain ];

  shellHook = ''
    if [[ $- == *i* ]]; then
      echo "llvm-mingw set up at ${toolchain}."
echo "" echo "To configure DXVK/vkd3d-proton: meson setup \$FEX_MESON_CROSSFILE" echo "" echo "To configure 32-bit FEX build: cmake \$FEX_CMAKE_TOOLCHAIN_WOW64" echo "To configure 64-bit FEX build: cmake \$FEX_CMAKE_TOOLCHAIN_ARM64EC" fi ''; # E.g. cmake $FEX_CMAKE_TOOLCHAIN_ARM64EC -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_LTO=False -DBUILD_TESTING=False FEX_CMAKE_TOOLCHAIN_ARM64EC = "--toolchain ${cmakeToolchainFile} -DMINGW_TRIPLE=arm64ec-w64-mingw32 -DCMAKE_INSTALL_LIBDIR=/usr/lib/wine/aarch64-windows"; FEX_CMAKE_TOOLCHAIN_WOW64 = "--toolchain ${cmakeToolchainFile} -DMINGW_TRIPLE=aarch64-w64-mingw32 -DCMAKE_INSTALL_LIBDIR=/usr/lib/wine/aarch64-windows"; FEX_MESON_CROSSFILE = "--cross-file ${mesonCrossFile}"; } ================================================ FILE: Data/nix/cmake_configure_woa32.sh ================================================ #! /usr/bin/env nix-shell #! nix-shell -i bash WineOnArm/shell.nix # Helper script to configure CMake for building FEX as library for emulation # of 32-bit applications in Wine/Proton. # The required cross-toolchains will be set up and managed by nix. if [ $# -eq 0 ] then echo "Expected CMake argument list" exit 1 fi if [ -f CMakeCache.txt ] then echo "Expected empty build folder" exit 1 fi set -o xtrace cmake $FEX_CMAKE_TOOLCHAIN_WOW64 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_LTO=False -DBUILD_TESTING=False $@ ================================================ FILE: Data/nix/cmake_configure_woa64.sh ================================================ #! /usr/bin/env nix-shell #! nix-shell -i bash WineOnArm/shell.nix # Helper script to configure CMake for building FEX as library for emulation # of 64-bit applications in Wine/Proton # Nix is used to install and manage the required cross-toolchains. 
if [ $# -eq 0 ]
then
  echo "Expected CMake argument list"
  exit 1
fi

if [ -f CMakeCache.txt ]
then
  echo "Expected empty build folder"
  exit 1
fi

set -o xtrace
# "$@" forwards every caller-supplied argument unchanged, including ones that
# contain whitespace. The toolchain variable is left unquoted on purpose so
# that it is split into its individual CMake options.
cmake $FEX_CMAKE_TOOLCHAIN_ARM64EC -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_LTO=False -DBUILD_TESTING=False "$@"

================================================
FILE: Data/nix/cmake_enable_flt.sh
================================================
#! /usr/bin/env nix-shell
#! nix-shell -i bash FEXLinuxTests/shell.nix

# Helper script to configure CMake for building FEXLinuxTests.
# Nix is used to install and manage the required cross-toolchains.

if [ ! -f CMakeCache.txt ]
then
  echo "Must be run from a pre-configured CMake build folder"
  exit 1
fi

# Remove previous build to ensure the new toolchain is applied
rm -rf unittests/FEXLinuxTests

set -o xtrace
cmake . $FEX_CMAKE_TOOLCHAINS -DBUILD_TESTING=ON -DBUILD_FEX_LINUX_TESTS=ON

================================================
FILE: Data/nix/cmake_enable_libfwd.sh
================================================
# Helper script to configure CMake for library forwarding in FEX.
# Nix is used to install and manage the required cross-toolchains.

if [ ! -f CMakeCache.txt ]
then
  echo "Must be run from a pre-configured CMake build folder"
  exit 1
fi

# Remove previous build to ensure the new toolchain is applied
rm -rf guest-libs guest-libs-32 Guest Guest_32

# Set clang executable path manually since the one from the nix store
# will be picked up otherwise
CLANG_EXEC_PATH=""
if ! grep -q CLANG_EXEC_PATH CMakeCache.txt
then
  CLANG_EXEC_PATH="-DCLANG_EXEC_PATH=`which clang`"
fi

nix-shell `dirname -- "$0"`/LibraryForwarding/shell.nix \
  --run "set -o xtrace; cmake .
\$FEX_CMAKE_TOOLCHAINS -DBUILD_THUNKS=ON $CLANG_EXEC_PATH; set +o xtrace" ================================================ FILE: External/.clang-format ================================================ DisableFormat: true ================================================ FILE: External/SoftFloat-3e/CMakeLists.txt ================================================ add_library(softfloat_3e STATIC # F80 support src/extF80_add.c src/extF80_div.c src/extF80_sub.c src/extF80_mul.c src/extF80_rem.c src/extF80_sqrt.c src/extF80_le.c src/extF80_to_i32.c src/extF80_to_i64.c src/extF80_to_ui64.c src/extF80_to_f32.c src/extF80_to_f64.c src/i32_to_extF80.c src/ui64_to_extF80.c src/extF80_to_f128.c src/f128_to_extF80.c # F128 support src/f128_add.c src/f128_div.c src/f128_eq.c src/f128_eq_signaling.c src/f128_isSignalingNaN.c src/f128_le.c src/f128_le_quiet.c src/f128_lt.c src/f128_lt_quiet.c src/f128_mulAdd.c src/f128_mul.c src/f128_rem.c src/f128_sqrt.c src/f128_sub.c src/f128_to_f16.c src/f128_to_f32.c src/f128_to_f64.c src/f128_to_i32.c src/f128_to_i64.c src/f128_to_ui32.c src/f128_to_ui64.c src/s_addMagsF128.c src/s_subMagsF128.c src/s_normRoundPackToF128.c src/s_roundPackToF128.c src/s_propagateNaNF128UI.c # Conversion src/f32_to_f128.c src/i32_to_f128.c src/s_roundToUI64.c src/s_f128UIToCommonNaN.c src/s_commonNaNToF128UI.c src/s_normSubnormalF128Sig.c src/s_roundToI32.c src/s_roundToI64.c src/s_roundPackToF32.c src/s_addMagsExtF80.c src/s_extF80UIToCommonNaN.c src/s_commonNaNToF32UI.c src/s_commonNaNToF64UI.c src/s_roundPackToF64.c src/s_propagateNaNExtF80UI.c src/s_roundPackToExtF80.c src/s_normSubnormalExtF80Sig.c src/s_subMagsExtF80.c src/s_shiftRightJam128.c src/s_shiftRightJam128Extra.c src/s_normRoundPackToExtF80.c src/s_approxRecip_1Ks.c src/s_approxRecipSqrt32_1.c src/s_approxRecipSqrt_1Ks.c src/softfloat_raiseFlags.c src/f64_to_extF80.c src/s_commonNaNToExtF80UI.c src/s_normSubnormalF64Sig.c src/s_f64UIToCommonNaN.c src/extF80_roundToInt.c src/extF80_eq.c 
src/extF80_lt.c src/f32_to_extF80.c src/s_normSubnormalF32Sig.c src/s_f32UIToCommonNaN.c) if (ARCHITECTURE_arm64 AND HAS_CLANG_PRESERVE_ALL) list(APPEND DEFINES "-DFEXCORE_PRESERVE_ALL_ATTR=__attribute__((preserve_all));-DFEXCORE_HAS_PRESERVE_ALL_ATTR=1") else() list(APPEND DEFINES "-DFEXCORE_PRESERVE_ALL_ATTR=;-DFEXCORE_HAS_PRESERVE_ALL_ATTR=0") endif() list(APPEND DEFINES "-DSOFTFLOAT_BUILTIN_CLZ=1;-DINLINE=static inline;-DINLINE_LEVEL=4;-DSOFTFLOAT_FAST_INT64=1;-DSOFTFLOAT_FAST_DIV32TO16=1;-DSOFTFLOAT_FAST_DIV64TO32=1") target_include_directories(softfloat_3e PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/) target_include_directories(softfloat_3e PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/SoftFloat-3e/) target_compile_definitions(softfloat_3e PUBLIC ${DEFINES}) ================================================ FILE: External/SoftFloat-3e/include/SoftFloat-3e/opts-GCC.h ================================================ /*============================================================================ This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #ifndef opts_GCC_h #define opts_GCC_h 1 #ifdef INLINE #include #include "primitiveTypes.h" #ifdef SOFTFLOAT_BUILTIN_CLZ INLINE uint_fast8_t softfloat_countLeadingZeros16( uint16_t a ) { return a ? __builtin_clz( a ) - 16 : 16; } #define softfloat_countLeadingZeros16 softfloat_countLeadingZeros16 INLINE uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ) { return a ? __builtin_clz( a ) : 32; } #define softfloat_countLeadingZeros32 softfloat_countLeadingZeros32 INLINE uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ) { return a ? 
__builtin_clzll( a ) : 64; } #define softfloat_countLeadingZeros64 softfloat_countLeadingZeros64 #endif #ifdef SOFTFLOAT_INTRINSIC_INT128 INLINE struct uint128 softfloat_mul64ByShifted32To128( uint64_t a, uint32_t b ) { union { unsigned __int128 ui; struct uint128 s; } uZ; uZ.ui = (unsigned __int128) a * ((uint_fast64_t) b<<32); return uZ.s; } #define softfloat_mul64ByShifted32To128 softfloat_mul64ByShifted32To128 INLINE struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ) { union { unsigned __int128 ui; struct uint128 s; } uZ; uZ.ui = (unsigned __int128) a * b; return uZ.s; } #define softfloat_mul64To128 softfloat_mul64To128 INLINE struct uint128 softfloat_mul128By32( uint64_t a64, uint64_t a0, uint32_t b ) { union { unsigned __int128 ui; struct uint128 s; } uZ; uZ.ui = ((unsigned __int128) a64<<64 | a0) * b; return uZ.s; } #define softfloat_mul128By32 softfloat_mul128By32 INLINE void softfloat_mul128To256M( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0, uint64_t *zPtr ) { unsigned __int128 z0, mid1, mid, z128; z0 = (unsigned __int128) a0 * b0; mid1 = (unsigned __int128) a64 * b0; mid = mid1 + (unsigned __int128) a0 * b64; z128 = (unsigned __int128) a64 * b64; z128 += (unsigned __int128) (mid < mid1)<<64 | mid>>64; mid <<= 64; z0 += mid; z128 += (z0 < mid); zPtr[indexWord( 4, 0 )] = z0; zPtr[indexWord( 4, 1 )] = z0>>64; zPtr[indexWord( 4, 2 )] = z128; zPtr[indexWord( 4, 3 )] = z128>>64; } #define softfloat_mul128To256M softfloat_mul128To256M #endif #endif #endif ================================================ FILE: External/SoftFloat-3e/include/SoftFloat-3e/platform.h ================================================ /*============================================================================ This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*----------------------------------------------------------------------------
| Byte order of the target.  When LITTLEENDIAN is defined, primitiveTypes.h
| and softfloat_types.h select their little-endian struct layouts and
| word-index macros; otherwise their big-endian variants are used.
*----------------------------------------------------------------------------*/
#define LITTLEENDIAN 1

/*----------------------------------------------------------------------------
| Feature switches consumed by the other SoftFloat headers:
|  - SOFTFLOAT_BUILTIN_CLZ enables the '__builtin_clz'-based
|    count-leading-zeros helpers in opts-GCC.h.
|  - SOFTFLOAT_INTRINSIC_INT128 enables the 'unsigned __int128' multiply
|    helpers in opts-GCC.h.
|  - SOFTFLOAT_FAST_INT64 makes primitiveTypes.h declare 'struct uint128' and
|    friends, and makes softfloat.h declare the by-value extFloat80_t and
|    float128_t functions.
| opts-GCC.h wraps all of its helpers in '#ifdef INLINE'; INLINE is not
| defined in this header.
*----------------------------------------------------------------------------*/
#define SOFTFLOAT_BUILTIN_CLZ 1
#define SOFTFLOAT_INTRINSIC_INT128 1
#define SOFTFLOAT_FAST_INT64 1
#include "opts-GCC.h"
/*----------------------------------------------------------------------------
| Multiword integer structs, only available with SOFTFLOAT_FAST_INT64.  The
| member order is swapped between the little-endian and big-endian variants;
| opts-GCC.h overlays 'struct uint128' on an 'unsigned __int128' through a
| union.  In the field names, 'v0' / 'v64' are the words starting at bit 0 and
| bit 64 respectively.
| NOTE(review): 'extra' presumably carries additional low-order bits below
| 'v' -- confirm against the users in primitives.h.
*----------------------------------------------------------------------------*/
#ifdef SOFTFLOAT_FAST_INT64

#ifdef LITTLEENDIAN
struct uint128 { uint64_t v0, v64; };
struct uint64_extra { uint64_t extra, v; };
struct uint128_extra { uint64_t extra; struct uint128 v; };
#else
struct uint128 { uint64_t v64, v0; };
struct uint64_extra { uint64_t v, extra; };
struct uint128_extra { struct uint128 v; uint64_t extra; };
#endif

#endif

/*----------------------------------------------------------------------------
| These macros are used to isolate the differences in word order between big-
| endian and little-endian platforms.
|
| For an array of 'total' words:
|  - 'wordIncr' is the index step from one word to the next more-significant
|    word (+1 little-endian, -1 big-endian).
|  - 'indexWord( total, n )' is the array index of word 'n', counting from the
|    least-significant word.
|  - 'indexWordHi' / 'indexWordLo' are the indices of the most- and least-
|    significant words.
|  - 'INIT_UINTM4' lists four words given most-significant first in the
|    platform's memory order.
| NOTE(review): the 'indexMultiword*' macros presumably give the lowest array
| index of a multiword slice -- confirm against their users.
*----------------------------------------------------------------------------*/
#ifdef LITTLEENDIAN
#define wordIncr 1
#define indexWord( total, n ) (n)
#define indexWordHi( total ) ((total) - 1)
#define indexWordLo( total ) 0
#define indexMultiword( total, m, n ) (n)
#define indexMultiwordHi( total, n ) ((total) - (n))
#define indexMultiwordLo( total, n ) 0
#define indexMultiwordHiBut( total, n ) (n)
#define indexMultiwordLoBut( total, n ) 0
#define INIT_UINTM4( v3, v2, v1, v0 ) { v0, v1, v2, v3 }
#else
#define wordIncr -1
#define indexWord( total, n ) ((total) - 1 - (n))
#define indexWordHi( total ) 0
#define indexWordLo( total ) ((total) - 1)
#define indexMultiword( total, m, n ) ((total) - 1 - (m))
#define indexMultiwordHi( total, n ) 0
#define indexMultiwordLo( total, n ) ((total) - (n))
#define indexMultiwordHiBut( total, n ) 0
#define indexMultiwordLoBut( total, n ) (n)
#define INIT_UINTM4( v3, v2, v1, v0 ) { v3, v2, v1, v0 }
#endif
/*----------------------------------------------------------------------------
| Routine to raise any or all of the software floating-point exception flags.
| The first argument is the per-caller 'struct softfloat_state'; the second is
| a mask of 'softfloat_flag_*' values (callers such as extF80_div pass e.g.
| 'softfloat_flag_infinite').
| NOTE(review): presumably ORs the mask into 'exceptionFlags' of the state --
| the implementation is not in this header; confirm.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
void softfloat_raiseFlags( struct softfloat_state *, uint_fast8_t );
/*----------------------------------------------------------------------------
| Naming is '<source integer type>_to_<destination format>'.  The by-value
| extFloat80_t / float128_t variants exist only with SOFTFLOAT_FAST_INT64; the
| 'M'-suffixed variants return void and take a pointer to the destination
| instead.  None of these prototypes takes a 'struct softfloat_state *'.
| NOTE(review): the repository's External/SoftFloat-3e/src listing shows
| i32_to_extF80.c and i32_to_f128.c but no other i32_* or any i64_* source
| file, so most of these appear to be declarations only -- confirm before
| relying on them.
*----------------------------------------------------------------------------*/
float16_t ui32_to_f16( uint32_t );
float32_t ui32_to_f32( uint32_t );
float64_t ui32_to_f64( uint32_t );
#ifdef SOFTFLOAT_FAST_INT64
extFloat80_t ui32_to_extF80( uint32_t );
float128_t ui32_to_f128( uint32_t );
#endif
void ui32_to_extF80M( uint32_t, extFloat80_t * );
void ui32_to_f128M( uint32_t, float128_t * );
float16_t ui64_to_f16( uint64_t );
float32_t ui64_to_f32( uint64_t );
float64_t ui64_to_f64( uint64_t );
#ifdef SOFTFLOAT_FAST_INT64
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t ui64_to_extF80( uint64_t );
float128_t ui64_to_f128( uint64_t );
#endif
void ui64_to_extF80M( uint64_t, extFloat80_t * );
void ui64_to_f128M( uint64_t, float128_t * );
float16_t i32_to_f16( int32_t );
float32_t i32_to_f32( int32_t );
float64_t i32_to_f64( int32_t );
#ifdef SOFTFLOAT_FAST_INT64
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t i32_to_extF80( int32_t );
float128_t i32_to_f128( int32_t );
#endif
void i32_to_extF80M( int32_t, extFloat80_t * );
void i32_to_f128M( int32_t, float128_t * );
float16_t i64_to_f16( int64_t );
float32_t i64_to_f32( int64_t );
float64_t i64_to_f64( int64_t );
#ifdef SOFTFLOAT_FAST_INT64
extFloat80_t i64_to_extF80( int64_t );
float128_t i64_to_f128( int64_t );
#endif
void i64_to_extF80M( int64_t, extFloat80_t * );
void i64_to_f128M( int64_t, float128_t * );
/*----------------------------------------------------------------------------
| None of the f16 prototypes takes a 'struct softfloat_state *'.
| NOTE(review): for the '*_to_<int>' conversions the 'uint_fast8_t, bool' pair
| is presumably (rounding mode, exact) as in upstream SoftFloat, and the
| '_r_minMag' variants presumably always round toward zero -- parameter names
| are not given here; confirm.
| NOTE(review): the repository's External/SoftFloat-3e/src listing contains no
| f16_*.c files, so these appear to be declarations only -- confirm before
| calling.
*----------------------------------------------------------------------------*/
/* Conversions to integer types. */
uint_fast32_t f16_to_ui32( float16_t, uint_fast8_t, bool );
uint_fast64_t f16_to_ui64( float16_t, uint_fast8_t, bool );
int_fast32_t f16_to_i32( float16_t, uint_fast8_t, bool );
int_fast64_t f16_to_i64( float16_t, uint_fast8_t, bool );
uint_fast32_t f16_to_ui32_r_minMag( float16_t, bool );
uint_fast64_t f16_to_ui64_r_minMag( float16_t, bool );
int_fast32_t f16_to_i32_r_minMag( float16_t, bool );
int_fast64_t f16_to_i64_r_minMag( float16_t, bool );
/* Conversions to other floating-point formats. */
float32_t f16_to_f32( float16_t );
float64_t f16_to_f64( float16_t );
#ifdef SOFTFLOAT_FAST_INT64
extFloat80_t f16_to_extF80( float16_t );
float128_t f16_to_f128( float16_t );
#endif
void f16_to_extF80M( float16_t, extFloat80_t * );
void f16_to_f128M( float16_t, float128_t * );
/* Arithmetic. */
float16_t f16_roundToInt( float16_t, uint_fast8_t, bool );
float16_t f16_add( float16_t, float16_t );
float16_t f16_sub( float16_t, float16_t );
float16_t f16_mul( float16_t, float16_t );
float16_t f16_mulAdd( float16_t, float16_t, float16_t );
float16_t f16_div( float16_t, float16_t );
float16_t f16_rem( float16_t, float16_t );
float16_t f16_sqrt( float16_t );
/* Comparisons and classification. */
bool f16_eq( float16_t, float16_t );
bool f16_le( float16_t, float16_t );
bool f16_lt( float16_t, float16_t );
bool f16_eq_signaling( float16_t, float16_t );
bool f16_le_quiet( float16_t, float16_t );
bool f16_lt_quiet( float16_t, float16_t );
bool f16_isSignalingNaN( float16_t );
/*----------------------------------------------------------------------------
| Only 'f32_to_extF80' and 'f32_to_f128' take a 'struct softfloat_state *' as
| their first argument; every other f32 prototype here is state-less.
| NOTE(review): for the '*_to_<int>' conversions the 'uint_fast8_t, bool' pair
| is presumably (rounding mode, exact) as in upstream SoftFloat -- parameter
| names are not given here; confirm.
| NOTE(review): the repository's External/SoftFloat-3e/src listing shows only
| f32_to_extF80.c and f32_to_f128.c for this format, so the remaining
| prototypes appear to be declarations only -- confirm before calling.
*----------------------------------------------------------------------------*/
/* Conversions to integer types. */
uint_fast32_t f32_to_ui32( float32_t, uint_fast8_t, bool );
uint_fast64_t f32_to_ui64( float32_t, uint_fast8_t, bool );
int_fast32_t f32_to_i32( float32_t, uint_fast8_t, bool );
int_fast64_t f32_to_i64( float32_t, uint_fast8_t, bool );
uint_fast32_t f32_to_ui32_r_minMag( float32_t, bool );
uint_fast64_t f32_to_ui64_r_minMag( float32_t, bool );
int_fast32_t f32_to_i32_r_minMag( float32_t, bool );
int_fast64_t f32_to_i64_r_minMag( float32_t, bool );
/* Conversions to other floating-point formats. */
float16_t f32_to_f16( float32_t );
float64_t f32_to_f64( float32_t );
#ifdef SOFTFLOAT_FAST_INT64
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t f32_to_extF80( struct softfloat_state *, float32_t );
float128_t f32_to_f128( struct softfloat_state *, float32_t );
#endif
void f32_to_extF80M( float32_t, extFloat80_t * );
void f32_to_f128M( float32_t, float128_t * );
/* Arithmetic. */
float32_t f32_roundToInt( float32_t, uint_fast8_t, bool );
float32_t f32_add( float32_t, float32_t );
float32_t f32_sub( float32_t, float32_t );
float32_t f32_mul( float32_t, float32_t );
float32_t f32_mulAdd( float32_t, float32_t, float32_t );
float32_t f32_div( float32_t, float32_t );
float32_t f32_rem( float32_t, float32_t );
float32_t f32_sqrt( float32_t );
/* Comparisons and classification. */
bool f32_eq( float32_t, float32_t );
bool f32_le( float32_t, float32_t );
bool f32_lt( float32_t, float32_t );
bool f32_eq_signaling( float32_t, float32_t );
bool f32_le_quiet( float32_t, float32_t );
bool f32_lt_quiet( float32_t, float32_t );
bool f32_isSignalingNaN( float32_t );
/*----------------------------------------------------------------------------
| Only 'f64_to_extF80' takes a 'struct softfloat_state *' as its first
| argument; every other f64 prototype here, including 'f64_to_f128', is
| state-less (unlike 'f32_to_f128', which does take the state).
| NOTE(review): for the '*_to_<int>' conversions the 'uint_fast8_t, bool' pair
| is presumably (rounding mode, exact) as in upstream SoftFloat -- parameter
| names are not given here; confirm.
| NOTE(review): the repository's External/SoftFloat-3e/src listing shows only
| f64_to_extF80.c for this format, so the remaining prototypes appear to be
| declarations only -- confirm before calling.
*----------------------------------------------------------------------------*/
/* Conversions to integer types. */
uint_fast32_t f64_to_ui32( float64_t, uint_fast8_t, bool );
uint_fast64_t f64_to_ui64( float64_t, uint_fast8_t, bool );
int_fast32_t f64_to_i32( float64_t, uint_fast8_t, bool );
int_fast64_t f64_to_i64( float64_t, uint_fast8_t, bool );
uint_fast32_t f64_to_ui32_r_minMag( float64_t, bool );
uint_fast64_t f64_to_ui64_r_minMag( float64_t, bool );
int_fast32_t f64_to_i32_r_minMag( float64_t, bool );
int_fast64_t f64_to_i64_r_minMag( float64_t, bool );
/* Conversions to other floating-point formats. */
float16_t f64_to_f16( float64_t );
float32_t f64_to_f32( float64_t );
#ifdef SOFTFLOAT_FAST_INT64
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t f64_to_extF80( struct softfloat_state *, float64_t );
float128_t f64_to_f128( float64_t );
#endif
void f64_to_extF80M( float64_t, extFloat80_t * );
void f64_to_f128M( float64_t, float128_t * );
/* Arithmetic. */
float64_t f64_roundToInt( float64_t, uint_fast8_t, bool );
float64_t f64_add( float64_t, float64_t );
float64_t f64_sub( float64_t, float64_t );
float64_t f64_mul( float64_t, float64_t );
float64_t f64_mulAdd( float64_t, float64_t, float64_t );
float64_t f64_div( float64_t, float64_t );
float64_t f64_rem( float64_t, float64_t );
float64_t f64_sqrt( float64_t );
/* Comparisons and classification. */
bool f64_eq( float64_t, float64_t );
bool f64_le( float64_t, float64_t );
bool f64_lt( float64_t, float64_t );
bool f64_eq_signaling( float64_t, float64_t );
bool f64_le_quiet( float64_t, float64_t );
bool f64_lt_quiet( float64_t, float64_t );
bool f64_isSignalingNaN( float64_t );
/*----------------------------------------------------------------------------
| By-value extFloat80_t API.  The prototypes that take a
| 'struct softfloat_state *' as first argument are also the ones marked
| FEXCORE_PRESERVE_ALL_ATTR; the remaining ones ('extF80_to_ui32', the
| '_r_minMag' conversions and 'extF80_to_f16') are state-less and unmarked.
| NOTE(review): the repository's External/SoftFloat-3e/src listing has source
| files only for the state-taking functions, so the state-less prototypes
| appear to be declarations only -- confirm before calling.
| NOTE(review): for the '*_to_<int>' conversions and 'extF80_roundToInt' the
| 'uint_fast8_t, bool' pair is presumably (rounding mode, exact) as in
| upstream SoftFloat -- parameter names are not given here; confirm.
*----------------------------------------------------------------------------*/
/* Conversions to integer types. */
uint_fast32_t extF80_to_ui32( extFloat80_t, uint_fast8_t, bool );
FEXCORE_PRESERVE_ALL_ATTR
uint_fast64_t extF80_to_ui64( struct softfloat_state *, extFloat80_t, uint_fast8_t, bool );
FEXCORE_PRESERVE_ALL_ATTR
int_fast32_t extF80_to_i32( struct softfloat_state *, extFloat80_t, uint_fast8_t, bool );
FEXCORE_PRESERVE_ALL_ATTR
int_fast64_t extF80_to_i64( struct softfloat_state *, extFloat80_t, uint_fast8_t, bool );
uint_fast32_t extF80_to_ui32_r_minMag( extFloat80_t, bool );
uint_fast64_t extF80_to_ui64_r_minMag( extFloat80_t, bool );
int_fast32_t extF80_to_i32_r_minMag( extFloat80_t, bool );
int_fast64_t extF80_to_i64_r_minMag( extFloat80_t, bool );
/* Conversions to other floating-point formats. */
float16_t extF80_to_f16( extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
float32_t extF80_to_f32( struct softfloat_state *, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
float64_t extF80_to_f64( struct softfloat_state *, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
float128_t extF80_to_f128( struct softfloat_state *, extFloat80_t );
/* Arithmetic. */
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_roundToInt( struct softfloat_state *, extFloat80_t, uint_fast8_t, bool );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_add( struct softfloat_state *, extFloat80_t, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_sub( struct softfloat_state *, extFloat80_t, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_mul( struct softfloat_state *, extFloat80_t, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_div( struct softfloat_state *, extFloat80_t, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_rem( struct softfloat_state *, extFloat80_t, extFloat80_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_sqrt( struct softfloat_state *, extFloat80_t );
/* Comparisons. */
FEXCORE_PRESERVE_ALL_ATTR
bool extF80_eq( struct softfloat_state *, extFloat80_t, extFloat80_t );
extFloat80_t ); FEXCORE_PRESERVE_ALL_ATTR bool extF80_lt( struct softfloat_state *, extFloat80_t, extFloat80_t ); bool extF80_eq_signaling( extFloat80_t, extFloat80_t ); bool extF80_le_quiet( extFloat80_t, extFloat80_t ); bool extF80_lt_quiet( extFloat80_t, extFloat80_t ); bool extF80_isSignalingNaN( extFloat80_t ); static inline extFloat80_t extF80_complement_sign(extFloat80_t a) { a.signExp ^= 1ULL << 15; return a; } #endif uint_fast32_t extF80M_to_ui32( const extFloat80_t *, uint_fast8_t, bool ); uint_fast64_t extF80M_to_ui64( const extFloat80_t *, uint_fast8_t, bool ); int_fast32_t extF80M_to_i32( const extFloat80_t *, uint_fast8_t, bool ); int_fast64_t extF80M_to_i64( const extFloat80_t *, uint_fast8_t, bool ); uint_fast32_t extF80M_to_ui32_r_minMag( const extFloat80_t *, bool ); uint_fast64_t extF80M_to_ui64_r_minMag( const extFloat80_t *, bool ); int_fast32_t extF80M_to_i32_r_minMag( const extFloat80_t *, bool ); int_fast64_t extF80M_to_i64_r_minMag( const extFloat80_t *, bool ); float16_t extF80M_to_f16( const extFloat80_t * ); float32_t extF80M_to_f32( const extFloat80_t * ); float64_t extF80M_to_f64( const extFloat80_t * ); void extF80M_to_f128M( const extFloat80_t *, float128_t * ); void extF80M_roundToInt( const extFloat80_t *, uint_fast8_t, bool, extFloat80_t * ); void extF80M_add( const extFloat80_t *, const extFloat80_t *, extFloat80_t * ); void extF80M_sub( const extFloat80_t *, const extFloat80_t *, extFloat80_t * ); void extF80M_mul( const extFloat80_t *, const extFloat80_t *, extFloat80_t * ); void extF80M_div( const extFloat80_t *, const extFloat80_t *, extFloat80_t * ); void extF80M_rem( const extFloat80_t *, const extFloat80_t *, extFloat80_t * ); void extF80M_sqrt( const extFloat80_t *, extFloat80_t * ); bool extF80M_eq( const extFloat80_t *, const extFloat80_t * ); bool extF80M_le( const extFloat80_t *, const extFloat80_t * ); bool extF80M_lt( const extFloat80_t *, const extFloat80_t * ); bool extF80M_eq_signaling( const extFloat80_t *, 
/*----------------------------------------------------------------------------
| By-value float128_t API.  Most prototypes take a 'struct softfloat_state *'
| as first argument; the '_r_minMag' conversions and 'f128_roundToInt' do
| not.  Only 'f128_to_extF80' is marked FEXCORE_PRESERVE_ALL_ATTR.
| NOTE(review): the repository's External/SoftFloat-3e/src listing has no
| source files for the state-less prototypes in this group, so they appear to
| be declarations only -- confirm before calling.
| NOTE(review): for the '*_to_<int>' conversions and 'f128_roundToInt' the
| 'uint_fast8_t, bool' pair is presumably (rounding mode, exact) as in
| upstream SoftFloat -- parameter names are not given here; confirm.
*----------------------------------------------------------------------------*/
/* Conversions to integer types. */
uint_fast32_t f128_to_ui32( struct softfloat_state *, float128_t, uint_fast8_t, bool );
uint_fast64_t f128_to_ui64( struct softfloat_state *, float128_t, uint_fast8_t, bool );
int_fast32_t f128_to_i32( struct softfloat_state *, float128_t, uint_fast8_t, bool );
int_fast64_t f128_to_i64( struct softfloat_state *, float128_t, uint_fast8_t, bool );
uint_fast32_t f128_to_ui32_r_minMag( float128_t, bool );
uint_fast64_t f128_to_ui64_r_minMag( float128_t, bool );
int_fast32_t f128_to_i32_r_minMag( float128_t, bool );
int_fast64_t f128_to_i64_r_minMag( float128_t, bool );
/* Conversions to other floating-point formats. */
float16_t f128_to_f16( struct softfloat_state *, float128_t );
float32_t f128_to_f32( struct softfloat_state *, float128_t );
float64_t f128_to_f64( struct softfloat_state *, float128_t );
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t f128_to_extF80( struct softfloat_state *, float128_t );
/* Arithmetic. */
float128_t f128_roundToInt( float128_t, uint_fast8_t, bool );
float128_t f128_add( struct softfloat_state *, float128_t, float128_t );
float128_t f128_sub( struct softfloat_state *, float128_t, float128_t );
float128_t f128_mul( struct softfloat_state *, float128_t, float128_t );
float128_t f128_mulAdd( struct softfloat_state *, float128_t, float128_t, float128_t );
float128_t f128_div( struct softfloat_state *, float128_t, float128_t );
float128_t f128_rem( struct softfloat_state *, float128_t, float128_t );
float128_t f128_sqrt( struct softfloat_state *, float128_t );
/* Comparisons. */
bool f128_eq( struct softfloat_state *, float128_t, float128_t );
float128_t, float128_t ); bool f128_lt( struct softfloat_state *, float128_t, float128_t ); bool f128_eq_signaling( struct softfloat_state *, float128_t, float128_t ); bool f128_le_quiet( struct softfloat_state *, float128_t, float128_t ); bool f128_lt_quiet( struct softfloat_state *, float128_t, float128_t ); bool f128_isSignalingNaN( float128_t ); static inline float128_t f128_complement_sign(float128_t a) { a.v[1] ^= 1ULL << 63; return a; } #endif uint_fast32_t f128M_to_ui32( const float128_t *, uint_fast8_t, bool ); uint_fast64_t f128M_to_ui64( const float128_t *, uint_fast8_t, bool ); int_fast32_t f128M_to_i32( const float128_t *, uint_fast8_t, bool ); int_fast64_t f128M_to_i64( const float128_t *, uint_fast8_t, bool ); uint_fast32_t f128M_to_ui32_r_minMag( const float128_t *, bool ); uint_fast64_t f128M_to_ui64_r_minMag( const float128_t *, bool ); int_fast32_t f128M_to_i32_r_minMag( const float128_t *, bool ); int_fast64_t f128M_to_i64_r_minMag( const float128_t *, bool ); float16_t f128M_to_f16( const float128_t * ); float32_t f128M_to_f32( const float128_t * ); float64_t f128M_to_f64( const float128_t * ); void f128M_to_extF80M( const float128_t *, extFloat80_t * ); void f128M_roundToInt( const float128_t *, uint_fast8_t, bool, float128_t * ); void f128M_add( const float128_t *, const float128_t *, float128_t * ); void f128M_sub( const float128_t *, const float128_t *, float128_t * ); void f128M_mul( const float128_t *, const float128_t *, float128_t * ); void f128M_mulAdd( const float128_t *, const float128_t *, const float128_t *, float128_t * ); void f128M_div( const float128_t *, const float128_t *, float128_t * ); void f128M_rem( const float128_t *, const float128_t *, float128_t * ); void f128M_sqrt( const float128_t *, float128_t * ); bool f128M_eq( const float128_t *, const float128_t * ); bool f128M_le( const float128_t *, const float128_t * ); bool f128M_lt( const float128_t *, const float128_t * ); bool f128M_eq_signaling( const float128_t *, 
const float128_t * ); bool f128M_le_quiet( const float128_t *, const float128_t * ); bool f128M_lt_quiet( const float128_t *, const float128_t * ); bool f128M_isSignalingNaN( const float128_t * ); #endif ================================================ FILE: External/SoftFloat-3e/include/SoftFloat-3e/softfloat_types.h ================================================ /*============================================================================ This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
/*----------------------------------------------------------------------------
| Types used to pass 16-bit, 32-bit, 64-bit, and 128-bit floating-point
| arguments and results to/from functions.  These types must be exactly
| 16 bits, 32 bits, 64 bits, and 128 bits in size, respectively.  Where a
| platform has "native" support for IEEE-Standard floating-point formats,
| the types below may, if desired, be defined as aliases for the native types
| (typically 'float' and 'double', and possibly 'long double').
|
| Here each type is a struct wrapping the raw bit pattern in member 'v';
| 'float128_t' stores it as two 64-bit words ('f128_complement_sign' in
| softfloat.h treats bit 63 of 'v[1]' as the sign bit).
*----------------------------------------------------------------------------*/
typedef struct { uint16_t v; } float16_t;
typedef struct { uint32_t v; } float32_t;
typedef struct { uint64_t v; } float64_t;
typedef struct { uint64_t v[2]; } float128_t;

/*----------------------------------------------------------------------------
| The format of an 80-bit extended floating-point number in memory.  This
| structure must contain a 16-bit field named 'signExp' and a 64-bit field
| named 'signif'.  The member order depends on LITTLEENDIAN.
*----------------------------------------------------------------------------*/
#ifdef LITTLEENDIAN
struct extFloat80M { uint64_t signif; uint16_t signExp; };
#else
struct extFloat80M { uint16_t signExp; uint64_t signif; };
#endif

/*----------------------------------------------------------------------------
| The type used to pass 80-bit extended floating-point arguments and
| results to/from functions.  This type must have size identical to
| 'struct extFloat80M'.  Type 'extFloat80_t' can be defined as an alias for
| 'struct extFloat80M'.  Alternatively, if a platform has "native" support
| for IEEE-Standard 80-bit extended floating-point, it may be possible,
| if desired, to define 'extFloat80_t' as an alias for the native type
| (presumably either 'long double' or a nonstandard compiler-intrinsic type).
| In that case, the 'signif' and 'signExp' fields of 'struct extFloat80M'
| must align exactly with the locations in memory of the sign, exponent, and
| significand of the native type.
|
| In this header the alias form is used.
*----------------------------------------------------------------------------*/
typedef struct extFloat80M extFloat80_t;

/* Tininess-detection modes.
   NOTE(review): presumably the values stored in
   softfloat_state.detectTininess -- confirm. */
enum {
    softfloat_tininess_beforeRounding = 0,
    softfloat_tininess_afterRounding  = 1
};

/* Rounding modes; 'softfloat_round_near_even' is the documented initial value
   of softfloat_state.roundingMode.  Value 5 is not assigned. */
enum {
    softfloat_round_near_even   = 0,
    softfloat_round_minMag      = 1,
    softfloat_round_min         = 2,
    softfloat_round_max         = 3,
    softfloat_round_near_maxMag = 4,
    softfloat_round_odd         = 6
};

/* Exception flags.  Each value is a distinct bit, so several flags can be
   combined in one 8-bit mask (as passed to softfloat_raiseFlags). */
enum {
    softfloat_flag_inexact   =  1,
    softfloat_flag_underflow =  2,
    softfloat_flag_overflow  =  4,
    softfloat_flag_infinite  =  8,
    softfloat_flag_invalid   = 16
};

/*----------------------------------------------------------------------------
| Floating-point environment that callers pass by pointer as the first
| argument of the state-taking functions declared in softfloat.h.  The
| trailing comment on each member gives its documented initial value.
*----------------------------------------------------------------------------*/
struct softfloat_state {
    /*----------------------------------------------------------------------------
    | Software floating-point underflow tininess-detection mode.
    *----------------------------------------------------------------------------*/
    uint8_t detectTininess; /* = init_detectTininess */

    /*----------------------------------------------------------------------------
    | Software floating-point rounding mode.  (Mode "odd" is supported only if
    | SoftFloat is compiled with macro 'SOFTFLOAT_ROUND_ODD' defined.)
    *----------------------------------------------------------------------------*/
    uint8_t roundingMode; /* = softfloat_round_near_even */

    /*----------------------------------------------------------------------------
    | Software floating-point exception flags.
    *----------------------------------------------------------------------------*/
    uint8_t exceptionFlags; /* = 0 */

    /*----------------------------------------------------------------------------
    | Rounding precision for 80-bit extended double-precision floating-point.
    | Valid values are 32, 64, and 80.
    *----------------------------------------------------------------------------*/
    uint8_t roundingPrecision; /* = 80 */
};
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t extF80_add( struct softfloat_state *state, extFloat80_t a, extFloat80_t b ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; bool signA; union { struct extFloat80M s; extFloat80_t f; } uB; uint_fast16_t uiB64; uint_fast64_t uiB0; bool signB; extFloat80_t (*magsFuncPtr)( struct softfloat_state *, uint_fast16_t, uint_fast64_t, uint_fast16_t, uint_fast64_t, bool ); uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; signA = signExtF80UI64( uiA64 ); uB.f = b; uiB64 = uB.s.signExp; uiB0 = uB.s.signif; signB = signExtF80UI64( uiB64 ); magsFuncPtr = (signA == signB) ? 
softfloat_addMagsExtF80 : softfloat_subMagsExtF80; return (*magsFuncPtr)( state, uiA64, uiA0, uiB64, uiB0, signA ); } ================================================ FILE: External/SoftFloat-3e/src/extF80_div.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/

#include
#include
#include "platform.h"
#include "internals.h"
#include "specialize.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Divides the 80-bit extended value `a' by `b'.
|
| The ordinary path builds a quotient significand and hands it, together
| with the result sign and exponent, to softfloat_roundPackToExtF80 using
| `state->roundingPrecision'.  Special operands leave through the labels at
| the bottom of the function:
|   - propagateNaN: either operand has exponent 0x7FFF and a nonzero low
|     63 bits of significand; result comes from
|     softfloat_propagateNaNExtF80UI.
|   - invalid: `a' and `b' both have exponent 0x7FFF (and neither took the
|     NaN path), or both significands are zero; raises
|     softfloat_flag_invalid and returns the default NaN pattern.
|   - infinity: `a' has exponent 0x7FFF and `b' does not (no flag), or `b'
|     has a zero significand and `a' does not (softfloat_flag_infinite is
|     raised first).
|   - zero: `b' has exponent 0x7FFF and `a' does not, or `a' has a zero
|     significand and `b' does not.
| The sign of every non-NaN result is `signA ^ signB'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_div( struct softfloat_state *state, extFloat80_t a, extFloat80_t b )
{
    union { struct extFloat80M s; extFloat80_t f; } uA;
    uint_fast16_t uiA64;
    uint_fast64_t uiA0;
    bool signA;
    int_fast32_t expA;
    uint_fast64_t sigA;
    union { struct extFloat80M s; extFloat80_t f; } uB;
    uint_fast16_t uiB64;
    uint_fast64_t uiB0;
    bool signB;
    int_fast32_t expB;
    uint_fast64_t sigB;
    bool signZ;
    struct exp32_sig64 normExpSig;
    int_fast32_t expZ;
    struct uint128 rem;
    uint_fast32_t recip32;
    uint_fast64_t sigZ;
    int ix;
    uint_fast64_t q64;
    uint_fast32_t q;
    struct uint128 term;
    uint_fast64_t sigZExtra;
    struct uint128 uiZ;
    uint_fast16_t uiZ64;
    uint_fast64_t uiZ0;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Unpack both operands into sign, exponent field and significand.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.s.signExp;
    uiA0  = uA.s.signif;
    signA = signExtF80UI64( uiA64 );
    expA  = expExtF80UI64( uiA64 );
    sigA  = uiA0;
    uB.f = b;
    uiB64 = uB.s.signExp;
    uiB0  = uB.s.signif;
    signB = signExtF80UI64( uiB64 );
    expB  = expExtF80UI64( uiB64 );
    sigB  = uiB0;
    signZ = signA ^ signB;
    /*------------------------------------------------------------------------
    | Operands whose exponent field is all ones (0x7FFF).  A nonzero low
    | 63 bits of significand selects the NaN path; otherwise the operand is
    | treated as an infinity.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if ( sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN;
        if ( expB == 0x7FFF ) {
            if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN;
            goto invalid;
        }
        goto infinity;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN;
        goto zero;
    }
    /*------------------------------------------------------------------------
    | A zero exponent field is treated as exponent 1.  If the top significand
    | bit is clear the operand is either zero (handled here) or is
    | renormalized through softfloat_normSubnormalExtF80Sig, whose `exp'
    | result is added to the exponent.
    *------------------------------------------------------------------------*/
    if ( ! expB ) expB = 1;
    if ( ! (sigB & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigB ) {
            /* Divisor significand is zero: 0/0 is invalid, x/0 is infinite. */
            if ( ! sigA ) goto invalid;
            softfloat_raiseFlags( state, softfloat_flag_infinite );
            goto infinity;
        }
        normExpSig = softfloat_normSubnormalExtF80Sig( sigB );
        expB += normExpSig.exp;
        sigB = normExpSig.sig;
    }
    if ( ! expA ) expA = 1;
    if ( ! (sigA & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigA ) goto zero;
        normExpSig = softfloat_normSubnormalExtF80Sig( sigA );
        expA += normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Quotient loop.  `rem' starts as `sigA' shifted left by 32 (with the
    | exponent reduced by one) when sigA < sigB, otherwise by 31.  Each pass
    | estimates a quotient chunk `q' from the top of `rem' and `recip32';
    | the first two passes subtract q * sigB from the remainder, step `q'
    | down by one if the remainder went negative, and shift `q' into `sigZ'
    | 29 bits at a time.  The third pass only computes `q' and then breaks
    | (ix < 0), leaving that estimate for the code below.
    | NOTE(review): softfloat_approxRecip32_1 is defined elsewhere; it is
    | presumably an approximate reciprocal of the top 32 bits of `sigB'.
    *------------------------------------------------------------------------*/
    expZ = expA - expB + 0x3FFF;
    if ( sigA < sigB ) {
        --expZ;
        rem = softfloat_shortShiftLeft128( 0, sigA, 32 );
    } else {
        rem = softfloat_shortShiftLeft128( 0, sigA, 31 );
    }
    recip32 = softfloat_approxRecip32_1( sigB>>32 );
    sigZ = 0;
    ix = 2;
    for (;;) {
        q64 = (uint_fast64_t) (uint32_t) (rem.v64>>2) * recip32;
        q = (q64 + 0x80000000)>>32;
        --ix;
        if ( ix < 0 ) break;
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
        term = softfloat_mul64ByShifted32To128( sigB, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            --q;
            rem = softfloat_add128( rem.v64, rem.v0, sigB>>32, sigB<<32 );
        }
        sigZ = (sigZ<<29) + q;
    }
    /*------------------------------------------------------------------------
    | Taken only when the low 22 bits of `q' are all zeros or all ones
    | ((q + 1) masked to 22 bits is 0 or 1).  In that case the remainder for
    | the last estimate is computed, `q' is moved down or up by one if the
    | remainder is negative or at least sigB<<32, and the lowest bit of `q'
    | is forced on whenever a nonzero remainder is left.
    *------------------------------------------------------------------------*/
    if ( ((q + 1) & 0x3FFFFF) < 2 ) {
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
        term = softfloat_mul64ByShifted32To128( sigB, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        term = softfloat_shortShiftLeft128( 0, sigB, 32 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            --q;
            rem = softfloat_add128( rem.v64, rem.v0, term.v64, term.v0 );
        } else if ( softfloat_le128( term.v64, term.v0, rem.v64, rem.v0 ) ) {
            ++q;
            rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        }
        if ( rem.v64 | rem.v0 ) q |= 1;
    }
    /*------------------------------------------------------------------------
    | Merge the last estimate: the bits of `q' above bit 22 complete `sigZ',
    | the low 23 bits become the top of `sigZExtra'.
    *------------------------------------------------------------------------*/
    sigZ = (sigZ<<6) + (q>>23);
    sigZExtra = (uint64_t) ((uint_fast64_t) q<<41);
    return
        softfloat_roundPackToExtF80(
            state, signZ, expZ, sigZ, sigZExtra, state->roundingPrecision );
    /*------------------------------------------------------------------------
    | NaN operand: result is chosen by softfloat_propagateNaNExtF80UI.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, uiB64, uiB0 );
    uiZ64 = uiZ.v64;
    uiZ0  = uiZ.v0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Invalid operation: raise the flag and return the default NaN pattern.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ64 = defaultNaNExtF80UI64;
    uiZ0  = defaultNaNExtF80UI0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Signed infinity: exponent 0x7FFF with only the top significand bit set.
    *------------------------------------------------------------------------*/
 infinity:
    uiZ64 = packToExtF80UI64( signZ, 0x7FFF );
    uiZ0  = UINT64_C( 0x8000000000000000 );
    goto uiZ;
    /*------------------------------------------------------------------------
    | Signed zero, then the shared exit that repacks uiZ64/uiZ0.
    *------------------------------------------------------------------------*/
 zero:
    uiZ64 = packToExtF80UI64( signZ, 0 );
    uiZ0  = 0;
 uiZ:
    uZ.s.signExp = uiZ64;
    uZ.s.signif  = uiZ0;
    return uZ.f;
}

================================================
FILE: External/SoftFloat-3e/src/extF80_eq.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014 The Regents of the University of California.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1.
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR bool extF80_eq( struct softfloat_state *state, extFloat80_t a, extFloat80_t b ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; union { struct extFloat80M s; extFloat80_t f; } uB; uint_fast16_t uiB64; uint_fast64_t uiB0; uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; uB.f = b; uiB64 = uB.s.signExp; uiB0 = uB.s.signif; if ( isNaNExtF80UI( uiA64, uiA0 ) || isNaNExtF80UI( uiB64, uiB0 ) ) { if ( softfloat_isSigNaNExtF80UI( uiA64, uiA0 ) || softfloat_isSigNaNExtF80UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } return false; } return (uiA0 == uiB0) && ((uiA64 == uiB64) || (! uiA0 && ! ((uiA64 | uiB64) & 0x7FFF))); } ================================================ FILE: External/SoftFloat-3e/src/extF80_le.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" bool extF80_le( struct softfloat_state *state, extFloat80_t a, extFloat80_t b ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; union { struct extFloat80M s; extFloat80_t f; } uB; uint_fast16_t uiB64; uint_fast64_t uiB0; bool signA, signB; uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; uB.f = b; uiB64 = uB.s.signExp; uiB0 = uB.s.signif; if ( isNaNExtF80UI( uiA64, uiA0 ) || isNaNExtF80UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return false; } signA = signExtF80UI64( uiA64 ); signB = signExtF80UI64( uiB64 ); return (signA != signB) ? signA || ! 
(((uiA64 | uiB64) & 0x7FFF) | uiA0 | uiB0) : ((uiA64 == uiB64) && (uiA0 == uiB0)) || (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/extF80_lt.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR bool extF80_lt( struct softfloat_state *state, extFloat80_t a, extFloat80_t b ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; union { struct extFloat80M s; extFloat80_t f; } uB; uint_fast16_t uiB64; uint_fast64_t uiB0; bool signA, signB; uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; uB.f = b; uiB64 = uB.s.signExp; uiB0 = uB.s.signif; if ( isNaNExtF80UI( uiA64, uiA0 ) || isNaNExtF80UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return false; } signA = signExtF80UI64( uiA64 ); signB = signExtF80UI64( uiB64 ); return (signA != signB) ? signA && (((uiA64 | uiB64) & 0x7FFF) | uiA0 | uiB0) : ((uiA64 != uiB64) || (uiA0 != uiB0)) && (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/extF80_mul.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=============================================================================*/

#include
#include
#include "platform.h"
#include "internals.h"
#include "specialize.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Multiplies the 80-bit extended values `a' and `b'.
|
| The ordinary path forms the 128-bit product of the two significands and
| passes it, with the result sign and exponent, to
| softfloat_roundPackToExtF80 using `state->roundingPrecision'.  Special
| operands leave through the labels at the bottom of the function:
|   - propagateNaN: an operand has exponent 0x7FFF and a nonzero low 63 bits
|     of significand.
|   - infArg: one operand has exponent 0x7FFF without taking the NaN path.
|     If the other operand's exponent and significand are both zero,
|     softfloat_flag_invalid is raised and the default NaN pattern is
|     returned; otherwise a signed infinity is returned.
|   - zero: an operand has a zero significand.
| The sign of every non-NaN result is `signA ^ signB'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_mul( struct softfloat_state *state, extFloat80_t a, extFloat80_t b )
{
    union { struct extFloat80M s; extFloat80_t f; } uA;
    uint_fast16_t uiA64;
    uint_fast64_t uiA0;
    bool signA;
    int_fast32_t expA;
    uint_fast64_t sigA;
    union { struct extFloat80M s; extFloat80_t f; } uB;
    uint_fast16_t uiB64;
    uint_fast64_t uiB0;
    bool signB;
    int_fast32_t expB;
    uint_fast64_t sigB;
    bool signZ;
    uint_fast64_t magBits;
    struct exp32_sig64 normExpSig;
    int_fast32_t expZ;
    struct uint128 sig128Z, uiZ;
    uint_fast16_t uiZ64;
    uint_fast64_t uiZ0;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Unpack both operands into sign, exponent field and significand.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.s.signExp;
    uiA0  = uA.s.signif;
    signA = signExtF80UI64( uiA64 );
    expA  = expExtF80UI64( uiA64 );
    sigA  = uiA0;
    uB.f = b;
    uiB64 = uB.s.signExp;
    uiB0  = uB.s.signif;
    signB = signExtF80UI64( uiB64 );
    expB  = expExtF80UI64( uiB64 );
    sigB  = uiB0;
    signZ = signA ^ signB;
    /*------------------------------------------------------------------------
    | Operands whose exponent field is all ones.  `magBits' records whether
    | the *other* operand has any exponent or significand bit set; infArg
    | uses it to tell infinity-times-zero from infinity-times-nonzero.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if (
               (sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ))
            || ((expB == 0x7FFF) && (sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF )))
        ) {
            goto propagateNaN;
        }
        magBits = expB | sigB;
        goto infArg;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN;
        magBits = expA | sigA;
        goto infArg;
    }
    /*------------------------------------------------------------------------
    | A zero exponent field is treated as exponent 1.  If the top significand
    | bit is clear the operand is either zero (result is zero) or is
    | renormalized through softfloat_normSubnormalExtF80Sig, whose `exp'
    | result is added to the exponent.
    *------------------------------------------------------------------------*/
    if ( ! expA ) expA = 1;
    if ( ! (sigA & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigA ) goto zero;
        normExpSig = softfloat_normSubnormalExtF80Sig( sigA );
        expA += normExpSig.exp;
        sigA = normExpSig.sig;
    }
    if ( ! expB ) expB = 1;
    if ( ! (sigB & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigB ) goto zero;
        normExpSig = softfloat_normSubnormalExtF80Sig( sigB );
        expB += normExpSig.exp;
        sigB = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Multiply the significands.  If the top bit of the 128-bit product is
    | clear, the product is doubled (added to itself) and the exponent is
    | reduced by one before rounding.
    *------------------------------------------------------------------------*/
    expZ = expA + expB - 0x3FFE;
    sig128Z = softfloat_mul64To128( sigA, sigB );
    if ( sig128Z.v64 < UINT64_C( 0x8000000000000000 ) ) {
        --expZ;
        sig128Z =
            softfloat_add128(
                sig128Z.v64, sig128Z.v0, sig128Z.v64, sig128Z.v0 );
    }
    return
        softfloat_roundPackToExtF80(
            state, signZ, expZ, sig128Z.v64, sig128Z.v0, state->roundingPrecision );
    /*------------------------------------------------------------------------
    | NaN operand: result is chosen by softfloat_propagateNaNExtF80UI.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, uiB64, uiB0 );
    uiZ64 = uiZ.v64;
    uiZ0  = uiZ.v0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | One operand has exponent 0x7FFF and is not a NaN.  With no magnitude
    | bits in the other operand the operation is invalid; otherwise the
    | result is a signed infinity.
    *------------------------------------------------------------------------*/
 infArg:
    if ( ! magBits ) {
        softfloat_raiseFlags( state, softfloat_flag_invalid );
        uiZ64 = defaultNaNExtF80UI64;
        uiZ0  = defaultNaNExtF80UI0;
    } else {
        uiZ64 = packToExtF80UI64( signZ, 0x7FFF );
        uiZ0  = UINT64_C( 0x8000000000000000 );
    }
    goto uiZ;
    /*------------------------------------------------------------------------
    | Signed zero, then the shared exit that repacks uiZ64/uiZ0.
    *------------------------------------------------------------------------*/
 zero:
    uiZ64 = packToExtF80UI64( signZ, 0 );
    uiZ0  = 0;
 uiZ:
    uZ.s.signExp = uiZ64;
    uZ.s.signif  = uiZ0;
    return uZ.f;
}

================================================
FILE: External/SoftFloat-3e/src/extF80_rem.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
University of California. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/

#include
#include
#include "platform.h"
#include "internals.h"
#include "specialize.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Computes a remainder of the 80-bit extended value `a' with respect to `b'.
|
| The ordinary path reduces the significand of `a' against that of `b',
| picks between two candidate remainders (see selectRem) and returns the
| result through softfloat_normRoundPackToExtF80 with a fixed precision
| argument of 80.  Special operands leave through the labels at the bottom:
|   - propagateNaN: an operand has exponent 0x7FFF and a nonzero low 63 bits
|     of significand.
|   - invalid: `a' has exponent 0x7FFF without being a NaN, or `b' has a
|     zero significand; raises softfloat_flag_invalid and returns the
|     default NaN pattern.
|   - copyA: `a' has a zero significand, or its exponent is more than one
|     below that of `b'; `a' is returned after being repacked.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_rem( struct softfloat_state *state, extFloat80_t a, extFloat80_t b )
{
    union { struct extFloat80M s; extFloat80_t f; } uA;
    uint_fast16_t uiA64;
    uint_fast64_t uiA0;
    bool signA;
    int_fast32_t expA;
    uint_fast64_t sigA;
    union { struct extFloat80M s; extFloat80_t f; } uB;
    uint_fast16_t uiB64;
    uint_fast64_t uiB0;
    int_fast32_t expB;
    uint_fast64_t sigB;
    struct exp32_sig64 normExpSig;
    int_fast32_t expDiff;
    struct uint128 rem, shiftedSigB;
    uint_fast32_t q, recip32;
    uint_fast64_t q64;
    struct uint128 term, altRem, meanRem;
    bool signRem;
    struct uint128 uiZ;
    uint_fast16_t uiZ64;
    uint_fast64_t uiZ0;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Unpack the operands.  The sign of `b' is never extracted.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.s.signExp;
    uiA0  = uA.s.signif;
    signA = signExtF80UI64( uiA64 );
    expA  = expExtF80UI64( uiA64 );
    sigA  = uiA0;
    uB.f = b;
    uiB64 = uB.s.signExp;
    uiB0  = uB.s.signif;
    expB  = expExtF80UI64( uiB64 );
    sigB  = uiB0;
    /*------------------------------------------------------------------------
    | Operands whose exponent field is all ones.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if (
               (sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ))
            || ((expB == 0x7FFF) && (sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF )))
        ) {
            goto propagateNaN;
        }
        goto invalid;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN;
        /*--------------------------------------------------------------------
        | Argument b is an infinity.  Doubling `expB' is an easy way to ensure
        | that `expDiff' later is less than -1, which will result in returning
        | a canonicalized version of argument a.
        *--------------------------------------------------------------------*/
        expB += expB;
    }
    /*------------------------------------------------------------------------
    | A zero exponent field is treated as exponent 1.  If the top significand
    | bit is clear the operand is either zero or is renormalized through
    | softfloat_normSubnormalExtF80Sig.  A zero `b' is invalid; a zero `a' is
    | returned via copyA with its exponent reset to 0.
    *------------------------------------------------------------------------*/
    if ( ! expB ) expB = 1;
    if ( ! (sigB & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigB ) goto invalid;
        normExpSig = softfloat_normSubnormalExtF80Sig( sigB );
        expB += normExpSig.exp;
        sigB = normExpSig.sig;
    }
    if ( ! expA ) expA = 1;
    if ( ! (sigA & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigA ) {
            expA = 0;
            goto copyA;
        }
        normExpSig = softfloat_normSubnormalExtF80Sig( sigA );
        expA += normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Reduction.  `rem' and `shiftedSigB' hold the significands shifted left
    | by 32 in 128-bit form.
    |   - expDiff < -1: `a' is returned unchanged through copyA.
    |   - expDiff == -1: `b' is rescaled (shift of 33, expB reduced by one)
    |     and the quotient starts at 0.
    |   - expDiff == 0: the quotient is 1 when sigB <= sigA, and sigB is then
    |     subtracted once.
    |   - expDiff >= 1: quotient chunks are estimated from `recip32' and
    |     removed 29 bits per pass until `expDiff' (pre-decremented by 30)
    |     goes negative; the last partial chunk is applied after the loop.
    | NOTE(review): softfloat_approxRecip32_1 is defined elsewhere; it is
    | presumably an approximate reciprocal of the top 32 bits of `sigB'.
    *------------------------------------------------------------------------*/
    expDiff = expA - expB;
    if ( expDiff < -1 ) goto copyA;
    rem = softfloat_shortShiftLeft128( 0, sigA, 32 );
    shiftedSigB = softfloat_shortShiftLeft128( 0, sigB, 32 );
    if ( expDiff < 1 ) {
        if ( expDiff ) {
            --expB;
            shiftedSigB = softfloat_shortShiftLeft128( 0, sigB, 33 );
            q = 0;
        } else {
            q = (sigB <= sigA);
            if ( q ) {
                rem =
                    softfloat_sub128(
                        rem.v64, rem.v0, shiftedSigB.v64, shiftedSigB.v0 );
            }
        }
    } else {
        recip32 = softfloat_approxRecip32_1( sigB>>32 );
        expDiff -= 30;
        for (;;) {
            q64 = (uint_fast64_t) (uint32_t) (rem.v64>>2) * recip32;
            if ( expDiff < 0 ) break;
            q = (q64 + 0x80000000)>>32;
            rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
            term = softfloat_mul64ByShifted32To128( sigB, q );
            rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
            /* A negative remainder is corrected by adding sigB back once. */
            if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
                rem =
                    softfloat_add128(
                        rem.v64, rem.v0, shiftedSigB.v64, shiftedSigB.v0 );
            }
            expDiff -= 29;
        }
        /*--------------------------------------------------------------------
        | (`expDiff' cannot be less than -29 here.)
        *--------------------------------------------------------------------*/
        q = (uint32_t) (q64>>32)>>(~expDiff & 31);
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, expDiff + 30 );
        term = softfloat_mul64ByShifted32To128( sigB, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            /* Already negative: the alternative candidate is rem + sigB. */
            altRem =
                softfloat_add128(
                    rem.v64, rem.v0, shiftedSigB.v64, shiftedSigB.v0 );
            goto selectRem;
        }
    }
    /*------------------------------------------------------------------------
    | Keep subtracting sigB (counting in `q') until the remainder turns
    | negative; `altRem' keeps the last value before the final subtraction.
    *------------------------------------------------------------------------*/
    do {
        altRem = rem;
        ++q;
        rem =
            softfloat_sub128(
                rem.v64, rem.v0, shiftedSigB.v64, shiftedSigB.v0 );
    } while ( ! (rem.v64 & UINT64_C( 0x8000000000000000 )) );
    /*------------------------------------------------------------------------
    | Choose between `rem' and `altRem' using their sum: `altRem' wins when
    | the sum is negative, or when the sum is zero and `q' is odd.
    | NOTE(review): this looks like selecting the remainder of smaller
    | magnitude with ties resolved toward an even quotient -- confirm.
    | A negative selected remainder is negated and flips the result sign.
    *------------------------------------------------------------------------*/
 selectRem:
    meanRem = softfloat_add128( rem.v64, rem.v0, altRem.v64, altRem.v0 );
    if (
        (meanRem.v64 & UINT64_C( 0x8000000000000000 ))
            || (! (meanRem.v64 | meanRem.v0) && (q & 1))
    ) {
        rem = altRem;
    }
    signRem = signA;
    if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
        signRem = ! signRem;
        rem = softfloat_sub128( 0, 0, rem.v64, rem.v0 );
    }
    /* A zero remainder is packed with exponent 0 instead of expB + 32. */
    return
        softfloat_normRoundPackToExtF80(
            state, signRem, rem.v64 | rem.v0 ? expB + 32 : 0, rem.v64, rem.v0, 80 );
    /*------------------------------------------------------------------------
    | NaN operand: result is chosen by softfloat_propagateNaNExtF80UI.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, uiB64, uiB0 );
    uiZ64 = uiZ.v64;
    uiZ0  = uiZ.v0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Invalid operation: raise the flag and return the default NaN pattern.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ64 = defaultNaNExtF80UI64;
    uiZ0  = defaultNaNExtF80UI0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Return `a'.  If normalization pushed the exponent below 1, the
    | significand is shifted back right and the exponent field set to 0.
    *------------------------------------------------------------------------*/
 copyA:
    if ( expA < 1 ) {
        sigA >>= 1 - expA;
        expA = 0;
    }
    uiZ64 = packToExtF80UI64( signA, expA );
    uiZ0  = sigA;
 uiZ:
    uZ.s.signExp = uiZ64;
    uZ.s.signif  = uiZ0;
    return uZ.f;
}

================================================
FILE: External/SoftFloat-3e/src/extF80_roundToInt.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2017 The Regents of the University of
California. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions, and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions, and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

 3.
Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=============================================================================*/

#include
#include
#include "platform.h"
#include "internals.h"
#include "specialize.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Rounds the 80-bit extended value `a' to an integral value in the same
| format.
|
| `roundingMode' is compared against the softfloat_round_* constants and
| selects how discarded significand bits are handled; it is taken from the
| argument, not from `state'.  When `exact' is true, softfloat_flag_inexact
| is OR-ed into `state->exceptionFlags' whenever the returned significand
| differs from the input (and always on the exp <= 0x3FFE path).  A NaN
| input is returned through softfloat_propagateNaNExtF80UI.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t extF80_roundToInt( struct softfloat_state *state, extFloat80_t a, uint_fast8_t roundingMode, bool exact )
{
    union { struct extFloat80M s; extFloat80_t f; } uA;
    uint_fast16_t uiA64, signUI64;
    int_fast32_t exp;
    uint_fast64_t sigA;
    uint_fast16_t uiZ64;
    uint_fast64_t sigZ;
    struct exp32_sig64 normExpSig;
    struct uint128 uiZ;
    uint_fast64_t lastBitMask, roundBitsMask;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Unpack.  `signUI64' keeps only the sign bit of the sign/exponent word,
    | already in position for OR-ing into a result word.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.s.signExp;
    signUI64 = uiA64 & packToExtF80UI64( 1, 0 );
    exp = expExtF80UI64( uiA64 );
    sigA = uA.s.signif;
    /*------------------------------------------------------------------------
    | Top significand bit clear and exponent not all ones: a zero significand
    | returns a signed zero immediately; anything else is renormalized
    | through softfloat_normSubnormalExtF80Sig.
    *------------------------------------------------------------------------*/
    if ( !(sigA & UINT64_C( 0x8000000000000000 )) && (exp != 0x7FFF) ) {
        if ( !sigA ) {
            uiZ64 = signUI64;
            sigZ = 0;
            goto uiZ;
        }
        normExpSig = softfloat_normSubnormalExtF80Sig( sigA );
        exp += normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Exponent 0x403E or above: the general path below would compute
    | lastBitMask = 1<<0 at 0x403E, i.e. no bits to round away, so the
    | significand is returned as is.  Exponent 0x7FFF is split into the NaN
    | case and the remaining case, which is returned with only the top
    | significand bit set.
    *------------------------------------------------------------------------*/
    if ( 0x403E <= exp ) {
        if ( exp == 0x7FFF ) {
            if ( sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) {
                uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, sigA, 0, 0 );
                uiZ64 = uiZ.v64;
                sigZ  = uiZ.v0;
                goto uiZ;
            }
            sigZ = UINT64_C( 0x8000000000000000 );
        } else {
            sigZ = sigA;
        }
        uiZ64 = signUI64 | exp;
        goto uiZ;
    }
    /*------------------------------------------------------------------------
    | Exponent 0x3FFE or below.  The result is either a signed zero or, via
    | mag1, exponent 0x3FFF with only the top significand bit set.
    | NOTE(review): this corresponds to inputs of magnitude below one being
    | rounded to 0 or +/-1 -- confirm against the format's exponent bias.
    *------------------------------------------------------------------------*/
    if ( exp <= 0x3FFE ) {
        if ( exact ) state->exceptionFlags |= softfloat_flag_inexact;
        switch ( roundingMode ) {
         case softfloat_round_near_even:
            /* Only the top significand bit set: stay at zero; otherwise
               continue with the near_maxMag test below. */
            if ( !(sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF )) ) break;
            __attribute__((fallthrough));
         case softfloat_round_near_maxMag:
            if ( exp == 0x3FFE ) goto mag1;
            break;
         case softfloat_round_min:
            if ( signUI64 ) goto mag1;
            break;
         case softfloat_round_max:
            if ( !signUI64 ) goto mag1;
            break;
#ifdef SOFTFLOAT_ROUND_ODD
         case softfloat_round_odd:
            goto mag1;
#endif
        }
        uiZ64 = signUI64;
        sigZ = 0;
        goto uiZ;
     mag1:
        uiZ64 = signUI64 | 0x3FFF;
        sigZ = UINT64_C( 0x8000000000000000 );
        goto uiZ;
    }
    /*------------------------------------------------------------------------
    | General case.  `lastBitMask' is the lowest significand bit that is
    | kept and `roundBitsMask' covers every bit below it.  The nearest modes
    | add half of `lastBitMask'; near_even additionally clears the kept bit
    | when the discarded bits end up all zero.  round_min on a set sign bit
    | and round_max on a clear sign bit add `roundBitsMask'.  All other
    | modes simply drop the low bits.
    *------------------------------------------------------------------------*/
    uiZ64 = signUI64 | exp;
    lastBitMask = (uint_fast64_t) 1<<(0x403E - exp);
    roundBitsMask = lastBitMask - 1;
    sigZ = sigA;
    if ( roundingMode == softfloat_round_near_maxMag ) {
        sigZ += lastBitMask>>1;
    } else if ( roundingMode == softfloat_round_near_even ) {
        sigZ += lastBitMask>>1;
        if ( !(sigZ & roundBitsMask) ) sigZ &= ~lastBitMask;
    } else if (
        roundingMode == (signUI64 ? softfloat_round_min : softfloat_round_max)
    ) {
        sigZ += roundBitsMask;
    }
    sigZ &= ~roundBitsMask;
    /* The addition wrapped the significand to zero: step the sign/exponent
       word up by one and restore the top significand bit. */
    if ( !sigZ ) {
        ++uiZ64;
        sigZ = UINT64_C( 0x8000000000000000 );
    }
    if ( sigZ != sigA ) {
#ifdef SOFTFLOAT_ROUND_ODD
        if ( roundingMode == softfloat_round_odd ) sigZ |= lastBitMask;
#endif
        if ( exact ) state->exceptionFlags |= softfloat_flag_inexact;
    }
    /* Shared exit: repack the result words. */
 uiZ:
    uZ.s.signExp = uiZ64;
    uZ.s.signif  = sigZ;
    return uZ.f;
}

================================================
FILE: External/SoftFloat-3e/src/extF80_sqrt.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
University of California. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions, and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions, and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

 3. Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h"

/*----------------------------------------------------------------------------
| Square root of the 80-bit extended value `a'.
|
| `state' receives the invalid flag raised on the `invalid' path, is passed
| to `softfloat_propagateNaNExtF80UI', and supplies the rounding precision
| used for the final rounding.
|
| Special operands are answered without running the iteration: see the
| comments on the individual early exits below.  All other operands go
| through an approximation seeded by `softfloat_approxRecipSqrt32_1' and are
| rounded by `softfloat_roundPackToExtF80'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR extFloat80_t extF80_sqrt( struct softfloat_state *state, extFloat80_t a )
{
    union { struct extFloat80M s; extFloat80_t f; } uA;
    uint_fast16_t uiA64;
    uint_fast64_t uiA0;
    bool signA;
    int_fast32_t expA;
    uint_fast64_t sigA;
    struct uint128 uiZ;
    uint_fast16_t uiZ64;
    uint_fast64_t uiZ0;
    struct exp32_sig64 normExpSig;
    int_fast32_t expZ;
    uint_fast32_t sig32A, recipSqrt32, sig32Z;
    struct uint128 rem;
    uint_fast64_t q, x64, sigZ;
    struct uint128 y, term;
    uint_fast64_t sigZExtra;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Split the operand into its sign, exponent field and 64-bit significand.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.s.signExp;
    uiA0 = uA.s.signif;
    signA = signExtF80UI64( uiA64 );
    expA = expExtF80UI64( uiA64 );
    sigA = uiA0;
    /*------------------------------------------------------------------------
    | Exponent field 0x7FFF.  If any of the low 63 significand bits is set the
    | operand goes through `softfloat_propagateNaNExtF80UI'.  Otherwise the
    | operand is returned unchanged when its sign is clear, and the `invalid'
    | path is taken when its sign is set.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if ( sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) {
            uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, 0, 0 );
            uiZ64 = uiZ.v64;
            uiZ0 = uiZ.v0;
            goto uiZ;
        }
        if ( ! signA ) return a;
        goto invalid;
    }
    /*------------------------------------------------------------------------
    | Sign set: a zero significand yields a zero that keeps the sign (see the
    | `zero' label); any other significand is invalid.
    *------------------------------------------------------------------------*/
    if ( signA ) {
        if ( ! sigA ) goto zero;
        goto invalid;
    }
    /*------------------------------------------------------------------------
    | A zero exponent field is handled as exponent 1.  When the top significand
    | bit is clear, a zero significand yields zero and anything else is
    | renormalized by `softfloat_normSubnormalExtF80Sig', whose exponent
    | adjustment is added to `expA'.
    *------------------------------------------------------------------------*/
    if ( ! expA ) expA = 1;
    if ( ! (sigA & UINT64_C( 0x8000000000000000 )) ) {
        if ( ! sigA ) goto zero;
        normExpSig = softfloat_normSubnormalExtF80Sig( sigA );
        expA += normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | `expZ' is (expA - 0x3FFF) shifted right by one, re-offset by 0x3FFF.
    | `expA' is then reduced to its lowest bit, which is handed to the
    | reciprocal-square-root approximation and selects the 61- versus 62-bit
    | left shift used to form the initial remainder.
    | (`sig32Z' is guaranteed to be a lower bound on the square root of
    | `sig32A', which makes `sig32Z' also a lower bound on the square root of
    | `sigA'.)
    *------------------------------------------------------------------------*/
    expZ = ((expA - 0x3FFF)>>1) + 0x3FFF;
    expA &= 1;
    sig32A = sigA>>32;
    recipSqrt32 = softfloat_approxRecipSqrt32_1( expA, sig32A );
    sig32Z = ((uint_fast64_t) sig32A * recipSqrt32)>>32;
    if ( expA ) {
        sig32Z >>= 1;
        rem = softfloat_shortShiftLeft128( 0, sigA, 61 );
    } else {
        rem = softfloat_shortShiftLeft128( 0, sigA, 62 );
    }
    rem.v64 -= (uint_fast64_t) sig32Z * sig32Z;
    /*------------------------------------------------------------------------
    | Second step: derive the next quotient estimate `q' from the remainder
    | and `recipSqrt32', and extend the running result `sigZ' with it.  `y' is
    | the remainder shifted left by 29, kept for the correction loop below.
    *------------------------------------------------------------------------*/
    q = ((uint32_t) (rem.v64>>2) * (uint_fast64_t) recipSqrt32)>>32;
    x64 = (uint_fast64_t) sig32Z<<32;
    sigZ = x64 + (q<<3);
    y = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
    /*------------------------------------------------------------------------
    | Recompute the remainder for the current `q'; while its top bit is set,
    | step `q' down by one and `sigZ' down by 1<<3 and try again.
    | (Repeating this loop is a rare occurrence.)
    *------------------------------------------------------------------------*/
    for (;;) {
        term = softfloat_mul64ByShifted32To128( x64 + sigZ, q );
        rem = softfloat_sub128( y.v64, y.v0, term.v64, term.v0 );
        if ( ! (rem.v64 & UINT64_C( 0x8000000000000000 )) ) break;
        --q;
        sigZ -= 1<<3;
    }
    /*------------------------------------------------------------------------
    | Third step: a further estimate `q' (with 2 added) is folded into `sigZ',
    | and the bits of `q' shifted left by 39 become the extra word
    | `sigZExtra'.
    *------------------------------------------------------------------------*/
    q = (((rem.v64>>2) * recipSqrt32)>>32) + 2;
    x64 = sigZ;
    sigZ = (sigZ<<1) + (q>>25);
    sigZExtra = (uint64_t) (q<<39);
    /*------------------------------------------------------------------------
    | Only when the low 24 bits of `q' are at most 2: clear the low 16 bits of
    | `q', rebuild `sigZExtra', and recompute the remainder.  A remainder with
    | its top bit set steps the (sigZ, sigZExtra) pair down by one; any other
    | nonzero remainder sets the lowest bit of `sigZExtra'.
    *------------------------------------------------------------------------*/
    if ( (q & 0xFFFFFF) <= 2 ) {
        q &= ~(uint_fast64_t) 0xFFFF;
        sigZExtra = (uint64_t) (q<<39);
        term = softfloat_mul64ByShifted32To128( x64 + (q>>27), q );
        x64 = (uint32_t) (q<<5) * (uint_fast64_t) (uint32_t) q;
        term = softfloat_add128( term.v64, term.v0, 0, x64 );
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 28 );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            if ( ! sigZExtra ) --sigZ;
            --sigZExtra;
        } else {
            if ( rem.v64 | rem.v0 ) sigZExtra |= 1;
        }
    }
    /* The result sign passed to the rounding routine is always 0 here. */
    return
        softfloat_roundPackToExtF80(
            state, 0, expZ, sigZ, sigZExtra, state->roundingPrecision );
    /*------------------------------------------------------------------------
    | Invalid operand: raise the invalid flag and produce the default NaN.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ64 = defaultNaNExtF80UI64;
    uiZ0 = defaultNaNExtF80UI0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Zero result: the operand's sign with a zero exponent field and a zero
    | significand.
    *------------------------------------------------------------------------*/
 zero:
    uiZ64 = packToExtF80UI64( signA, 0 );
    uiZ0 = 0;
    /* Common exit: reassemble the result from its two words. */
 uiZ:
    uZ.s.signExp = uiZ64;
    uZ.s.signif = uiZ0;
    return uZ.f;
}

================================================ FILE: External/SoftFloat-3e/src/extF80_sub.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser.
Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t extF80_sub( struct softfloat_state *state, extFloat80_t a, extFloat80_t b ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; bool signA; union { struct extFloat80M s; extFloat80_t f; } uB; uint_fast16_t uiB64; uint_fast64_t uiB0; bool signB; #if ! defined INLINE_LEVEL || (INLINE_LEVEL < 2) extFloat80_t (*magsFuncPtr)( struct softfloat_state *, uint_fast16_t, uint_fast64_t, uint_fast16_t, uint_fast64_t, bool ); #endif uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; signA = signExtF80UI64( uiA64 ); uB.f = b; uiB64 = uB.s.signExp; uiB0 = uB.s.signif; signB = signExtF80UI64( uiB64 ); #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) if ( signA == signB ) { return softfloat_subMagsExtF80( state, uiA64, uiA0, uiB64, uiB0, signA ); } else { return softfloat_addMagsExtF80( state, uiA64, uiA0, uiB64, uiB0, signA ); } #else magsFuncPtr = (signA == signB) ? softfloat_subMagsExtF80 : softfloat_addMagsExtF80; return (*magsFuncPtr)( state, uiA64, uiA0, uiB64, uiB0, signA ); #endif } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_f128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR float128_t extF80_to_f128( struct softfloat_state *state, extFloat80_t a ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; uint_fast16_t exp; uint_fast64_t frac; struct commonNaN commonNaN; struct uint128 uiZ; bool sign; struct uint128 frac128; union ui128_f128 uZ; uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; exp = expExtF80UI64( uiA64 ); frac = uiA0 & UINT64_C( 0x7FFFFFFFFFFFFFFF ); if ( (exp == 0x7FFF) && frac ) { softfloat_extF80UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToF128UI( &commonNaN ); } else { sign = signExtF80UI64( uiA64 ); frac128 = softfloat_shortShiftLeft128( 0, frac, 49 ); uiZ.v64 = packToF128UI64( sign, exp, frac128.v64 ); uiZ.v0 = frac128.v0; } uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_f32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR float32_t extF80_to_f32( struct softfloat_state *state, extFloat80_t a ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig; struct commonNaN commonNaN; uint_fast32_t uiZ, sig32; union ui32_f32 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; sign = signExtF80UI64( uiA64 ); exp = expExtF80UI64( uiA64 ); sig = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( sig & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) { softfloat_extF80UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = 
softfloat_commonNaNToF32UI( &commonNaN ); } else { uiZ = packToF32UI( sign, 0xFF, 0 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ sig32 = softfloat_shortShiftRightJam64( sig, 33 ); if ( ! (exp | sig32) ) { uiZ = packToF32UI( sign, 0, 0 ); goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ exp -= 0x3F81; if ( sizeof (int_fast16_t) < sizeof (int_fast32_t) ) { if ( exp < -0x1000 ) exp = -0x1000; } return softfloat_roundPackToF32( state, sign, exp, sig32 ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_f64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR float64_t extF80_to_f64( struct softfloat_state *state, extFloat80_t a ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; uint_fast64_t uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig; struct commonNaN commonNaN; uint_fast64_t uiZ; union ui64_f64 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.s.signExp; uiA0 = uA.s.signif; sign = signExtF80UI64( uiA64 ); exp = expExtF80UI64( uiA64 ); sig = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( ! 
(exp | sig) ) { uiZ = packToF64UI( sign, 0, 0 ); goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( sig & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) { softfloat_extF80UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToF64UI( &commonNaN ); } else { uiZ = packToF64UI( sign, 0x7FF, 0 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ sig = softfloat_shortShiftRightJam64( sig, 1 ); exp -= 0x3C01; if ( sizeof (int_fast16_t) < sizeof (int_fast32_t) ) { if ( exp < -0x1000 ) exp = -0x1000; } return softfloat_roundPackToF64( state, sign, exp, sig ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_i32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR int_fast32_t extF80_to_i32( struct softfloat_state *state, extFloat80_t a, uint_fast8_t roundingMode, bool exact ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; bool sign; int_fast32_t exp; uint_fast64_t sig; int_fast32_t shiftDist; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.s.signExp; sign = signExtF80UI64( uiA64 ); exp = expExtF80UI64( uiA64 ); sig = uA.s.signif; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ #if (i32_fromNaN != i32_fromPosOverflow) || (i32_fromNaN != i32_fromNegOverflow) if ( (exp == 0x7FFF) && (sig & UINT64_C( 0x7FFFFFFFFFFFFFFF )) ) { #if (i32_fromNaN == 
i32_fromPosOverflow) sign = 0; #elif (i32_fromNaN == i32_fromNegOverflow) sign = 1; #else softfloat_raiseFlags( state, softfloat_flag_invalid ); return i32_fromNaN; #endif } #endif /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftDist = 0x4032 - exp; if ( shiftDist <= 0 ) shiftDist = 1; sig = softfloat_shiftRightJam64( sig, shiftDist ); return softfloat_roundToI32( state, sign, sig, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_i64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR int_fast64_t extF80_to_i64( struct softfloat_state *state, extFloat80_t a, uint_fast8_t roundingMode, bool exact ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; bool sign; int_fast32_t exp; uint_fast64_t sig; int_fast32_t shiftDist; uint_fast64_t sigExtra; struct uint64_extra sig64Extra; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.s.signExp; sign = signExtF80UI64( uiA64 ); exp = expExtF80UI64( uiA64 ); sig = uA.s.signif; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftDist = 0x403E - exp; if ( shiftDist <= 0 ) { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ if ( shiftDist ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return (exp == 0x7FFF) && (sig & UINT64_C( 0x7FFFFFFFFFFFFFFF )) ? i64_fromNaN : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; } /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ sigExtra = 0; } else { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ sig64Extra = softfloat_shiftRightJam64Extra( sig, 0, shiftDist ); sig = sig64Extra.v; sigExtra = sig64Extra.extra; } return softfloat_roundToI64( state, sign, sig, sigExtra, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/extF80_to_ui64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR uint_fast64_t extF80_to_ui64( struct softfloat_state *state, extFloat80_t a, uint_fast8_t roundingMode, bool exact ) { union { struct extFloat80M s; extFloat80_t f; } uA; uint_fast16_t uiA64; bool sign; int_fast32_t exp; uint_fast64_t sig; int_fast32_t shiftDist; uint_fast64_t sigExtra; struct uint64_extra sig64Extra; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.s.signExp; sign = signExtF80UI64( uiA64 ); exp = expExtF80UI64( uiA64 ); sig = uA.s.signif; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftDist = 0x403E - exp; if ( shiftDist < 0 ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return (exp == 0x7FFF) && (sig & UINT64_C( 0x7FFFFFFFFFFFFFFF )) ? ui64_fromNaN : sign ? 
ui64_fromNegOverflow : ui64_fromPosOverflow; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ sigExtra = 0; if ( shiftDist ) { sig64Extra = softfloat_shiftRightJam64Extra( sig, 0, shiftDist ); sig = sig64Extra.v; sigExtra = sig64Extra.extra; } return softfloat_roundToUI64( state, sign, sig, sigExtra, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/f128_add.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" float128_t f128_add( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool signA; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signB; #if ! defined INLINE_LEVEL || (INLINE_LEVEL < 2) float128_t (*magsFuncPtr)( uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, bool ); #endif uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; signA = signF128UI64( uiA64 ); uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; signB = signF128UI64( uiB64 ); #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) if ( signA == signB ) { return softfloat_addMagsF128( state, uiA64, uiA0, uiB64, uiB0, signA ); } else { return softfloat_subMagsF128( state, uiA64, uiA0, uiB64, uiB0, signA ); } #else magsFuncPtr = (signA == signB) ? softfloat_addMagsF128 : softfloat_subMagsF128; return (*magsFuncPtr)( uiA64, uiA0, uiB64, uiB0, signA ); #endif } ================================================ FILE: External/SoftFloat-3e/src/f128_div.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h"
/*----------------------------------------------------------------------------
| f128_div: divides the 128-bit floating-point value `a' by `b'.
|
| The operands are split into sign, exponent and significand.  Operands whose
| exponent field is 0x7FFF or zero are dispatched to the labelled exits at the
| bottom (propagateNaN, invalid, infinity, zero).  Otherwise a quotient is
| built in pieces (qs[2], qs[1], qs[0] and a final `q'), assembled into a
| 128-bit significand plus an extra word, and handed to
| softfloat_roundPackToF128 together with `state'.
|
| Flags raised through `state' in this function: softfloat_flag_infinite when
| a non-zero `a' is divided by a zero `b', and softfloat_flag_invalid on the
| `invalid' exit.
*----------------------------------------------------------------------------*/
float128_t f128_div( struct softfloat_state *state, float128_t a, float128_t b )
{
    union ui128_f128 uA;
    uint_fast64_t uiA64, uiA0;
    bool signA;
    int_fast32_t expA;
    struct uint128 sigA;
    union ui128_f128 uB;
    uint_fast64_t uiB64, uiB0;
    bool signB;
    int_fast32_t expB;
    struct uint128 sigB;
    bool signZ;
    struct exp32_sig128 normExpSig;
    int_fast32_t expZ;
    struct uint128 rem;
    uint_fast32_t recip32;
    int ix;
    uint_fast64_t q64;
    uint_fast32_t q;
    struct uint128 term;
    uint_fast32_t qs[3];
    uint_fast64_t sigZExtra;
    struct uint128 sigZ, uiZ;
    union ui128_f128 uZ;

    /*------------------------------------------------------------------------
    | Unpack both operands into sign / exponent / significand.  The result
    | sign is the XOR of the operand signs.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.ui.v64;
    uiA0  = uA.ui.v0;
    signA = signF128UI64( uiA64 );
    expA  = expF128UI64( uiA64 );
    sigA.v64 = fracF128UI64( uiA64 );
    sigA.v0  = uiA0;
    uB.f = b;
    uiB64 = uB.ui.v64;
    uiB0  = uB.ui.v0;
    signB = signF128UI64( uiB64 );
    expB  = expF128UI64( uiB64 );
    sigB.v64 = fracF128UI64( uiB64 );
    sigB.v0  = uiB0;
    signZ = signA ^ signB;
    /*------------------------------------------------------------------------
    | Exponent field 0x7FFF.  A non-zero significand on either such operand
    | goes to propagateNaN.  With zero significands: both operands at 0x7FFF
    | is `invalid', only `a' gives `infinity', only `b' gives `zero'.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if ( sigA.v64 | sigA.v0 ) goto propagateNaN;
        if ( expB == 0x7FFF ) {
            if ( sigB.v64 | sigB.v0 ) goto propagateNaN;
            goto invalid;
        }
        goto infinity;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB.v64 | sigB.v0 ) goto propagateNaN;
        goto zero;
    }
    /*------------------------------------------------------------------------
    | Exponent field zero.  A completely zero `b' is a division by zero:
    | `invalid' if `a' is zero as well, otherwise softfloat_flag_infinite is
    | raised and the result is infinity.  A zero `a' yields zero.  Any other
    | zero-exponent operand is renormalized with
    | softfloat_normSubnormalF128Sig.  `b' is examined first, so 0/0 is
    | reported as invalid rather than returning zero.
    *------------------------------------------------------------------------*/
    if ( ! expB ) {
        if ( ! (sigB.v64 | sigB.v0) ) {
            if ( ! (expA | sigA.v64 | sigA.v0) ) goto invalid;
            softfloat_raiseFlags( state, softfloat_flag_infinite );
            goto infinity;
        }
        normExpSig = softfloat_normSubnormalF128Sig( sigB.v64, sigB.v0 );
        expB = normExpSig.exp;
        sigB = normExpSig.sig;
    }
    if ( ! expA ) {
        if ( ! (sigA.v64 | sigA.v0) ) goto zero;
        normExpSig = softfloat_normSubnormalF128Sig( sigA.v64, sigA.v0 );
        expA = normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Set bit 48 of the high word in both significands (presumably the
    | implicit leading 1 -- confirm against fracF128UI64).  If sigA < sigB the
    | starting remainder is doubled and the exponent lowered by one.
    |
    | The loop computes `q' four times.  The first three passes (ix = 2, 1, 0)
    | shift the remainder left by 29 bits, subtract q * sigB, step `q' down by
    | one if the remainder went negative (top bit set), and store the digit in
    | qs[ix].  The fourth pass only produces an estimate and leaves through
    | the break, so the final `q' is not yet checked against the remainder.
    *------------------------------------------------------------------------*/
    expZ = expA - expB + 0x3FFE;
    sigA.v64 |= UINT64_C( 0x0001000000000000 );
    sigB.v64 |= UINT64_C( 0x0001000000000000 );
    rem = sigA;
    if ( softfloat_lt128( sigA.v64, sigA.v0, sigB.v64, sigB.v0 ) ) {
        --expZ;
        rem = softfloat_add128( sigA.v64, sigA.v0, sigA.v64, sigA.v0 );
    }
    recip32 = softfloat_approxRecip32_1( sigB.v64>>17 );
    ix = 3;
    for (;;) {
        q64 = (uint_fast64_t) (uint32_t) (rem.v64>>19) * recip32;
        q = (q64 + 0x80000000)>>32;
        --ix;
        if ( ix < 0 ) break;
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
        term = softfloat_mul128By32( sigB.v64, sigB.v0, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            --q;
            rem = softfloat_add128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
        }
        qs[ix] = q;
    }
    /*------------------------------------------------------------------------
    | The final estimate is checked against the remainder only when its low
    | three bits are 0 or 7 ((q + 1) & 7 is 0 or 1).  In that case `q' is
    | adjusted down or up by one as the remainder requires, and its lowest
    | bit is forced to 1 if any remainder is left.
    *------------------------------------------------------------------------*/
    if ( ((q + 1) & 7) < 2 ) {
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
        term = softfloat_mul128By32( sigB.v64, sigB.v0, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            --q;
            rem = softfloat_add128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
        } else if ( softfloat_le128( sigB.v64, sigB.v0, rem.v64, rem.v0 ) ) {
            ++q;
            rem = softfloat_sub128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
        }
        if ( rem.v64 | rem.v0 ) q |= 1;
    }
    /*------------------------------------------------------------------------
    | Assemble the result significand.  The low four bits of `q' become the
    | top of the extra word; the rest of `q' and the three stored digits are
    | combined into the 128-bit significand.
    *------------------------------------------------------------------------*/
    sigZExtra = (uint64_t) ((uint_fast64_t) q<<60);
    term = softfloat_shortShiftLeft128( 0, qs[1], 54 );
    sigZ =
        softfloat_add128(
            (uint_fast64_t) qs[2]<<19, ((uint_fast64_t) qs[0]<<25) + (q>>4),
            term.v64, term.v0
        );
    return
        softfloat_roundPackToF128( state, signZ, expZ, sigZ.v64, sigZ.v0, sigZExtra );
    /*------------------------------------------------------------------------
    | Exit: result chosen by softfloat_propagateNaNF128UI from both operands.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, uiB64, uiB0 );
    goto uiZ;
    /*------------------------------------------------------------------------
    | Exit: raise softfloat_flag_invalid and return the default NaN pattern.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ.v64 = defaultNaNF128UI64;
    uiZ.v0  = defaultNaNF128UI0;
    goto uiZ;
    /*------------------------------------------------------------------------
    | Exit: exponent 0x7FFF with zero significand, carrying the result sign.
    *------------------------------------------------------------------------*/
 infinity:
    uiZ.v64 = packToF128UI64( signZ, 0x7FFF, 0 );
    goto uiZ0;
    /*------------------------------------------------------------------------
    | Exit: zero with the result sign.  uiZ0 clears the low word; uiZ converts
    | the raw pattern back to float128_t.
    *------------------------------------------------------------------------*/
 zero:
    uiZ.v64 = packToF128UI64( signZ, 0, 0 );
 uiZ0:
    uiZ.v0 = 0;
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;
}
================================================ FILE: External/SoftFloat-3e/src/f128_eq.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" bool f128_eq( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { if ( softfloat_isSigNaNF128UI( uiA64, uiA0 ) || softfloat_isSigNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } return false; } return (uiA0 == uiB0) && ( (uiA64 == uiB64) || (! uiA0 && ! ((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF ))) ); } ================================================ FILE: External/SoftFloat-3e/src/f128_eq_signaling.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" bool f128_eq_signaling( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return false; } return (uiA0 == uiB0) && ( (uiA64 == uiB64) || (! uiA0 && ! ((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF ))) ); } ================================================ FILE: External/SoftFloat-3e/src/f128_isSignalingNaN.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" bool f128_isSignalingNaN( float128_t a ) { union ui128_f128 uA; uA.f = a; return softfloat_isSigNaNF128UI( uA.ui.v64, uA.ui.v0 ); } ================================================ FILE: External/SoftFloat-3e/src/f128_le.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" bool f128_le( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signA, signB; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return false; } signA = signF128UI64( uiA64 ); signB = signF128UI64( uiB64 ); return (signA != signB) ? signA || ! (((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) | uiA0 | uiB0) : ((uiA64 == uiB64) && (uiA0 == uiB0)) || (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/f128_le_quiet.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" bool f128_le_quiet( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signA, signB; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { if ( softfloat_isSigNaNF128UI( uiA64, uiA0 ) || softfloat_isSigNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } return false; } signA = signF128UI64( uiA64 ); signB = signF128UI64( uiB64 ); return (signA != signB) ? signA || ! (((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) | uiA0 | uiB0) : ((uiA64 == uiB64) && (uiA0 == uiB0)) || (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/f128_lt.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" bool f128_lt( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signA, signB; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return false; } signA = signF128UI64( uiA64 ); signB = signF128UI64( uiB64 ); return (signA != signB) ? 
signA && (((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) | uiA0 | uiB0) : ((uiA64 != uiB64) || (uiA0 != uiB0)) && (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/f128_lt_quiet.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" bool f128_lt_quiet( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signA, signB; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; if ( isNaNF128UI( uiA64, uiA0 ) || isNaNF128UI( uiB64, uiB0 ) ) { if ( softfloat_isSigNaNF128UI( uiA64, uiA0 ) || softfloat_isSigNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } return false; } signA = signF128UI64( uiA64 ); signB = signF128UI64( uiB64 ); return (signA != signB) ? signA && (((uiA64 | uiB64) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) | uiA0 | uiB0) : ((uiA64 != uiB64) || (uiA0 != uiB0)) && (signA ^ softfloat_lt128( uiA64, uiA0, uiB64, uiB0 )); } ================================================ FILE: External/SoftFloat-3e/src/f128_mul.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h"
/*----------------------------------------------------------------------------
| f128_mul: multiplies the 128-bit floating-point values `a' and `b'.
|
| The operands are split into sign, exponent and significand.  Operands whose
| exponent field is 0x7FFF or zero are dispatched to the labelled exits at the
| bottom (propagateNaN, infArg, zero).  Otherwise the significands are
| multiplied into a 256-bit product, reduced to a 128-bit significand plus an
| extra word, and handed to softfloat_roundPackToF128 together with `state'.
|
| The only flag raised directly in this function is softfloat_flag_invalid,
| on the infArg exit when the other operand is zero.
*----------------------------------------------------------------------------*/
float128_t f128_mul( struct softfloat_state *state, float128_t a, float128_t b )
{
    union ui128_f128 uA;
    uint_fast64_t uiA64, uiA0;
    bool signA;
    int_fast32_t expA;
    struct uint128 sigA;
    union ui128_f128 uB;
    uint_fast64_t uiB64, uiB0;
    bool signB;
    int_fast32_t expB;
    struct uint128 sigB;
    bool signZ;
    uint_fast64_t magBits;
    struct exp32_sig128 normExpSig;
    int_fast32_t expZ;
    uint64_t sig256Z[4];
    uint_fast64_t sigZExtra;
    struct uint128 sigZ;
    struct uint128_extra sig128Extra;
    struct uint128 uiZ;
    union ui128_f128 uZ;

    /*------------------------------------------------------------------------
    | Unpack both operands into sign / exponent / significand.  The result
    | sign is the XOR of the operand signs.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.ui.v64;
    uiA0  = uA.ui.v0;
    signA = signF128UI64( uiA64 );
    expA  = expF128UI64( uiA64 );
    sigA.v64 = fracF128UI64( uiA64 );
    sigA.v0  = uiA0;
    uB.f = b;
    uiB64 = uB.ui.v64;
    uiB0  = uB.ui.v0;
    signB = signF128UI64( uiB64 );
    expB  = expF128UI64( uiB64 );
    sigB.v64 = fracF128UI64( uiB64 );
    sigB.v0  = uiB0;
    signZ = signA ^ signB;
    /*------------------------------------------------------------------------
    | Exponent field 0x7FFF.  A non-zero significand on such an operand goes
    | to propagateNaN.  Otherwise magBits collects every exponent and
    | significand bit of the *other* operand, so infArg can tell whether that
    | operand is zero.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if (
            (sigA.v64 | sigA.v0) || ((expB == 0x7FFF) && (sigB.v64 | sigB.v0))
        ) {
            goto propagateNaN;
        }
        magBits = expB | sigB.v64 | sigB.v0;
        goto infArg;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB.v64 | sigB.v0 ) goto propagateNaN;
        magBits = expA | sigA.v64 | sigA.v0;
        goto infArg;
    }
    /*------------------------------------------------------------------------
    | Exponent field zero.  A completely zero operand gives a zero result.
    | Any other zero-exponent operand is renormalized with
    | softfloat_normSubnormalF128Sig.
    *------------------------------------------------------------------------*/
    if ( ! expA ) {
        if ( ! (sigA.v64 | sigA.v0) ) goto zero;
        normExpSig = softfloat_normSubnormalF128Sig( sigA.v64, sigA.v0 );
        expA = normExpSig.exp;
        sigA = normExpSig.sig;
    }
    if ( ! expB ) {
        if ( ! (sigB.v64 | sigB.v0) ) goto zero;
        normExpSig = softfloat_normSubnormalF128Sig( sigB.v64, sigB.v0 );
        expB = normExpSig.exp;
        sigB = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | Bit 48 of the high word is set in sigA only; sigB is instead shifted
    | left by 16 without that bit.  After the 128x128 -> 256-bit multiply,
    | sigA is added to the upper 128 bits of the product (presumably this
    | supplies the contribution of sigB's leading bit that was left out of
    | the multiply -- NOTE(review): confirm).  Product word 1 becomes the
    | extra word, with word 0 folded into its lowest bit as a sticky bit.
    | If the sum reaches 0x0002000000000000 in the high word, it is shifted
    | right by one with jamming and the exponent is incremented.
    *------------------------------------------------------------------------*/
    expZ = expA + expB - 0x4000;
    sigA.v64 |= UINT64_C( 0x0001000000000000 );
    sigB = softfloat_shortShiftLeft128( sigB.v64, sigB.v0, 16 );
    softfloat_mul128To256M( sigA.v64, sigA.v0, sigB.v64, sigB.v0, sig256Z );
    sigZExtra = sig256Z[indexWord( 4, 1 )] | (sig256Z[indexWord( 4, 0 )] != 0);
    sigZ =
        softfloat_add128(
            sig256Z[indexWord( 4, 3 )], sig256Z[indexWord( 4, 2 )],
            sigA.v64, sigA.v0
        );
    if ( UINT64_C( 0x0002000000000000 ) <= sigZ.v64 ) {
        ++expZ;
        sig128Extra =
            softfloat_shortShiftRightJam128Extra(
                sigZ.v64, sigZ.v0, sigZExtra, 1 );
        sigZ = sig128Extra.v;
        sigZExtra = sig128Extra.extra;
    }
    return
        softfloat_roundPackToF128( state, signZ, expZ, sigZ.v64, sigZ.v0, sigZExtra );
    /*------------------------------------------------------------------------
    | Exit: result chosen by softfloat_propagateNaNF128UI from both operands.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, uiB64, uiB0 );
    goto uiZ;
    /*------------------------------------------------------------------------
    | Exit: one operand has exponent 0x7FFF and a zero significand.  If the
    | other operand is zero (magBits == 0), raise softfloat_flag_invalid and
    | return the default NaN pattern; otherwise return exponent 0x7FFF with a
    | zero significand and the result sign.
    *------------------------------------------------------------------------*/
 infArg:
    if ( ! magBits ) {
        softfloat_raiseFlags( state, softfloat_flag_invalid );
        uiZ.v64 = defaultNaNF128UI64;
        uiZ.v0  = defaultNaNF128UI0;
        goto uiZ;
    }
    uiZ.v64 = packToF128UI64( signZ, 0x7FFF, 0 );
    goto uiZ0;
    /*------------------------------------------------------------------------
    | Exit: zero with the result sign.  uiZ0 clears the low word; uiZ converts
    | the raw pattern back to float128_t.
    *------------------------------------------------------------------------*/
 zero:
    uiZ.v64 = packToF128UI64( signZ, 0, 0 );
 uiZ0:
    uiZ.v0 = 0;
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;
}
================================================ FILE: External/SoftFloat-3e/src/f128_mulAdd.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "internals.h" #include "softfloat.h" float128_t f128_mulAdd( struct softfloat_state *state, float128_t a, float128_t b, float128_t c ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; union ui128_f128 uC; uint_fast64_t uiC64, uiC0; uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; uC.f = c; uiC64 = uC.ui.v64; uiC0 = uC.ui.v0; return softfloat_mulAddF128( uiA64, uiA0, uiB64, uiB0, uiC64, uiC0, 0 ); } ================================================ FILE: External/SoftFloat-3e/src/f128_rem.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h"

/*----------------------------------------------------------------------------
| Computes the remainder of `a' with respect to `b' for 128-bit floats.
|  - A NaN operand is handled by `softfloat_propagateNaNF128UI'.
|  - An infinite `a' or a zero `b' raises `softfloat_flag_invalid' through
|    `state' and returns the default NaN.
|  - An infinite `b' or a zero `a' returns `a' unchanged.
| Otherwise the result is packed by `softfloat_normRoundPackToF128'.
*----------------------------------------------------------------------------*/
float128_t f128_rem( struct softfloat_state *state, float128_t a, float128_t b )
{
    union ui128_f128 uA;
    uint_fast64_t uiA64, uiA0;
    bool signA;
    int_fast32_t expA;
    struct uint128 sigA;
    union ui128_f128 uB;
    uint_fast64_t uiB64, uiB0;
    int_fast32_t expB;
    struct uint128 sigB;
    struct exp32_sig128 normExpSig;
    struct uint128 rem;
    int_fast32_t expDiff;
    uint_fast32_t q, recip32;
    uint_fast64_t q64;
    struct uint128 term, altRem, meanRem;
    bool signRem;
    struct uint128 uiZ;
    union ui128_f128 uZ;

    /*------------------------------------------------------------------------
    | Split both operands into exponent and fraction fields.  Only the sign
    | of `a' is extracted; the sign of `b' is never read.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.ui.v64;
    uiA0  = uA.ui.v0;
    signA = signF128UI64( uiA64 );
    expA  = expF128UI64( uiA64 );
    sigA.v64 = fracF128UI64( uiA64 );
    sigA.v0  = uiA0;
    uB.f = b;
    uiB64 = uB.ui.v64;
    uiB0  = uB.ui.v0;
    expB  = expF128UI64( uiB64 );
    sigB.v64 = fracF128UI64( uiB64 );
    sigB.v0  = uiB0;
    /*------------------------------------------------------------------------
    | Exponent field 0x7FFF: a nonzero fraction in either operand goes to NaN
    | propagation; otherwise `a' with this exponent is invalid, and `b' with
    | this exponent leaves `a' as the result.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if (
            (sigA.v64 | sigA.v0) || ((expB == 0x7FFF) && (sigB.v64 | sigB.v0))
        ) {
            goto propagateNaN;
        }
        goto invalid;
    }
    if ( expB == 0x7FFF ) {
        if ( sigB.v64 | sigB.v0 ) goto propagateNaN;
        return a;
    }
    /*------------------------------------------------------------------------
    | Exponent field 0: an all-zero fraction means the operand is zero (zero
    | `b' is invalid, zero `a' is returned as is); any other fraction is
    | normalized with `softfloat_normSubnormalF128Sig'.
    *------------------------------------------------------------------------*/
    if ( ! expB ) {
        if ( ! (sigB.v64 | sigB.v0) ) goto invalid;
        normExpSig = softfloat_normSubnormalF128Sig( sigB.v64, sigB.v0 );
        expB = normExpSig.exp;
        sigB = normExpSig.sig;
    }
    if ( ! expA ) {
        if ( ! (sigA.v64 | sigA.v0) ) return a;
        normExpSig = softfloat_normSubnormalF128Sig( sigA.v64, sigA.v0 );
        expA = normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | OR bit 48 into the upper word of both significands (presumably the
    | implicit leading one -- confirm) and start with `rem' equal to `sigA'.
    *------------------------------------------------------------------------*/
    sigA.v64 |= UINT64_C( 0x0001000000000000 );
    sigB.v64 |= UINT64_C( 0x0001000000000000 );
    rem = sigA;
    expDiff = expA - expB;
    if ( expDiff < 1 ) {
        /* `a' is returned unchanged when its exponent is more than one below
           that of `b'. */
        if ( expDiff < -1 ) return a;
        if ( expDiff ) {
            /* expDiff == -1: lower `expB' by one and double `sigB'. */
            --expB;
            sigB = softfloat_add128( sigB.v64, sigB.v0, sigB.v64, sigB.v0 );
            q = 0;
        } else {
            /* Equal exponents: the quotient is 1 when sigB <= rem. */
            q = softfloat_le128( sigB.v64, sigB.v0, rem.v64, rem.v0 );
            if ( q ) {
                rem = softfloat_sub128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
            }
        }
    } else {
        recip32 = softfloat_approxRecip32_1( sigB.v64>>17 );
        expDiff -= 30;
        /* Each pass estimates a quotient chunk from the top bits of `rem' and
           `recip32', shifts `rem' left by 29 bits and subtracts `q * sigB'; a
           negative result is corrected by adding `sigB' back once. */
        for (;;) {
            q64 = (uint_fast64_t) (uint32_t) (rem.v64>>19) * recip32;
            if ( expDiff < 0 ) break;
            q = (q64 + 0x80000000)>>32;
            rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
            term = softfloat_mul128By32( sigB.v64, sigB.v0, q );
            rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
            if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
                rem = softfloat_add128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
            }
            expDiff -= 29;
        }
        /*--------------------------------------------------------------------
        | (`expDiff' cannot be less than -29 here.)
        *--------------------------------------------------------------------*/
        q = (uint32_t) (q64>>32)>>(~expDiff & 31);
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, expDiff + 30 );
        term = softfloat_mul128By32( sigB.v64, sigB.v0, q );
        rem = softfloat_sub128( rem.v64, rem.v0, term.v64, term.v0 );
        if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
            /* `rem' is already negative: `altRem' is `rem' plus one `sigB',
               and the subtraction loop below is skipped. */
            altRem = softfloat_add128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
            goto selectRem;
        }
    }
    /*------------------------------------------------------------------------
    | Keep subtracting `sigB' (incrementing `q') until `rem' becomes negative;
    | `altRem' holds the value from just before the last subtraction.
    *------------------------------------------------------------------------*/
    do {
        altRem = rem;
        ++q;
        rem = softfloat_sub128( rem.v64, rem.v0, sigB.v64, sigB.v0 );
    } while ( ! (rem.v64 & UINT64_C( 0x8000000000000000 )) );
 selectRem:
    /* Choose between `rem' and `altRem': `altRem' is taken when their sum is
       negative, or when the sum is zero and `q' is odd. */
    meanRem = softfloat_add128( rem.v64, rem.v0, altRem.v64, altRem.v0 );
    if (
        (meanRem.v64 & UINT64_C( 0x8000000000000000 ))
            || (! (meanRem.v64 | meanRem.v0) && (q & 1))
    ) {
        rem = altRem;
    }
    /* A negative `rem' is negated and the result sign is flipped relative to
       the sign of `a'. */
    signRem = signA;
    if ( rem.v64 & UINT64_C( 0x8000000000000000 ) ) {
        signRem = ! signRem;
        rem = softfloat_sub128( 0, 0, rem.v64, rem.v0 );
    }
    return
        softfloat_normRoundPackToF128(
            state, signRem, expB - 1, rem.v64, rem.v0 );
    /*------------------------------------------------------------------------
    | NaN operand: let the specialization helper build the result.
    *------------------------------------------------------------------------*/
 propagateNaN:
    uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, uiB64, uiB0 );
    goto uiZ;
    /*------------------------------------------------------------------------
    | Invalid operation: raise the flag and return the default NaN pattern.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ.v64 = defaultNaNF128UI64;
    uiZ.v0  = defaultNaNF128UI0;
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;

}

================================================ FILE: External/SoftFloat-3e/src/f128_sqrt.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h"

/*----------------------------------------------------------------------------
| Computes the square root of the 128-bit float `a'.
|  - A NaN input is handled by `softfloat_propagateNaNF128UI'.
|  - An input with exponent field 0x7FFF and zero fraction is returned as is
|    when its sign is clear, and is invalid when its sign is set.
|  - Any other input with the sign set is invalid, except one whose exponent
|    and fraction fields are all zero, which is returned as is.
|  - An input whose exponent and fraction fields are all zero is returned
|    unchanged.
| Invalid cases raise `softfloat_flag_invalid' through `state' and return the
| default NaN.  Otherwise the root is built from three 32-bit stages stored in
| `qs[2]', `qs[1]', `qs[0]' plus a final estimate `q', and packed by
| `softfloat_roundPackToF128' with a sign argument of 0.
*----------------------------------------------------------------------------*/
float128_t f128_sqrt( struct softfloat_state *state, float128_t a )
{
    union ui128_f128 uA;
    uint_fast64_t uiA64, uiA0;
    bool signA;
    int_fast32_t expA;
    struct uint128 sigA, uiZ;
    struct exp32_sig128 normExpSig;
    int_fast32_t expZ;
    uint_fast32_t sig32A, recipSqrt32, sig32Z;
    struct uint128 rem;
    uint32_t qs[3];
    uint_fast32_t q;
    uint_fast64_t x64, sig64Z;
    struct uint128 y, term;
    uint_fast64_t sigZExtra;
    struct uint128 sigZ;
    union ui128_f128 uZ;

    /*------------------------------------------------------------------------
    | Split the operand into sign, exponent and fraction fields.
    *------------------------------------------------------------------------*/
    uA.f = a;
    uiA64 = uA.ui.v64;
    uiA0  = uA.ui.v0;
    signA = signF128UI64( uiA64 );
    expA  = expF128UI64( uiA64 );
    sigA.v64 = fracF128UI64( uiA64 );
    sigA.v0  = uiA0;
    /*------------------------------------------------------------------------
    | Exponent field 0x7FFF: a nonzero fraction is propagated as a NaN; a
    | zero fraction is returned when the sign is clear, invalid otherwise.
    *------------------------------------------------------------------------*/
    if ( expA == 0x7FFF ) {
        if ( sigA.v64 | sigA.v0 ) {
            uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, 0, 0 );
            goto uiZ;
        }
        if ( ! signA ) return a;
        goto invalid;
    }
    /*------------------------------------------------------------------------
    | Sign set: only the all-zero exponent/fraction pattern is accepted (and
    | returned as is); everything else is invalid.
    *------------------------------------------------------------------------*/
    if ( signA ) {
        if ( ! (expA | sigA.v64 | sigA.v0) ) return a;
        goto invalid;
    }
    /*------------------------------------------------------------------------
    | Exponent field 0: a zero fraction returns `a'; otherwise normalize.
    *------------------------------------------------------------------------*/
    if ( ! expA ) {
        if ( ! (sigA.v64 | sigA.v0) ) return a;
        normExpSig = softfloat_normSubnormalF128Sig( sigA.v64, sigA.v0 );
        expA = normExpSig.exp;
        sigA = normExpSig.sig;
    }
    /*------------------------------------------------------------------------
    | (`sig32Z' is guaranteed to be a lower bound on the square root of
    | `sig32A', which makes `sig32Z' also a lower bound on the square root of
    | `sigA'.)
    *------------------------------------------------------------------------*/
    expZ = ((expA - 0x3FFF)>>1) + 0x3FFE;
    /* From here on `expA' only carries the low bit of the exponent. */
    expA &= 1;
    sigA.v64 |= UINT64_C( 0x0001000000000000 );
    sig32A = sigA.v64>>17;
    recipSqrt32 = softfloat_approxRecipSqrt32_1( expA, sig32A );
    sig32Z = ((uint_fast64_t) sig32A * recipSqrt32)>>32;
    if ( expA ) {
        sig32Z >>= 1;
        rem = softfloat_shortShiftLeft128( sigA.v64, sigA.v0, 12 );
    } else {
        rem = softfloat_shortShiftLeft128( sigA.v64, sigA.v0, 13 );
    }
    /* First stage: record the 32-bit estimate and remove its square from the
       upper word of the remainder. */
    qs[2] = sig32Z;
    rem.v64 -= (uint_fast64_t) sig32Z * sig32Z;
    /*------------------------------------------------------------------------
    | Second stage: estimate the next chunk `q' from the remainder and
    | `recipSqrt32'.
    *------------------------------------------------------------------------*/
    q = ((uint32_t) (rem.v64>>2) * (uint_fast64_t) recipSqrt32)>>32;
    x64 = (uint_fast64_t) sig32Z<<32;
    sig64Z = x64 + ((uint_fast64_t) q<<3);
    y = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
    /*------------------------------------------------------------------------
    | (Repeating this loop is a rare occurrence.)
    | `q' is decremented until the new remainder is non-negative.
    *------------------------------------------------------------------------*/
    for (;;) {
        term = softfloat_mul64ByShifted32To128( x64 + sig64Z, q );
        rem = softfloat_sub128( y.v64, y.v0, term.v64, term.v0 );
        if ( ! (rem.v64 & UINT64_C( 0x8000000000000000 )) ) break;
        --q;
        sig64Z -= 1<<3;
    }
    qs[1] = q;
    /*------------------------------------------------------------------------
    | Third stage: same estimate-then-correct scheme for the next chunk.
    *------------------------------------------------------------------------*/
    q = ((rem.v64>>2) * recipSqrt32)>>32;
    y = softfloat_shortShiftLeft128( rem.v64, rem.v0, 29 );
    sig64Z <<= 1;
    /*------------------------------------------------------------------------
    | (Repeating this loop is a rare occurrence.)
    *------------------------------------------------------------------------*/
    for (;;) {
        term = softfloat_shortShiftLeft128( 0, sig64Z, 32 );
        term = softfloat_add128( term.v64, term.v0, 0, (uint_fast64_t) q<<6 );
        term = softfloat_mul128By32( term.v64, term.v0, q );
        rem = softfloat_sub128( y.v64, y.v0, term.v64, term.v0 );
        if ( ! (rem.v64 & UINT64_C( 0x8000000000000000 )) ) break;
        --q;
    }
    qs[0] = q;
    /*------------------------------------------------------------------------
    | Final estimate (biased upward by 2), then assemble `sigZ'/`sigZExtra'
    | from the three stored stages and this last `q'.
    *------------------------------------------------------------------------*/
    q = (((rem.v64>>2) * recipSqrt32)>>32) + 2;
    sigZExtra = (uint64_t) ((uint_fast64_t) q<<59);
    term = softfloat_shortShiftLeft128( 0, qs[1], 53 );
    sigZ =
        softfloat_add128(
            (uint_fast64_t) qs[2]<<18, ((uint_fast64_t) qs[0]<<24) + (q>>5),
            term.v64, term.v0
        );
    /*------------------------------------------------------------------------
    | When the low four bits of `q' are at most 2, the low two bits of `q' are
    | cleared and a remainder is recomputed so that `sigZExtra' (and possibly
    | `sigZ') can be adjusted.  NOTE(review): presumably this is the case
    | where the estimate is too close to a rounding boundary -- confirm.
    *------------------------------------------------------------------------*/
    if ( (q & 0xF) <= 2 ) {
        q &= ~3;
        sigZExtra = (uint64_t) ((uint_fast64_t) q<<59);
        y = softfloat_shortShiftLeft128( sigZ.v64, sigZ.v0, 6 );
        y.v0 |= sigZExtra>>58;
        term = softfloat_sub128( y.v64, y.v0, 0, q );
        y    = softfloat_mul64ByShifted32To128( term.v0,  q );
        term = softfloat_mul64ByShifted32To128( term.v64, q );
        term = softfloat_add128( term.v64, term.v0, 0, y.v64 );
        rem = softfloat_shortShiftLeft128( rem.v64, rem.v0, 20 );
        term = softfloat_sub128( term.v64, term.v0, rem.v64, rem.v0 );
        /*--------------------------------------------------------------------
        | The concatenation of `term' and `y.v0' is now the negative remainder
        | (3 words altogether).
        *--------------------------------------------------------------------*/
        if ( term.v64 & UINT64_C( 0x8000000000000000 ) ) {
            sigZExtra |= 1;
        } else {
            if ( term.v64 | term.v0 | y.v0 ) {
                /* Nonzero value: step the result down by one unit, borrowing
                   from `sigZ' when `sigZExtra' is already zero. */
                if ( sigZExtra ) {
                    --sigZExtra;
                } else {
                    sigZ = softfloat_sub128( sigZ.v64, sigZ.v0, 0, 1 );
                    sigZExtra = ~0;
                }
            }
        }
    }
    return
        softfloat_roundPackToF128(
            state, 0, expZ, sigZ.v64, sigZ.v0, sigZExtra );
    /*------------------------------------------------------------------------
    | Invalid operation: raise the flag and return the default NaN pattern.
    *------------------------------------------------------------------------*/
 invalid:
    softfloat_raiseFlags( state, softfloat_flag_invalid );
    uiZ.v64 = defaultNaNF128UI64;
    uiZ.v0  = defaultNaNF128UI0;
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;

}

================================================ FILE: External/SoftFloat-3e/src/f128_sub.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" float128_t f128_sub( struct softfloat_state *state, float128_t a, float128_t b ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool signA; union ui128_f128 uB; uint_fast64_t uiB64, uiB0; bool signB; #if ! defined INLINE_LEVEL || (INLINE_LEVEL < 2) float128_t (*magsFuncPtr)( uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, bool ); #endif uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; signA = signF128UI64( uiA64 ); uB.f = b; uiB64 = uB.ui.v64; uiB0 = uB.ui.v0; signB = signF128UI64( uiB64 ); #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) if ( signA == signB ) { return softfloat_subMagsF128( state, uiA64, uiA0, uiB64, uiB0, signA ); } else { return softfloat_addMagsF128( state, uiA64, uiA0, uiB64, uiB0, signA ); } #else magsFuncPtr = (signA == signB) ? 
softfloat_subMagsF128 : softfloat_addMagsF128; return (*magsFuncPtr)( uiA64, uiA0, uiB64, uiB0, signA ); #endif } ================================================ FILE: External/SoftFloat-3e/src/f128_to_extF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t f128_to_extF80( struct softfloat_state *state, float128_t a ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t frac64, frac0; struct commonNaN commonNaN; struct uint128 uiZ; uint_fast16_t uiZ64; uint_fast64_t uiZ0; struct exp32_sig128 normExpSig; struct uint128 sig128; union { struct extFloat80M s; extFloat80_t f; } uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); frac64 = fracF128UI64( uiA64 ); frac0 = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( frac64 | frac0 ) { softfloat_f128UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToExtF80UI( &commonNaN ); uiZ64 = uiZ.v64; uiZ0 = uiZ.v0; } else { uiZ64 = packToExtF80UI64( sign, 0x7FFF ); uiZ0 = UINT64_C( 0x8000000000000000 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( ! exp ) { if ( ! 
(frac64 | frac0) ) { uiZ64 = packToExtF80UI64( sign, 0 ); uiZ0 = 0; goto uiZ; } normExpSig = softfloat_normSubnormalF128Sig( frac64, frac0 ); exp = normExpSig.exp; frac64 = normExpSig.sig.v64; frac0 = normExpSig.sig.v0; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ sig128 = softfloat_shortShiftLeft128( frac64 | UINT64_C( 0x0001000000000000 ), frac0, 15 ); return softfloat_roundPackToExtF80( state, sign, exp, sig128.v64, sig128.v0, 80 ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.s.signExp = uiZ64; uZ.s.signif = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f128_to_f16.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" float16_t f128_to_f16( struct softfloat_state *state, float128_t a ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t frac64; struct commonNaN commonNaN; uint_fast16_t uiZ, frac16; union ui16_f16 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); frac64 = fracF128UI64( uiA64 ) | (uiA0 != 0); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( frac64 ) { softfloat_f128UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToF16UI( &commonNaN ); } else { uiZ = packToF16UI( sign, 0x1F, 0 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ frac16 = 
softfloat_shortShiftRightJam64( frac64, 34 ); if ( ! (exp | frac16) ) { uiZ = packToF16UI( sign, 0, 0 ); goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ exp -= 0x3FF1; if ( sizeof (int_fast16_t) < sizeof (int_fast32_t) ) { if ( exp < -0x40 ) exp = -0x40; } return softfloat_roundPackToF16( sign, exp, frac16 | 0x4000 ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f128_to_f32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" float32_t f128_to_f32( struct softfloat_state *state, float128_t a ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t frac64; struct commonNaN commonNaN; uint_fast32_t uiZ, frac32; union ui32_f32 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); frac64 = fracF128UI64( uiA64 ) | (uiA0 != 0); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( frac64 ) { softfloat_f128UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToF32UI( &commonNaN ); } else { uiZ = packToF32UI( sign, 0xFF, 0 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ frac32 = softfloat_shortShiftRightJam64( frac64, 18 ); if ( ! 
(exp | frac32) ) { uiZ = packToF32UI( sign, 0, 0 ); goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ exp -= 0x3F81; if ( sizeof (int_fast16_t) < sizeof (int_fast32_t) ) { if ( exp < -0x1000 ) exp = -0x1000; } return softfloat_roundPackToF32( state, sign, exp, frac32 | 0x40000000 ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f128_to_f64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" float64_t f128_to_f64( struct softfloat_state *state, float128_t a ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t frac64, frac0; struct commonNaN commonNaN; uint_fast64_t uiZ; struct uint128 frac128; union ui64_f64 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); frac64 = fracF128UI64( uiA64 ); frac0 = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FFF ) { if ( frac64 | frac0 ) { softfloat_f128UIToCommonNaN( state, uiA64, uiA0, &commonNaN ); uiZ = softfloat_commonNaNToF64UI( &commonNaN ); } else { uiZ = packToF64UI( sign, 0x7FF, 0 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ frac128 = softfloat_shortShiftLeft128( frac64, frac0, 14 ); frac64 = frac128.v64 | (frac128.v0 != 0); if ( ! 
(exp | frac64) ) { uiZ = packToF64UI( sign, 0, 0 ); goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ exp -= 0x3C01; if ( sizeof (int_fast16_t) < sizeof (int_fast32_t) ) { if ( exp < -0x1000 ) exp = -0x1000; } return softfloat_roundPackToF64( state, sign, exp, frac64 | UINT64_C( 0x4000000000000000 ) ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f128_to_i32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" int_fast32_t f128_to_i32( struct softfloat_state *state, float128_t a, uint_fast8_t roundingMode, bool exact ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig64, sig0; int_fast32_t shiftDist; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); sig64 = fracF128UI64( uiA64 ); sig0 = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ #if (i32_fromNaN != i32_fromPosOverflow) || (i32_fromNaN != i32_fromNegOverflow) if ( (exp == 0x7FFF) && (sig64 | sig0) ) { #if (i32_fromNaN == i32_fromPosOverflow) sign = 0; #elif (i32_fromNaN == i32_fromNegOverflow) sign = 1; #else softfloat_raiseFlags( softfloat_flag_invalid ); return i32_fromNaN; #endif } #endif /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp ) sig64 |= UINT64_C( 0x0001000000000000 ); sig64 |= (sig0 != 0); shiftDist = 0x4023 - exp; if ( 0 < shiftDist ) sig64 = 
softfloat_shiftRightJam64( sig64, shiftDist ); return softfloat_roundToI32( state, sign, sig64, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/f128_to_i64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" int_fast64_t f128_to_i64( struct softfloat_state *state, float128_t a, uint_fast8_t roundingMode, bool exact ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig64, sig0; int_fast32_t shiftDist; struct uint128 sig128; struct uint64_extra sigExtra; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); sig64 = fracF128UI64( uiA64 ); sig0 = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftDist = 0x402F - exp; if ( shiftDist <= 0 ) { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ if ( shiftDist < -15 ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return (exp == 0x7FFF) && (sig64 | sig0) ? i64_fromNaN : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; } /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ sig64 |= UINT64_C( 0x0001000000000000 ); if ( shiftDist ) { sig128 = softfloat_shortShiftLeft128( sig64, sig0, -shiftDist ); sig64 = sig128.v64; sig0 = sig128.v0; } } else { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ if ( exp ) sig64 |= UINT64_C( 0x0001000000000000 ); sigExtra = softfloat_shiftRightJam64Extra( sig64, sig0, shiftDist ); sig64 = sigExtra.v; sig0 = sigExtra.extra; } return softfloat_roundToI64( state, sign, sig64, sig0, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/f128_to_ui32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" uint_fast32_t f128_to_ui32( struct softfloat_state *state, float128_t a, uint_fast8_t roundingMode, bool exact ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig64; int_fast32_t shiftDist; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); sig64 = fracF128UI64( uiA64 ) | (uiA0 != 0); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ #if (ui32_fromNaN != ui32_fromPosOverflow) || (ui32_fromNaN != ui32_fromNegOverflow) if ( (exp == 0x7FFF) && sig64 ) { #if (ui32_fromNaN == ui32_fromPosOverflow) sign = 0; #elif (ui32_fromNaN == ui32_fromNegOverflow) sign = 1; #else softfloat_raiseFlags( softfloat_flag_invalid ); return ui32_fromNaN; #endif } #endif 
/*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp ) sig64 |= UINT64_C( 0x0001000000000000 ); shiftDist = 0x4023 - exp; if ( 0 < shiftDist ) { sig64 = softfloat_shiftRightJam64( sig64, shiftDist ); } return softfloat_roundToUI32( sign, sig64, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/f128_to_ui64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" uint_fast64_t f128_to_ui64( struct softfloat_state *state, float128_t a, uint_fast8_t roundingMode, bool exact ) { union ui128_f128 uA; uint_fast64_t uiA64, uiA0; bool sign; int_fast32_t exp; uint_fast64_t sig64, sig0; int_fast32_t shiftDist; struct uint128 sig128; struct uint64_extra sigExtra; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA64 = uA.ui.v64; uiA0 = uA.ui.v0; sign = signF128UI64( uiA64 ); exp = expF128UI64( uiA64 ); sig64 = fracF128UI64( uiA64 ); sig0 = uiA0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftDist = 0x402F - exp; if ( shiftDist <= 0 ) { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ if ( shiftDist < -15 ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); return (exp == 0x7FFF) && (sig64 | sig0) ? ui64_fromNaN : sign ? 
ui64_fromNegOverflow : ui64_fromPosOverflow; } /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ sig64 |= UINT64_C( 0x0001000000000000 ); if ( shiftDist ) { sig128 = softfloat_shortShiftLeft128( sig64, sig0, -shiftDist ); sig64 = sig128.v64; sig0 = sig128.v0; } } else { /*-------------------------------------------------------------------- *--------------------------------------------------------------------*/ if ( exp ) sig64 |= UINT64_C( 0x0001000000000000 ); sigExtra = softfloat_shiftRightJam64Extra( sig64, sig0, shiftDist ); sig64 = sigExtra.v; sig0 = sigExtra.extra; } return softfloat_roundToUI64( state, sign, sig64, sig0, roundingMode, exact ); } ================================================ FILE: External/SoftFloat-3e/src/f32_to_extF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t f32_to_extF80( struct softfloat_state *state, float32_t a ) { union ui32_f32 uA; uint_fast32_t uiA; bool sign; int_fast16_t exp; uint_fast32_t frac; struct commonNaN commonNaN; struct uint128 uiZ; uint_fast16_t uiZ64; uint_fast64_t uiZ0; struct exp16_sig32 normExpSig; union { struct extFloat80M s; extFloat80_t f; } uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA = uA.ui; sign = signF32UI( uiA ); exp = expF32UI( uiA ); frac = fracF32UI( uiA ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0xFF ) { if ( frac ) { softfloat_f32UIToCommonNaN( state, uiA, &commonNaN ); uiZ = softfloat_commonNaNToExtF80UI( &commonNaN ); uiZ64 = uiZ.v64; uiZ0 = uiZ.v0; } else { uiZ64 = packToExtF80UI64( sign, 0x7FFF ); uiZ0 = UINT64_C( 0x8000000000000000 ); } goto uiZ; } 
/*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( ! exp ) { if ( ! frac ) { uiZ64 = packToExtF80UI64( sign, 0 ); uiZ0 = 0; goto uiZ; } normExpSig = softfloat_normSubnormalF32Sig( frac ); exp = normExpSig.exp; frac = normExpSig.sig; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ64 = packToExtF80UI64( sign, exp + 0x3F80 ); uiZ0 = (uint_fast64_t) (frac | 0x00800000)<<40; uiZ: uZ.s.signExp = uiZ64; uZ.s.signif = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f32_to_f128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" float128_t f32_to_f128( struct softfloat_state *state, float32_t a ) { union ui32_f32 uA; uint_fast32_t uiA; bool sign; int_fast16_t exp; uint_fast32_t frac; struct commonNaN commonNaN; struct uint128 uiZ; struct exp16_sig32 normExpSig; union ui128_f128 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA = uA.ui; sign = signF32UI( uiA ); exp = expF32UI( uiA ); frac = fracF32UI( uiA ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0xFF ) { if ( frac ) { softfloat_f32UIToCommonNaN( state, uiA, &commonNaN ); uiZ = softfloat_commonNaNToF128UI( &commonNaN ); } else { uiZ.v64 = packToF128UI64( sign, 0x7FFF, 0 ); uiZ.v0 = 0; } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( ! exp ) { if ( ! 
frac ) { uiZ.v64 = packToF128UI64( sign, 0, 0 ); uiZ.v0 = 0; goto uiZ; } normExpSig = softfloat_normSubnormalF32Sig( frac ); exp = normExpSig.exp - 1; frac = normExpSig.sig; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ.v64 = packToF128UI64( sign, exp + 0x3F80, (uint_fast64_t) frac<<25 ); uiZ.v0 = 0; uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/f64_to_extF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t f64_to_extF80( struct softfloat_state *state, float64_t a ) { union ui64_f64 uA; uint_fast64_t uiA; bool sign; int_fast16_t exp; uint_fast64_t frac; struct commonNaN commonNaN; struct uint128 uiZ; uint_fast16_t uiZ64; uint_fast64_t uiZ0; struct exp16_sig64 normExpSig; union { struct extFloat80M s; extFloat80_t f; } uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uA.f = a; uiA = uA.ui; sign = signF64UI( uiA ); exp = expF64UI( uiA ); frac = fracF64UI( uiA ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( exp == 0x7FF ) { if ( frac ) { softfloat_f64UIToCommonNaN( state, uiA, &commonNaN ); uiZ = softfloat_commonNaNToExtF80UI( &commonNaN ); uiZ64 = uiZ.v64; uiZ0 = uiZ.v0; } else { uiZ64 = packToExtF80UI64( sign, 0x7FFF ); uiZ0 = UINT64_C( 0x8000000000000000 ); } goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( ! exp ) { if ( ! 
frac ) { uiZ64 = packToExtF80UI64( sign, 0 ); uiZ0 = 0; goto uiZ; } normExpSig = softfloat_normSubnormalF64Sig( frac ); exp = normExpSig.exp; frac = normExpSig.sig; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ64 = packToExtF80UI64( sign, exp + 0x3C00 ); uiZ0 = (frac | UINT64_C( 0x0010000000000000 ))<<11; uiZ: uZ.s.signExp = uiZ64; uZ.s.signif = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/i32_to_extF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t i32_to_extF80( int32_t a ) { uint_fast16_t uiZ64; uint_fast32_t absA; bool sign; int_fast8_t shiftDist; union { struct extFloat80M s; extFloat80_t f; } uZ; uiZ64 = 0; absA = 0; if ( a ) { sign = (a < 0); absA = sign ? -(uint_fast32_t) a : (uint_fast32_t) a; shiftDist = softfloat_countLeadingZeros32( absA ); uiZ64 = packToExtF80UI64( sign, 0x401E - shiftDist ); absA <<= shiftDist; } uZ.s.signExp = uiZ64; uZ.s.signif = (uint_fast64_t) absA<<32; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/i32_to_f128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "internals.h" #include "softfloat.h" float128_t i32_to_f128( int32_t a ) { uint_fast64_t uiZ64; bool sign; uint_fast32_t absA; int_fast8_t shiftDist; union ui128_f128 uZ; uiZ64 = 0; if ( a ) { sign = (a < 0); absA = sign ? 
-(uint_fast32_t) a : (uint_fast32_t) a; shiftDist = softfloat_countLeadingZeros32( absA ) + 17; uiZ64 = packToF128UI64( sign, 0x402E - shiftDist, (uint_fast64_t) absA< #include #include "primitives.h" #include "softfloat_types.h" union ui16_f16 { uint16_t ui; float16_t f; }; union ui32_f32 { uint32_t ui; float32_t f; }; union ui64_f64 { uint64_t ui; float64_t f; }; #ifdef SOFTFLOAT_FAST_INT64 union extF80M_extF80 { struct extFloat80M fM; extFloat80_t f; }; union ui128_f128 { struct uint128 ui; float128_t f; }; #endif enum { softfloat_mulAdd_subC = 1, softfloat_mulAdd_subProd = 2 }; /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ uint_fast32_t softfloat_roundToUI32( bool, uint_fast64_t, uint_fast8_t, bool ); #ifdef SOFTFLOAT_FAST_INT64 uint_fast64_t softfloat_roundToUI64( struct softfloat_state *, bool, uint_fast64_t, uint_fast64_t, uint_fast8_t, bool ); #else uint_fast64_t softfloat_roundMToUI64( bool, uint32_t *, uint_fast8_t, bool ); #endif FEXCORE_PRESERVE_ALL_ATTR int_fast32_t softfloat_roundToI32( struct softfloat_state *, bool, uint_fast64_t, uint_fast8_t, bool ); #ifdef SOFTFLOAT_FAST_INT64 FEXCORE_PRESERVE_ALL_ATTR int_fast64_t softfloat_roundToI64( struct softfloat_state *, bool, uint_fast64_t, uint_fast64_t, uint_fast8_t, bool ); #else int_fast64_t softfloat_roundMToI64( bool, uint32_t *, uint_fast8_t, bool ); #endif /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signF16UI( a ) ((bool) ((uint16_t) (a)>>15)) #define expF16UI( a ) ((int_fast8_t) ((a)>>10) & 0x1F) #define fracF16UI( a ) ((a) & 0x03FF) #define packToF16UI( sign, exp, sig ) (((uint16_t) (sign)<<15) + ((uint16_t) (exp)<<10) + (sig)) #define isNaNF16UI( a ) (((~(a) & 0x7C00) == 0) && ((a) & 0x03FF)) struct exp8_sig16 { int_fast8_t exp; uint_fast16_t sig; }; struct 
exp8_sig16 softfloat_normSubnormalF16Sig( uint_fast16_t ); float16_t softfloat_roundPackToF16( bool, int_fast16_t, uint_fast16_t ); float16_t softfloat_normRoundPackToF16( bool, int_fast16_t, uint_fast16_t ); float16_t softfloat_addMagsF16( uint_fast16_t, uint_fast16_t ); float16_t softfloat_subMagsF16( uint_fast16_t, uint_fast16_t ); float16_t softfloat_mulAddF16( uint_fast16_t, uint_fast16_t, uint_fast16_t, uint_fast8_t ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signF32UI( a ) ((bool) ((uint32_t) (a)>>31)) #define expF32UI( a ) ((int_fast16_t) ((a)>>23) & 0xFF) #define fracF32UI( a ) ((a) & 0x007FFFFF) #define packToF32UI( sign, exp, sig ) (((uint32_t) (sign)<<31) + ((uint32_t) (exp)<<23) + (sig)) #define isNaNF32UI( a ) (((~(a) & 0x7F800000) == 0) && ((a) & 0x007FFFFF)) struct exp16_sig32 { int_fast16_t exp; uint_fast32_t sig; }; FEXCORE_PRESERVE_ALL_ATTR struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t ); FEXCORE_PRESERVE_ALL_ATTR float32_t softfloat_roundPackToF32( struct softfloat_state *, bool, int_fast16_t, uint_fast32_t ); float32_t softfloat_normRoundPackToF32( bool, int_fast16_t, uint_fast32_t ); float32_t softfloat_addMagsF32( uint_fast32_t, uint_fast32_t ); float32_t softfloat_subMagsF32( uint_fast32_t, uint_fast32_t ); float32_t softfloat_mulAddF32( uint_fast32_t, uint_fast32_t, uint_fast32_t, uint_fast8_t ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signF64UI( a ) ((bool) ((uint64_t) (a)>>63)) #define expF64UI( a ) ((int_fast16_t) ((a)>>52) & 0x7FF) #define fracF64UI( a ) ((a) & UINT64_C( 0x000FFFFFFFFFFFFF )) #define packToF64UI( sign, exp, sig ) ((uint64_t) (((uint_fast64_t) (sign)<<63) + ((uint_fast64_t) (exp)<<52) + (sig))) #define isNaNF64UI( a ) (((~(a) & UINT64_C( 
0x7FF0000000000000 )) == 0) && ((a) & UINT64_C( 0x000FFFFFFFFFFFFF ))) struct exp16_sig64 { int_fast16_t exp; uint_fast64_t sig; }; FEXCORE_PRESERVE_ALL_ATTR struct exp16_sig64 softfloat_normSubnormalF64Sig( uint_fast64_t ); FEXCORE_PRESERVE_ALL_ATTR float64_t softfloat_roundPackToF64( struct softfloat_state *, bool, int_fast16_t, uint_fast64_t ); float64_t softfloat_normRoundPackToF64( bool, int_fast16_t, uint_fast64_t ); float64_t softfloat_addMagsF64( uint_fast64_t, uint_fast64_t, bool ); float64_t softfloat_subMagsF64( uint_fast64_t, uint_fast64_t, bool ); float64_t softfloat_mulAddF64( uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast8_t ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signExtF80UI64( a64 ) ((bool) ((uint16_t) (a64)>>15)) #define expExtF80UI64( a64 ) ((a64) & 0x7FFF) #define packToExtF80UI64( sign, exp ) ((uint_fast16_t) (sign)<<15 | (exp)) #define isNaNExtF80UI( a64, a0 ) ((((a64) & 0x7FFF) == 0x7FFF) && ((a0) & UINT64_C( 0x7FFFFFFFFFFFFFFF ))) #ifdef SOFTFLOAT_FAST_INT64 /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ struct exp32_sig64 { int_fast32_t exp; uint64_t sig; }; FEXCORE_PRESERVE_ALL_ATTR struct exp32_sig64 softfloat_normSubnormalExtF80Sig( uint_fast64_t ); FEXCORE_PRESERVE_ALL_ATTR extFloat80_t softfloat_roundPackToExtF80( struct softfloat_state *, bool, int_fast32_t, uint_fast64_t, uint_fast64_t, uint_fast8_t ); FEXCORE_PRESERVE_ALL_ATTR extFloat80_t softfloat_normRoundPackToExtF80( struct softfloat_state *, bool, int_fast32_t, uint_fast64_t, uint_fast64_t, uint_fast8_t ); extFloat80_t softfloat_addMagsExtF80( struct softfloat_state *, uint_fast16_t, uint_fast64_t, uint_fast16_t, uint_fast64_t, bool ); extFloat80_t softfloat_subMagsExtF80( struct softfloat_state *, uint_fast16_t, uint_fast64_t, 
uint_fast16_t, uint_fast64_t, bool ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signF128UI64( a64 ) ((bool) ((uint64_t) (a64)>>63)) #define expF128UI64( a64 ) ((int_fast32_t) ((a64)>>48) & 0x7FFF) #define fracF128UI64( a64 ) ((a64) & UINT64_C( 0x0000FFFFFFFFFFFF )) #define packToF128UI64( sign, exp, sig64 ) (((uint_fast64_t) (sign)<<63) + ((uint_fast64_t) (exp)<<48) + (sig64)) #define isNaNF128UI( a64, a0 ) (((~(a64) & UINT64_C( 0x7FFF000000000000 )) == 0) && (a0 || ((a64) & UINT64_C( 0x0000FFFFFFFFFFFF )))) struct exp32_sig128 { int_fast32_t exp; struct uint128 sig; }; FEXCORE_PRESERVE_ALL_ATTR struct exp32_sig128 softfloat_normSubnormalF128Sig( uint_fast64_t, uint_fast64_t ); float128_t softfloat_roundPackToF128( struct softfloat_state *, bool, int_fast32_t, uint_fast64_t, uint_fast64_t, uint_fast64_t ); float128_t softfloat_normRoundPackToF128( struct softfloat_state *, bool, int_fast32_t, uint_fast64_t, uint_fast64_t ); float128_t softfloat_addMagsF128( struct softfloat_state *, uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, bool ); float128_t softfloat_subMagsF128( struct softfloat_state *, uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, bool ); float128_t softfloat_mulAddF128( uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast64_t, uint_fast8_t ); #else /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ bool softfloat_tryPropagateNaNExtF80M( const struct extFloat80M *, const struct extFloat80M *, struct extFloat80M * ); void softfloat_invalidExtF80M( struct extFloat80M * ); int softfloat_normExtF80SigM( uint64_t * ); void softfloat_roundPackMToExtF80M( bool, int32_t, uint32_t *, uint_fast8_t, struct extFloat80M * ); void softfloat_normRoundPackMToExtF80M( bool, int32_t, 
uint32_t *, uint_fast8_t, struct extFloat80M * ); void softfloat_addExtF80M( const struct extFloat80M *, const struct extFloat80M *, struct extFloat80M *, bool ); int softfloat_compareNonnormExtF80M( const struct extFloat80M *, const struct extFloat80M * ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ #define signF128UI96( a96 ) ((bool) ((uint32_t) (a96)>>31)) #define expF128UI96( a96 ) ((int32_t) ((a96)>>16) & 0x7FFF) #define fracF128UI96( a96 ) ((a96) & 0x0000FFFF) #define packToF128UI96( sign, exp, sig96 ) (((uint32_t) (sign)<<31) + ((uint32_t) (exp)<<16) + (sig96)) bool softfloat_isNaNF128M( const uint32_t * ); bool softfloat_tryPropagateNaNF128M( const uint32_t *, const uint32_t *, uint32_t * ); void softfloat_invalidF128M( uint32_t * ); int softfloat_shiftNormSigF128M( const uint32_t *, uint_fast8_t, uint32_t * ); void softfloat_roundPackMToF128M( bool, int32_t, uint32_t *, uint32_t * ); void softfloat_normRoundPackMToF128M( bool, int32_t, uint32_t *, uint32_t * ); void softfloat_addF128M( const uint32_t *, const uint32_t *, uint32_t *, bool ); void softfloat_mulAddF128M( const uint32_t *, const uint32_t *, const uint32_t *, uint32_t *, uint_fast8_t ); #endif #endif ================================================ FILE: External/SoftFloat-3e/src/primitives.h ================================================ /*============================================================================ This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #ifndef primitives_h #define primitives_h 1 #include #include #include "primitiveTypes.h" #ifndef softfloat_shortShiftRightJam64 /*---------------------------------------------------------------------------- | Shifts 'a' right by the number of bits given in 'dist', which must be in | the range 1 to 63. If any nonzero bits are shifted off, they are "jammed" | into the least-significant bit of the shifted value by setting the least- | significant bit to 1. This shifted-and-jammed value is returned. 
*----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE uint64_t softfloat_shortShiftRightJam64( uint64_t a, uint_fast8_t dist ) { return a>>dist | ((a & (((uint_fast64_t) 1<>dist | ((uint32_t) (a<<(-dist & 31)) != 0) : (a != 0); } #else FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_shiftRightJam32( uint32_t a, uint_fast16_t dist ); #endif #endif #ifndef softfloat_shiftRightJam64 /*---------------------------------------------------------------------------- | Shifts 'a' right by the number of bits given in 'dist', which must not | be zero. If any nonzero bits are shifted off, they are "jammed" into the | least-significant bit of the shifted value by setting the least-significant | bit to 1. This shifted-and-jammed value is returned. | The value of 'dist' can be arbitrarily large. In particular, if 'dist' is | greater than 64, the result will be either 0 or 1, depending on whether 'a' | is zero or nonzero. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (3 <= INLINE_LEVEL) INLINE uint64_t softfloat_shiftRightJam64( uint64_t a, uint_fast32_t dist ) { return (dist < 63) ? a>>dist | ((uint64_t) (a<<(-dist & 63)) != 0) : (a != 0); } #else FEXCORE_PRESERVE_ALL_ATTR uint64_t softfloat_shiftRightJam64( uint64_t a, uint_fast32_t dist ); #endif #endif /*---------------------------------------------------------------------------- | A constant table that translates an 8-bit unsigned integer (the array index) | into the number of leading 0 bits before the most-significant 1 of that | integer. For integer zero (index 0), the corresponding table element is 8. 
*----------------------------------------------------------------------------*/ extern const uint_least8_t softfloat_countLeadingZeros8[256]; #ifndef softfloat_countLeadingZeros16 /*---------------------------------------------------------------------------- | Returns the number of leading 0 bits before the most-significant 1 bit of | 'a'. If 'a' is zero, 16 is returned. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE uint_fast8_t softfloat_countLeadingZeros16( uint16_t a ) { uint_fast8_t count = 8; if ( 0x100 <= a ) { count = 0; a >>= 8; } count += softfloat_countLeadingZeros8[a]; return count; } #else uint_fast8_t softfloat_countLeadingZeros16( uint16_t a ); #endif #endif #ifndef softfloat_countLeadingZeros32 /*---------------------------------------------------------------------------- | Returns the number of leading 0 bits before the most-significant 1 bit of | 'a'. If 'a' is zero, 32 is returned. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (3 <= INLINE_LEVEL) INLINE uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ) { uint_fast8_t count = 0; if ( a < 0x10000 ) { count = 16; a <<= 16; } if ( a < 0x1000000 ) { count += 8; a <<= 8; } count += softfloat_countLeadingZeros8[a>>24]; return count; } #else FEXCORE_PRESERVE_ALL_ATTR uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ); #endif #endif #ifndef softfloat_countLeadingZeros64 /*---------------------------------------------------------------------------- | Returns the number of leading 0 bits before the most-significant 1 bit of | 'a'. If 'a' is zero, 64 is returned. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ); #endif extern const uint16_t softfloat_approxRecip_1k0s[16]; extern const uint16_t softfloat_approxRecip_1k1s[16]; #ifndef softfloat_approxRecip32_1 /*---------------------------------------------------------------------------- | Returns an approximation to the reciprocal of the number represented by 'a', | where 'a' is interpreted as an unsigned fixed-point number with one integer | bit and 31 fraction bits. The 'a' input must be "normalized", meaning that | its most-significant bit (bit 31) must be 1. Thus, if A is the value of | the fixed-point interpretation of 'a', then 1 <= A < 2. The returned value | is interpreted as a pure unsigned fraction, having no integer bits and 32 | fraction bits. The approximation returned is never greater than the true | reciprocal 1/A, and it differs from the true reciprocal by at most 2.006 ulp | (units in the last place). *----------------------------------------------------------------------------*/ #ifdef SOFTFLOAT_FAST_DIV64TO32 #define softfloat_approxRecip32_1( a ) ((uint32_t) (UINT64_C( 0x7FFFFFFFFFFFFFFF ) / (uint32_t) (a))) #else FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_approxRecip32_1( uint32_t a ); #endif #endif extern const uint16_t softfloat_approxRecipSqrt_1k0s[16]; extern const uint16_t softfloat_approxRecipSqrt_1k1s[16]; #ifndef softfloat_approxRecipSqrt32_1 /*---------------------------------------------------------------------------- | Returns an approximation to the reciprocal of the square root of the number | represented by 'a', where 'a' is interpreted as an unsigned fixed-point | number either with one integer bit and 31 fraction bits or with two integer | bits and 30 fraction bits. The format of 'a' is determined by 'oddExpA', | which must be either 0 or 1. 
If 'oddExpA' is 1, 'a' is interpreted as | having one integer bit, and if 'oddExpA' is 0, 'a' is interpreted as having | two integer bits. The 'a' input must be "normalized", meaning that its | most-significant bit (bit 31) must be 1. Thus, if A is the value of the | fixed-point interpretation of 'a', it follows that 1 <= A < 2 when 'oddExpA' | is 1, and 2 <= A < 4 when 'oddExpA' is 0. | The returned value is interpreted as a pure unsigned fraction, having | no integer bits and 32 fraction bits. The approximation returned is never | greater than the true reciprocal 1/sqrt(A), and it differs from the true | reciprocal by at most 2.06 ulp (units in the last place). The approximation | returned is also always within the range 0.5 to 1; thus, the most- | significant bit of the result is always set. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_approxRecipSqrt32_1( unsigned int oddExpA, uint32_t a ); #endif #ifdef SOFTFLOAT_FAST_INT64 /*---------------------------------------------------------------------------- | The following functions are needed only when 'SOFTFLOAT_FAST_INT64' is | defined. *----------------------------------------------------------------------------*/ #ifndef softfloat_eq128 /*---------------------------------------------------------------------------- | Returns true if the 128-bit unsigned integer formed by concatenating 'a64' | and 'a0' is equal to the 128-bit unsigned integer formed by concatenating | 'b64' and 'b0'. 
*----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (1 <= INLINE_LEVEL) INLINE bool softfloat_eq128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { return (a64 == b64) && (a0 == b0); } #else bool softfloat_eq128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ); #endif #endif #ifndef softfloat_le128 /*---------------------------------------------------------------------------- | Returns true if the 128-bit unsigned integer formed by concatenating 'a64' | and 'a0' is less than or equal to the 128-bit unsigned integer formed by | concatenating 'b64' and 'b0'. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE bool softfloat_le128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { return (a64 < b64) || ((a64 == b64) && (a0 <= b0)); } #else FEXCORE_PRESERVE_ALL_ATTR bool softfloat_le128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ); #endif #endif #ifndef softfloat_lt128 /*---------------------------------------------------------------------------- | Returns true if the 128-bit unsigned integer formed by concatenating 'a64' | and 'a0' is less than the 128-bit unsigned integer formed by concatenating | 'b64' and 'b0'. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE bool softfloat_lt128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { return (a64 < b64) || ((a64 == b64) && (a0 < b0)); } #else FEXCORE_PRESERVE_ALL_ATTR bool softfloat_lt128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ); #endif #endif #ifndef softfloat_shortShiftLeft128 /*---------------------------------------------------------------------------- | Shifts the 128 bits formed by concatenating 'a64' and 'a0' left by the | number of bits given in 'dist', which must be in the range 1 to 63. 
*----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE struct uint128 softfloat_shortShiftLeft128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) { struct uint128 z; z.v64 = a64<>(-dist & 63); z.v0 = a0<>dist; z.v0 = a64<<(-dist & 63) | a0>>dist; return z; } #else FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_shortShiftRight128( uint64_t a64, uint64_t a0, uint_fast8_t dist ); #endif #endif #ifndef softfloat_shortShiftRightJam64Extra /*---------------------------------------------------------------------------- | This function is the same as 'softfloat_shiftRightJam64Extra' (below), | except that 'dist' must be in the range 1 to 63. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE struct uint64_extra softfloat_shortShiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast8_t dist ) { struct uint64_extra z; z.v = a>>dist; z.extra = a<<(-dist & 63) | (extra != 0); return z; } #else struct uint64_extra softfloat_shortShiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast8_t dist ); #endif #endif #ifndef softfloat_shortShiftRightJam128 /*---------------------------------------------------------------------------- | Shifts the 128 bits formed by concatenating 'a64' and 'a0' right by the | number of bits given in 'dist', which must be in the range 1 to 63. If any | nonzero bits are shifted off, they are "jammed" into the least-significant | bit of the shifted value by setting the least-significant bit to 1. This | shifted-and-jammed value is returned. 
*----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (3 <= INLINE_LEVEL) INLINE struct uint128 softfloat_shortShiftRightJam128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) { uint_fast8_t negDist = -dist; struct uint128 z; z.v64 = a64>>dist; z.v0 = a64<<(negDist & 63) | a0>>dist | ((uint64_t) (a0<<(negDist & 63)) != 0); return z; } #else struct uint128 softfloat_shortShiftRightJam128( uint64_t a64, uint64_t a0, uint_fast8_t dist ); #endif #endif #ifndef softfloat_shortShiftRightJam128Extra /*---------------------------------------------------------------------------- | This function is the same as 'softfloat_shiftRightJam128Extra' (below), | except that 'dist' must be in the range 1 to 63. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (3 <= INLINE_LEVEL) INLINE struct uint128_extra softfloat_shortShiftRightJam128Extra( uint64_t a64, uint64_t a0, uint64_t extra, uint_fast8_t dist ) { uint_fast8_t negDist = -dist; struct uint128_extra z; z.v.v64 = a64>>dist; z.v.v0 = a64<<(negDist & 63) | a0>>dist; z.extra = a0<<(negDist & 63) | (extra != 0); return z; } #else struct uint128_extra softfloat_shortShiftRightJam128Extra( uint64_t a64, uint64_t a0, uint64_t extra, uint_fast8_t dist ); #endif #endif #ifndef softfloat_shiftRightJam64Extra /*---------------------------------------------------------------------------- | Shifts the 128 bits formed by concatenating 'a' and 'extra' right by 64 | _plus_ the number of bits given in 'dist', which must not be zero. This | shifted value is at most 64 nonzero bits and is returned in the 'v' field | of the 'struct uint64_extra' result. 
The 64-bit 'extra' field of the result | contains a value formed as follows from the bits that were shifted off: The | _last_ bit shifted off is the most-significant bit of the 'extra' field, and | the other 63 bits of the 'extra' field are all zero if and only if _all_but_ | _the_last_ bits shifted off were all zero. | (This function makes more sense if 'a' and 'extra' are considered to form | an unsigned fixed-point number with binary point between 'a' and 'extra'. | This fixed-point value is shifted right by the number of bits given in | 'dist', and the integer part of this shifted value is returned in the 'v' | field of the result. The fractional part of the shifted value is modified | as described above and returned in the 'extra' field of the result.) *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (4 <= INLINE_LEVEL) INLINE struct uint64_extra softfloat_shiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast32_t dist ) { struct uint64_extra z; if ( dist < 64 ) { z.v = a>>dist; z.extra = a<<(-dist & 63); } else { z.v = 0; z.extra = (dist == 64) ? a : (a != 0); } z.extra |= (extra != 0); return z; } #else FEXCORE_PRESERVE_ALL_ATTR struct uint64_extra softfloat_shiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast32_t dist ); #endif #endif #ifndef softfloat_shiftRightJam128 /*---------------------------------------------------------------------------- | Shifts the 128 bits formed by concatenating 'a64' and 'a0' right by the | number of bits given in 'dist', which must not be zero. If any nonzero bits | are shifted off, they are "jammed" into the least-significant bit of the | shifted value by setting the least-significant bit to 1. This shifted-and- | jammed value is returned. | The value of 'dist' can be arbitrarily large. In particular, if 'dist' is | greater than 128, the result will be either 0 or 1, depending on whether the | original 128 bits are all zeros. 
*----------------------------------------------------------------------------*/ struct uint128 softfloat_shiftRightJam128( uint64_t a64, uint64_t a0, uint_fast32_t dist ); #endif #ifndef softfloat_shiftRightJam128Extra /*---------------------------------------------------------------------------- | Shifts the 192 bits formed by concatenating 'a64', 'a0', and 'extra' right | by 64 _plus_ the number of bits given in 'dist', which must not be zero. | This shifted value is at most 128 nonzero bits and is returned in the 'v' | field of the 'struct uint128_extra' result. The 64-bit 'extra' field of the | result contains a value formed as follows from the bits that were shifted | off: The _last_ bit shifted off is the most-significant bit of the 'extra' | field, and the other 63 bits of the 'extra' field are all zero if and only | if _all_but_the_last_ bits shifted off were all zero. | (This function makes more sense if 'a64', 'a0', and 'extra' are considered | to form an unsigned fixed-point number with binary point between 'a0' and | 'extra'. This fixed-point value is shifted right by the number of bits | given in 'dist', and the integer part of this shifted value is returned | in the 'v' field of the result. The fractional part of the shifted value | is modified as described above and returned in the 'extra' field of the | result.) *----------------------------------------------------------------------------*/ struct uint128_extra softfloat_shiftRightJam128Extra( uint64_t a64, uint64_t a0, uint64_t extra, uint_fast32_t dist ); #endif #ifndef softfloat_shiftRightJam256M /*---------------------------------------------------------------------------- | Shifts the 256-bit unsigned integer pointed to by 'aPtr' right by the number | of bits given in 'dist', which must not be zero. If any nonzero bits are | shifted off, they are "jammed" into the least-significant bit of the shifted | value by setting the least-significant bit to 1. 
This shifted-and-jammed | value is stored at the location pointed to by 'zPtr'. Each of 'aPtr' and | 'zPtr' points to an array of four 64-bit elements that concatenate in the | platform's normal endian order to form a 256-bit integer. | The value of 'dist' can be arbitrarily large. In particular, if 'dist' | is greater than 256, the stored result will be either 0 or 1, depending on | whether the original 256 bits are all zeros. *----------------------------------------------------------------------------*/ void softfloat_shiftRightJam256M( const uint64_t *aPtr, uint_fast32_t dist, uint64_t *zPtr ); #endif #ifndef softfloat_add128 /*---------------------------------------------------------------------------- | Returns the sum of the 128-bit integer formed by concatenating 'a64' and | 'a0' and the 128-bit integer formed by concatenating 'b64' and 'b0'. The | addition is modulo 2^128, so any carry out is lost. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 + b0; z.v64 = a64 + b64 + (z.v0 < a0); return z; } #else FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ); #endif #endif #ifndef softfloat_add256M /*---------------------------------------------------------------------------- | Adds the two 256-bit integers pointed to by 'aPtr' and 'bPtr'. The addition | is modulo 2^256, so any carry out is lost. The sum is stored at the | location pointed to by 'zPtr'. Each of 'aPtr', 'bPtr', and 'zPtr' points to | an array of four 64-bit elements that concatenate in the platform's normal | endian order to form a 256-bit integer. 
*----------------------------------------------------------------------------*/ void softfloat_add256M( const uint64_t *aPtr, const uint64_t *bPtr, uint64_t *zPtr ); #endif #ifndef softfloat_sub128 /*---------------------------------------------------------------------------- | Returns the difference of the 128-bit integer formed by concatenating 'a64' | and 'a0' and the 128-bit integer formed by concatenating 'b64' and 'b0'. | The subtraction is modulo 2^128, so any borrow out (carry out) is lost. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE struct uint128 softfloat_sub128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 - b0; z.v64 = a64 - b64; z.v64 -= (a0 < b0); return z; } #else FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_sub128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ); #endif #endif #ifndef softfloat_sub256M /*---------------------------------------------------------------------------- | Subtracts the 256-bit integer pointed to by 'bPtr' from the 256-bit integer | pointed to by 'aPtr'. The addition is modulo 2^256, so any borrow out | (carry out) is lost. The difference is stored at the location pointed to | by 'zPtr'. Each of 'aPtr', 'bPtr', and 'zPtr' points to an array of four | 64-bit elements that concatenate in the platform's normal endian order to | form a 256-bit integer. *----------------------------------------------------------------------------*/ void softfloat_sub256M( const uint64_t *aPtr, const uint64_t *bPtr, uint64_t *zPtr ); #endif #ifndef softfloat_mul64ByShifted32To128 /*---------------------------------------------------------------------------- | Returns the 128-bit product of 'a', 'b', and 2^32. 
*----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (3 <= INLINE_LEVEL) INLINE struct uint128 softfloat_mul64ByShifted32To128( uint64_t a, uint32_t b ) { uint_fast64_t mid; struct uint128 z; mid = (uint_fast64_t) (uint32_t) a * b; z.v0 = mid<<32; z.v64 = (uint_fast64_t) (uint32_t) (a>>32) * b + (mid>>32); return z; } #else FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_mul64ByShifted32To128( uint64_t a, uint32_t b ); #endif #endif #ifndef softfloat_mul64To128 /*---------------------------------------------------------------------------- | Returns the 128-bit product of 'a' and 'b'. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ); #endif #ifndef softfloat_mul128By32 /*---------------------------------------------------------------------------- | Returns the product of the 128-bit integer formed by concatenating 'a64' and | 'a0', multiplied by 'b'. The multiplication is modulo 2^128; any overflow | bits are discarded. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (4 <= INLINE_LEVEL) INLINE struct uint128 softfloat_mul128By32( uint64_t a64, uint64_t a0, uint32_t b ) { struct uint128 z; uint_fast64_t mid; uint_fast32_t carry; z.v0 = a0 * b; mid = (uint_fast64_t) (uint32_t) (a0>>32) * b; carry = (uint32_t) ((uint_fast32_t) (z.v0>>32) - (uint_fast32_t) mid); z.v64 = a64 * b + (uint_fast32_t) ((mid + carry)>>32); return z; } #else struct uint128 softfloat_mul128By32( uint64_t a64, uint64_t a0, uint32_t b ); #endif #endif #ifndef softfloat_mul128To256M /*---------------------------------------------------------------------------- | Multiplies the 128-bit unsigned integer formed by concatenating 'a64' and | 'a0' by the 128-bit unsigned integer formed by concatenating 'b64' and | 'b0'. 
The 256-bit product is stored at the location pointed to by 'zPtr'. | Argument 'zPtr' points to an array of four 64-bit elements that concatenate | in the platform's normal endian order to form a 256-bit integer. *----------------------------------------------------------------------------*/ void softfloat_mul128To256M( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0, uint64_t *zPtr ); #endif #else /*---------------------------------------------------------------------------- | The following functions are needed only when 'SOFTFLOAT_FAST_INT64' is not | defined. *----------------------------------------------------------------------------*/ #ifndef softfloat_compare96M /*---------------------------------------------------------------------------- | Compares the two 96-bit unsigned integers pointed to by 'aPtr' and 'bPtr'. | Returns -1 if the first integer (A) is less than the second (B); returns 0 | if the two integers are equal; and returns +1 if the first integer (A) | is greater than the second (B). (The result is thus the signum of A - B.) | Each of 'aPtr' and 'bPtr' points to an array of three 32-bit elements that | concatenate in the platform's normal endian order to form a 96-bit integer. *----------------------------------------------------------------------------*/ int_fast8_t softfloat_compare96M( const uint32_t *aPtr, const uint32_t *bPtr ); #endif #ifndef softfloat_compare128M /*---------------------------------------------------------------------------- | Compares the two 128-bit unsigned integers pointed to by 'aPtr' and 'bPtr'. | Returns -1 if the first integer (A) is less than the second (B); returns 0 | if the two integers are equal; and returns +1 if the first integer (A) | is greater than the second (B). (The result is thus the signum of A - B.) | Each of 'aPtr' and 'bPtr' points to an array of four 32-bit elements that | concatenate in the platform's normal endian order to form a 128-bit integer. 
*----------------------------------------------------------------------------*/ int_fast8_t softfloat_compare128M( const uint32_t *aPtr, const uint32_t *bPtr ); #endif #ifndef softfloat_shortShiftLeft64To96M /*---------------------------------------------------------------------------- | Extends 'a' to 96 bits and shifts the value left by the number of bits given | in 'dist', which must be in the range 1 to 31. The result is stored at the | location pointed to by 'zPtr'. Argument 'zPtr' points to an array of three | 32-bit elements that concatenate in the platform's normal endian order to | form a 96-bit integer. *----------------------------------------------------------------------------*/ #if defined INLINE_LEVEL && (2 <= INLINE_LEVEL) INLINE void softfloat_shortShiftLeft64To96M( uint64_t a, uint_fast8_t dist, uint32_t *zPtr ) { zPtr[indexWord( 3, 0 )] = (uint32_t) a<>= 32 - dist; zPtr[indexWord( 3, 2 )] = a>>32; zPtr[indexWord( 3, 1 )] = a; } #else void softfloat_shortShiftLeft64To96M( uint64_t a, uint_fast8_t dist, uint32_t *zPtr ); #endif #endif #ifndef softfloat_shortShiftLeftM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' left by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must be in the range 1 to 31. Any nonzero bits shifted off are lost. The | shifted N-bit result is stored at the location pointed to by 'zPtr'. Each | of 'aPtr' and 'zPtr' points to a 'size_words'-long array of 32-bit elements | that concatenate in the platform's normal endian order to form an N-bit | integer. 
*----------------------------------------------------------------------------*/ void softfloat_shortShiftLeftM( uint_fast8_t size_words, const uint32_t *aPtr, uint_fast8_t dist, uint32_t *zPtr ); #endif #ifndef softfloat_shortShiftLeft96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftLeftM' with | 'size_words' = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftLeft96M( aPtr, dist, zPtr ) softfloat_shortShiftLeftM( 3, aPtr, dist, zPtr ) #endif #ifndef softfloat_shortShiftLeft128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftLeftM' with | 'size_words' = 4 (N = 128). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftLeft128M( aPtr, dist, zPtr ) softfloat_shortShiftLeftM( 4, aPtr, dist, zPtr ) #endif #ifndef softfloat_shortShiftLeft160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftLeftM' with | 'size_words' = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftLeft160M( aPtr, dist, zPtr ) softfloat_shortShiftLeftM( 5, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftLeftM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' left by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must not be zero. Any nonzero bits shifted off are lost. The shifted | N-bit result is stored at the location pointed to by 'zPtr'. Each of 'aPtr' | and 'zPtr' points to a 'size_words'-long array of 32-bit elements that | concatenate in the platform's normal endian order to form an N-bit integer. 
| The value of 'dist' can be arbitrarily large. In particular, if 'dist' is | greater than N, the stored result will be 0. *----------------------------------------------------------------------------*/ void softfloat_shiftLeftM( uint_fast8_t size_words, const uint32_t *aPtr, uint32_t dist, uint32_t *zPtr ); #endif #ifndef softfloat_shiftLeft96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftLeftM' with | 'size_words' = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_shiftLeft96M( aPtr, dist, zPtr ) softfloat_shiftLeftM( 3, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftLeft128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftLeftM' with | 'size_words' = 4 (N = 128). *----------------------------------------------------------------------------*/ #define softfloat_shiftLeft128M( aPtr, dist, zPtr ) softfloat_shiftLeftM( 4, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftLeft160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftLeftM' with | 'size_words' = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_shiftLeft160M( aPtr, dist, zPtr ) softfloat_shiftLeftM( 5, aPtr, dist, zPtr ) #endif #ifndef softfloat_shortShiftRightM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' right by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must be in the range 1 to 31. Any nonzero bits shifted off are lost. The | shifted N-bit result is stored at the location pointed to by 'zPtr'. 
Each | of 'aPtr' and 'zPtr' points to a 'size_words'-long array of 32-bit elements | that concatenate in the platform's normal endian order to form an N-bit | integer. *----------------------------------------------------------------------------*/ void softfloat_shortShiftRightM( uint_fast8_t size_words, const uint32_t *aPtr, uint_fast8_t dist, uint32_t *zPtr ); #endif #ifndef softfloat_shortShiftRight128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftRightM' with | 'size_words' = 4 (N = 128). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftRight128M( aPtr, dist, zPtr ) softfloat_shortShiftRightM( 4, aPtr, dist, zPtr ) #endif #ifndef softfloat_shortShiftRight160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftRightM' with | 'size_words' = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftRight160M( aPtr, dist, zPtr ) softfloat_shortShiftRightM( 5, aPtr, dist, zPtr ) #endif #ifndef softfloat_shortShiftRightJamM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' right by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must be in the range 1 to 31. If any nonzero bits are shifted off, they are | "jammed" into the least-significant bit of the shifted value by setting the | least-significant bit to 1. This shifted-and-jammed N-bit result is stored | at the location pointed to by 'zPtr'. Each of 'aPtr' and 'zPtr' points | to a 'size_words'-long array of 32-bit elements that concatenate in the | platform's normal endian order to form an N-bit integer. 
*----------------------------------------------------------------------------*/ void softfloat_shortShiftRightJamM( uint_fast8_t, const uint32_t *, uint_fast8_t, uint32_t * ); #endif #ifndef softfloat_shortShiftRightJam160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shortShiftRightJamM' with | 'size_words' = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_shortShiftRightJam160M( aPtr, dist, zPtr ) softfloat_shortShiftRightJamM( 5, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftRightM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' right by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must not be zero. Any nonzero bits shifted off are lost. The shifted | N-bit result is stored at the location pointed to by 'zPtr'. Each of 'aPtr' | and 'zPtr' points to a 'size_words'-long array of 32-bit elements that | concatenate in the platform's normal endian order to form an N-bit integer. | The value of 'dist' can be arbitrarily large. In particular, if 'dist' is | greater than N, the stored result will be 0. *----------------------------------------------------------------------------*/ void softfloat_shiftRightM( uint_fast8_t size_words, const uint32_t *aPtr, uint32_t dist, uint32_t *zPtr ); #endif #ifndef softfloat_shiftRight96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftRightM' with | 'size_words' = 3 (N = 96). 
*----------------------------------------------------------------------------*/ #define softfloat_shiftRight96M( aPtr, dist, zPtr ) softfloat_shiftRightM( 3, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftRightJamM /*---------------------------------------------------------------------------- | Shifts the N-bit unsigned integer pointed to by 'aPtr' right by the number | of bits given in 'dist', where N = 'size_words' * 32. The value of 'dist' | must not be zero. If any nonzero bits are shifted off, they are "jammed" | into the least-significant bit of the shifted value by setting the least- | significant bit to 1. This shifted-and-jammed N-bit result is stored | at the location pointed to by 'zPtr'. Each of 'aPtr' and 'zPtr' points | to a 'size_words'-long array of 32-bit elements that concatenate in the | platform's normal endian order to form an N-bit integer. | The value of 'dist' can be arbitrarily large. In particular, if 'dist' | is greater than N, the stored result will be either 0 or 1, depending on | whether the original N bits are all zeros. *----------------------------------------------------------------------------*/ void softfloat_shiftRightJamM( uint_fast8_t size_words, const uint32_t *aPtr, uint32_t dist, uint32_t *zPtr ); #endif #ifndef softfloat_shiftRightJam96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftRightJamM' with | 'size_words' = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_shiftRightJam96M( aPtr, dist, zPtr ) softfloat_shiftRightJamM( 3, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftRightJam128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftRightJamM' with | 'size_words' = 4 (N = 128). 
*----------------------------------------------------------------------------*/ #define softfloat_shiftRightJam128M( aPtr, dist, zPtr ) softfloat_shiftRightJamM( 4, aPtr, dist, zPtr ) #endif #ifndef softfloat_shiftRightJam160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_shiftRightJamM' with | 'size_words' = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_shiftRightJam160M( aPtr, dist, zPtr ) softfloat_shiftRightJamM( 5, aPtr, dist, zPtr ) #endif #ifndef softfloat_addM /*---------------------------------------------------------------------------- | Adds the two N-bit integers pointed to by 'aPtr' and 'bPtr', where N = | 'size_words' * 32. The addition is modulo 2^N, so any carry out is lost. | The N-bit sum is stored at the location pointed to by 'zPtr'. Each of | 'aPtr', 'bPtr', and 'zPtr' points to a 'size_words'-long array of 32-bit | elements that concatenate in the platform's normal endian order to form an | N-bit integer. *----------------------------------------------------------------------------*/ void softfloat_addM( uint_fast8_t size_words, const uint32_t *aPtr, const uint32_t *bPtr, uint32_t *zPtr ); #endif #ifndef softfloat_add96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_addM' with 'size_words' | = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_add96M( aPtr, bPtr, zPtr ) softfloat_addM( 3, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_add128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_addM' with 'size_words' | = 4 (N = 128). 
*----------------------------------------------------------------------------*/ #define softfloat_add128M( aPtr, bPtr, zPtr ) softfloat_addM( 4, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_add160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_addM' with 'size_words' | = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_add160M( aPtr, bPtr, zPtr ) softfloat_addM( 5, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_addCarryM /*---------------------------------------------------------------------------- | Adds the two N-bit unsigned integers pointed to by 'aPtr' and 'bPtr', where | N = 'size_words' * 32, plus 'carry', which must be either 0 or 1. The N-bit | sum (modulo 2^N) is stored at the location pointed to by 'zPtr', and any | carry out is returned as the result. Each of 'aPtr', 'bPtr', and 'zPtr' | points to a 'size_words'-long array of 32-bit elements that concatenate in | the platform's normal endian order to form an N-bit integer. *----------------------------------------------------------------------------*/ uint_fast8_t softfloat_addCarryM( uint_fast8_t size_words, const uint32_t *aPtr, const uint32_t *bPtr, uint_fast8_t carry, uint32_t *zPtr ); #endif #ifndef softfloat_addComplCarryM /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_addCarryM', except that | the value of the unsigned integer pointed to by 'bPtr' is bit-wise complemented | before the addition.
*----------------------------------------------------------------------------*/ uint_fast8_t softfloat_addComplCarryM( uint_fast8_t size_words, const uint32_t *aPtr, const uint32_t *bPtr, uint_fast8_t carry, uint32_t *zPtr ); #endif #ifndef softfloat_addComplCarry96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_addComplCarryM' with | 'size_words' = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_addComplCarry96M( aPtr, bPtr, carry, zPtr ) softfloat_addComplCarryM( 3, aPtr, bPtr, carry, zPtr ) #endif #ifndef softfloat_negXM /*---------------------------------------------------------------------------- | Replaces the N-bit unsigned integer pointed to by 'zPtr' by the | 2s-complement of itself, where N = 'size_words' * 32. Argument 'zPtr' | points to a 'size_words'-long array of 32-bit elements that concatenate in | the platform's normal endian order to form an N-bit integer. *----------------------------------------------------------------------------*/ void softfloat_negXM( uint_fast8_t size_words, uint32_t *zPtr ); #endif #ifndef softfloat_negX96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_negXM' with 'size_words' | = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_negX96M( zPtr ) softfloat_negXM( 3, zPtr ) #endif #ifndef softfloat_negX128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_negXM' with 'size_words' | = 4 (N = 128). 
*----------------------------------------------------------------------------*/ #define softfloat_negX128M( zPtr ) softfloat_negXM( 4, zPtr ) #endif #ifndef softfloat_negX160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_negXM' with 'size_words' | = 5 (N = 160). *----------------------------------------------------------------------------*/ #define softfloat_negX160M( zPtr ) softfloat_negXM( 5, zPtr ) #endif #ifndef softfloat_negX256M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_negXM' with 'size_words' | = 8 (N = 256). *----------------------------------------------------------------------------*/ #define softfloat_negX256M( zPtr ) softfloat_negXM( 8, zPtr ) #endif #ifndef softfloat_sub1XM /*---------------------------------------------------------------------------- | Subtracts 1 from the N-bit integer pointed to by 'zPtr', where N = | 'size_words' * 32. The subtraction is modulo 2^N, so any borrow out (carry | out) is lost. Argument 'zPtr' points to a 'size_words'-long array of 32-bit | elements that concatenate in the platform's normal endian order to form an | N-bit integer. *----------------------------------------------------------------------------*/ void softfloat_sub1XM( uint_fast8_t size_words, uint32_t *zPtr ); #endif #ifndef softfloat_sub1X96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_sub1XM' with 'size_words' | = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_sub1X96M( zPtr ) softfloat_sub1XM( 3, zPtr ) #endif #ifndef softfloat_sub1X160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_sub1XM' with 'size_words' | = 5 (N = 160). 
*----------------------------------------------------------------------------*/ #define softfloat_sub1X160M( zPtr ) softfloat_sub1XM( 5, zPtr ) #endif #ifndef softfloat_subM /*---------------------------------------------------------------------------- | Subtracts the two N-bit integers pointed to by 'aPtr' and 'bPtr', where N = | 'size_words' * 32. The subtraction is modulo 2^N, so any borrow out (carry | out) is lost. The N-bit difference is stored at the location pointed to by | 'zPtr'. Each of 'aPtr', 'bPtr', and 'zPtr' points to a 'size_words'-long | array of 32-bit elements that concatenate in the platform's normal endian | order to form an N-bit integer. *----------------------------------------------------------------------------*/ void softfloat_subM( uint_fast8_t size_words, const uint32_t *aPtr, const uint32_t *bPtr, uint32_t *zPtr ); #endif #ifndef softfloat_sub96M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_subM' with 'size_words' | = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_sub96M( aPtr, bPtr, zPtr ) softfloat_subM( 3, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_sub128M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_subM' with 'size_words' | = 4 (N = 128). *----------------------------------------------------------------------------*/ #define softfloat_sub128M( aPtr, bPtr, zPtr ) softfloat_subM( 4, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_sub160M /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_subM' with 'size_words' | = 5 (N = 160). 
*----------------------------------------------------------------------------*/ #define softfloat_sub160M( aPtr, bPtr, zPtr ) softfloat_subM( 5, aPtr, bPtr, zPtr ) #endif #ifndef softfloat_mul64To128M /*---------------------------------------------------------------------------- | Multiplies 'a' and 'b' and stores the 128-bit product at the location | pointed to by 'zPtr'. Argument 'zPtr' points to an array of four 32-bit | elements that concatenate in the platform's normal endian order to form a | 128-bit integer. *----------------------------------------------------------------------------*/ void softfloat_mul64To128M( uint64_t a, uint64_t b, uint32_t *zPtr ); #endif #ifndef softfloat_mul128MTo256M /*---------------------------------------------------------------------------- | Multiplies the two 128-bit unsigned integers pointed to by 'aPtr' and | 'bPtr', and stores the 256-bit product at the location pointed to by 'zPtr'. | Each of 'aPtr' and 'bPtr' points to an array of four 32-bit elements that | concatenate in the platform's normal endian order to form a 128-bit integer. | Argument 'zPtr' points to an array of eight 32-bit elements that concatenate | to form a 256-bit integer. *----------------------------------------------------------------------------*/ void softfloat_mul128MTo256M( const uint32_t *aPtr, const uint32_t *bPtr, uint32_t *zPtr ); #endif #ifndef softfloat_remStepMBy32 /*---------------------------------------------------------------------------- | Performs a "remainder reduction step" as follows: Arguments 'remPtr' and | 'bPtr' both point to N-bit unsigned integers, where N = 'size_words' * 32. | Defining R and B as the values of those integers, the expression (R<<'dist') | - B * q is computed modulo 2^N, and the N-bit result is stored at the | location pointed to by 'zPtr'. 
Each of 'remPtr', 'bPtr', and 'zPtr' points | to a 'size_words'-long array of 32-bit elements that concatenate in the | platform's normal endian order to form an N-bit integer. *----------------------------------------------------------------------------*/ void softfloat_remStepMBy32( uint_fast8_t size_words, const uint32_t *remPtr, uint_fast8_t dist, const uint32_t *bPtr, uint32_t q, uint32_t *zPtr ); #endif #ifndef softfloat_remStep96MBy32 /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_remStepMBy32' with | 'size_words' = 3 (N = 96). *----------------------------------------------------------------------------*/ #define softfloat_remStep96MBy32( remPtr, dist, bPtr, q, zPtr ) softfloat_remStepMBy32( 3, remPtr, dist, bPtr, q, zPtr ) #endif #ifndef softfloat_remStep128MBy32 /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_remStepMBy32' with | 'size_words' = 4 (N = 128). *----------------------------------------------------------------------------*/ #define softfloat_remStep128MBy32( remPtr, dist, bPtr, q, zPtr ) softfloat_remStepMBy32( 4, remPtr, dist, bPtr, q, zPtr ) #endif #ifndef softfloat_remStep160MBy32 /*---------------------------------------------------------------------------- | This function or macro is the same as 'softfloat_remStepMBy32' with | 'size_words' = 5 (N = 160). 
*----------------------------------------------------------------------------*/ #define softfloat_remStep160MBy32( remPtr, dist, bPtr, q, zPtr ) softfloat_remStepMBy32( 5, remPtr, dist, bPtr, q, zPtr ) #endif #endif #endif ================================================ FILE: External/SoftFloat-3e/src/s_add128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_add128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 + b0; z.v64 = a64 + b64 + (z.v0 < a0); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_addMagsExtF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" extFloat80_t softfloat_addMagsExtF80( struct softfloat_state *state, uint_fast16_t uiA64, uint_fast64_t uiA0, uint_fast16_t uiB64, uint_fast64_t uiB0, bool signZ ) { int_fast32_t expA; uint_fast64_t sigA; int_fast32_t expB; uint_fast64_t sigB; int_fast32_t expDiff; uint_fast16_t uiZ64; uint_fast64_t uiZ0, sigZ, sigZExtra; struct exp32_sig64 normExpSig; int_fast32_t expZ; struct uint64_extra sig64Extra; struct uint128 uiZ; union { struct extFloat80M s; extFloat80_t f; } uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expA = expExtF80UI64( uiA64 ); sigA = uiA0; expB = expExtF80UI64( uiB64 ); sigB = uiB0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expDiff = expA - expB; if ( ! 
expDiff ) { if ( expA == 0x7FFF ) { if ( (sigA | sigB) & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) { goto propagateNaN; } uiZ64 = uiA64; uiZ0 = uiA0; goto uiZ; } sigZ = sigA + sigB; sigZExtra = 0; if ( ! expA ) { normExpSig = softfloat_normSubnormalExtF80Sig( sigZ ); expZ = normExpSig.exp + 1; sigZ = normExpSig.sig; goto roundAndPack; } expZ = expA; goto shiftRight1; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( expDiff < 0 ) { if ( expB == 0x7FFF ) { if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN; uiZ64 = packToExtF80UI64( signZ, 0x7FFF ); uiZ0 = uiB0; goto uiZ; } expZ = expB; if ( ! expA ) { ++expDiff; sigZExtra = 0; if ( ! expDiff ) goto newlyAligned; } sig64Extra = softfloat_shiftRightJam64Extra( sigA, 0, -expDiff ); sigA = sig64Extra.v; sigZExtra = sig64Extra.extra; } else { if ( expA == 0x7FFF ) { if ( sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN; uiZ64 = uiA64; uiZ0 = uiA0; goto uiZ; } expZ = expA; if ( ! expB ) { --expDiff; sigZExtra = 0; if ( ! 
expDiff ) goto newlyAligned; } sig64Extra = softfloat_shiftRightJam64Extra( sigB, 0, expDiff ); sigB = sig64Extra.v; sigZExtra = sig64Extra.extra; } newlyAligned: sigZ = sigA + sigB; if ( sigZ & UINT64_C( 0x8000000000000000 ) ) goto roundAndPack; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ shiftRight1: sig64Extra = softfloat_shortShiftRightJam64Extra( sigZ, sigZExtra, 1 ); sigZ = sig64Extra.v | UINT64_C( 0x8000000000000000 ); sigZExtra = sig64Extra.extra; ++expZ; roundAndPack: return softfloat_roundPackToExtF80( state, signZ, expZ, sigZ, sigZExtra, state->roundingPrecision ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ propagateNaN: uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, uiB64, uiB0 ); uiZ64 = uiZ.v64; uiZ0 = uiZ.v0; uiZ: uZ.s.signExp = uiZ64; uZ.s.signif = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/s_addMagsF128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" float128_t softfloat_addMagsF128( struct softfloat_state *state, uint_fast64_t uiA64, uint_fast64_t uiA0, uint_fast64_t uiB64, uint_fast64_t uiB0, bool signZ ) { int_fast32_t expA; struct uint128 sigA; int_fast32_t expB; struct uint128 sigB; int_fast32_t expDiff; struct uint128 uiZ, sigZ; int_fast32_t expZ; uint_fast64_t sigZExtra; struct uint128_extra sig128Extra; union ui128_f128 uZ; expA = expF128UI64( uiA64 ); sigA.v64 = fracF128UI64( uiA64 ); sigA.v0 = uiA0; expB = expF128UI64( uiB64 ); sigB.v64 = fracF128UI64( uiB64 ); sigB.v0 = uiB0; expDiff = expA - expB; if ( ! expDiff ) { if ( expA == 0x7FFF ) { if ( sigA.v64 | sigA.v0 | sigB.v64 | sigB.v0 ) goto propagateNaN; uiZ.v64 = uiA64; uiZ.v0 = uiA0; goto uiZ; } sigZ = softfloat_add128( sigA.v64, sigA.v0, sigB.v64, sigB.v0 ); if ( ! 
expA ) { uiZ.v64 = packToF128UI64( signZ, 0, sigZ.v64 ); uiZ.v0 = sigZ.v0; goto uiZ; } expZ = expA; sigZ.v64 |= UINT64_C( 0x0002000000000000 ); sigZExtra = 0; goto shiftRight1; } if ( expDiff < 0 ) { if ( expB == 0x7FFF ) { if ( sigB.v64 | sigB.v0 ) goto propagateNaN; uiZ.v64 = packToF128UI64( signZ, 0x7FFF, 0 ); uiZ.v0 = 0; goto uiZ; } expZ = expB; if ( expA ) { sigA.v64 |= UINT64_C( 0x0001000000000000 ); } else { ++expDiff; sigZExtra = 0; if ( ! expDiff ) goto newlyAligned; } sig128Extra = softfloat_shiftRightJam128Extra( sigA.v64, sigA.v0, 0, -expDiff ); sigA = sig128Extra.v; sigZExtra = sig128Extra.extra; } else { if ( expA == 0x7FFF ) { if ( sigA.v64 | sigA.v0 ) goto propagateNaN; uiZ.v64 = uiA64; uiZ.v0 = uiA0; goto uiZ; } expZ = expA; if ( expB ) { sigB.v64 |= UINT64_C( 0x0001000000000000 ); } else { --expDiff; sigZExtra = 0; if ( ! expDiff ) goto newlyAligned; } sig128Extra = softfloat_shiftRightJam128Extra( sigB.v64, sigB.v0, 0, expDiff ); sigB = sig128Extra.v; sigZExtra = sig128Extra.extra; } newlyAligned: sigZ = softfloat_add128( sigA.v64 | UINT64_C( 0x0001000000000000 ), sigA.v0, sigB.v64, sigB.v0 ); --expZ; if ( sigZ.v64 < UINT64_C( 0x0002000000000000 ) ) goto roundAndPack; ++expZ; shiftRight1: sig128Extra = softfloat_shortShiftRightJam128Extra( sigZ.v64, sigZ.v0, sigZExtra, 1 ); sigZ = sig128Extra.v; sigZExtra = sig128Extra.extra; roundAndPack: return softfloat_roundPackToF128( state, signZ, expZ, sigZ.v64, sigZ.v0, sigZExtra ); propagateNaN: uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, uiB64, uiB0 ); uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/s_approxRecip32_1.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. 
Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #ifndef softfloat_approxRecip32_1 extern const uint16_t softfloat_approxRecip_1k0s[16]; extern const uint16_t softfloat_approxRecip_1k1s[16]; FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_approxRecip32_1( uint32_t a ) { int index; uint16_t eps, r0; uint32_t sigma0; uint_fast32_t r; uint32_t sqrSigma0; index = a>>27 & 0xF; eps = (uint16_t) (a>>11); r0 = softfloat_approxRecip_1k0s[index] - ((softfloat_approxRecip_1k1s[index] * (uint_fast32_t) eps)>>20); sigma0 = ~(uint_fast32_t) ((r0 * (uint_fast64_t) a)>>7); r = ((uint_fast32_t) r0<<16) + ((r0 * (uint_fast64_t) sigma0)>>24); sqrSigma0 = ((uint_fast64_t) sigma0 * sigma0)>>32; r += ((uint32_t) r * (uint_fast64_t) sqrSigma0)>>48; return r; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_approxRecipSqrt32_1.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #ifndef softfloat_approxRecipSqrt32_1 extern const uint16_t softfloat_approxRecipSqrt_1k0s[]; extern const uint16_t softfloat_approxRecipSqrt_1k1s[]; FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_approxRecipSqrt32_1( unsigned int oddExpA, uint32_t a ) { int index; uint16_t eps, r0; uint_fast32_t ESqrR0; uint32_t sigma0; uint_fast32_t r; uint32_t sqrSigma0; index = (a>>27 & 0xE) + oddExpA; eps = (uint16_t) (a>>12); r0 = softfloat_approxRecipSqrt_1k0s[index] - ((softfloat_approxRecipSqrt_1k1s[index] * (uint_fast32_t) eps) >>20); ESqrR0 = (uint_fast32_t) r0 * r0; if ( ! oddExpA ) ESqrR0 <<= 1; sigma0 = ~(uint_fast32_t) (((uint32_t) ESqrR0 * (uint_fast64_t) a)>>23); r = ((uint_fast32_t) r0<<16) + ((r0 * (uint_fast64_t) sigma0)>>25); sqrSigma0 = ((uint_fast64_t) sigma0 * sigma0)>>32; r += ((uint32_t) ((r>>1) + (r>>3) - ((uint_fast32_t) r0<<14)) * (uint_fast64_t) sqrSigma0) >>48; if ( ! 
(r & 0x80000000) ) r = 0x80000000; return r; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_approxRecipSqrt_1Ks.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*----------------------------------------------------------------------------
| Lookup tables read by `softfloat_approxRecipSqrt32_1'.  Both are indexed by
| the same 4-bit value, formed there from three bits of the operand plus the
| odd-exponent flag, so even and odd entries alternate between the two
| exponent parities.
|   `..._1k0s[i]' is the 16-bit base value of the initial estimate.
|   `..._1k1s[i]' is the 16-bit factor multiplied by the operand's
|                 interpolation bits and subtracted from the base value.
*----------------------------------------------------------------------------*/
const uint16_t softfloat_approxRecipSqrt_1k0s[16] = {
    0xB4C9, 0xFFAB, 0xAA7D, 0xF11C, 0xA1C5, 0xE4C7, 0x9A43, 0xDA29,
    0x93B5, 0xD0E5, 0x8DED, 0xC8B7, 0x88C6, 0xC16D, 0x8424, 0xBAE1
};
const uint16_t softfloat_approxRecipSqrt_1k1s[16] = {
    0xA5A5, 0xEA42, 0x8C21, 0xC62D, 0x788F, 0xAA7F, 0x6928, 0x94B6,
    0x5CC7, 0x8335, 0x52A6, 0x74E2, 0x4A3E, 0x68FE, 0x432B, 0x5EFD
};
/*----------------------------------------------------------------------------
| Lookup tables read by `softfloat_approxRecip32_1'.  Both are indexed by the
| same 4-bit value taken from bits 27..30 of the operand.
|   `..._1k0s[i]' is the 16-bit base value of the initial estimate.
|   `..._1k1s[i]' is the 16-bit factor multiplied by the operand's
|                 interpolation bits and subtracted from the base value.
| Both tables decrease monotonically with the index.
*----------------------------------------------------------------------------*/
const uint16_t softfloat_approxRecip_1k0s[16] = {
    0xFFC4, 0xF0BE, 0xE363, 0xD76F, 0xCCAD, 0xC2F0, 0xBA16, 0xB201,
    0xAA97, 0xA3C6, 0x9D7A, 0x97A6, 0x923C, 0x8D32, 0x887E, 0x8417
};
const uint16_t softfloat_approxRecip_1k1s[16] = {
    0xF0F1, 0xD62C, 0xBFA1, 0xAC77, 0x9C0A, 0x8DDB, 0x8185, 0x76BA,
    0x6D3B, 0x64D4, 0x5D5C, 0x56B1, 0x50B6, 0x4B55, 0x4679, 0x4211
};
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitives.h" #include "specialize.h" /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by `aPtr' into an 80-bit extended | floating-point NaN, and returns the bit pattern of this value as an unsigned | integer. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_commonNaNToExtF80UI( const struct commonNaN *aPtr ) { struct uint128 uiZ; uiZ.v64 = (uint_fast16_t) aPtr->sign<<15 | 0x7FFF; uiZ.v0 = UINT64_C( 0xC000000000000000 ) | aPtr->v64>>1; return uiZ; } ================================================ FILE: External/SoftFloat-3e/src/s_commonNaNToF128UI.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. 
Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitives.h" #include "specialize.h" /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by `aPtr' into a 128-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_commonNaNToF128UI( const struct commonNaN *aPtr ) { struct uint128 uiZ; uiZ = softfloat_shortShiftRight128( aPtr->v64, aPtr->v0, 16 ); uiZ.v64 |= (uint_fast64_t) aPtr->sign<<63 | UINT64_C( 0x7FFF800000000000 ); return uiZ; } ================================================ FILE: External/SoftFloat-3e/src/s_commonNaNToF32UI.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "specialize.h" /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by `aPtr' into a 32-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint_fast32_t softfloat_commonNaNToF32UI( const struct commonNaN *aPtr ) { return (uint_fast32_t) aPtr->sign<<31 | 0x7FC00000 | aPtr->v64>>41; } ================================================ FILE: External/SoftFloat-3e/src/s_commonNaNToF64UI.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "specialize.h" /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by `aPtr' into a 64-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint_fast64_t softfloat_commonNaNToF64UI( const struct commonNaN *aPtr ) { return (uint_fast64_t) aPtr->sign<<63 | UINT64_C( 0x7FF8000000000000 ) | aPtr->v64>>12; } ================================================ FILE: External/SoftFloat-3e/src/s_countLeadingZeros32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #ifndef softfloat_countLeadingZeros32 #define softfloat_countLeadingZeros32 softfloat_countLeadingZeros32 #include "primitives.h" FEXCORE_PRESERVE_ALL_ATTR uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ) { uint_fast8_t count; count = 0; if ( a < 0x10000 ) { count = 16; a <<= 16; } if ( a < 0x1000000 ) { count += 8; a <<= 8; } count += softfloat_countLeadingZeros8[a>>24]; return count; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_countLeadingZeros64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #ifndef softfloat_countLeadingZeros64 #define softfloat_countLeadingZeros64 softfloat_countLeadingZeros64 #include "primitives.h" FEXCORE_PRESERVE_ALL_ATTR uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ) { uint_fast8_t count; uint32_t a32; count = 0; a32 = a>>32; if ( ! a32 ) { count = 32; a32 = a; } /*------------------------------------------------------------------------ | From here, result is current count + count leading zeros of `a32'. 
*------------------------------------------------------------------------*/ if ( a32 < 0x10000 ) { count += 16; a32 <<= 16; } if ( a32 < 0x1000000 ) { count += 8; a32 <<= 8; } count += softfloat_countLeadingZeros8[a32>>24]; return count; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_countLeadingZeros8.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
/*----------------------------------------------------------------------------
| 256-entry table: entry `i' is the number of leading zero bits in the 8-bit
| value `i' (8 for 0, 7 for 1, 6 for 2..3, ... , 0 for 128..255).  It is
| indexed with the top byte of a value by `softfloat_countLeadingZeros32' and
| `softfloat_countLeadingZeros64'.  Each row below covers 16 consecutive
| indices.
*----------------------------------------------------------------------------*/
const uint_least8_t softfloat_countLeadingZeros8[256] = {
    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Assuming the unsigned integer formed from concatenating `uiA64' and `uiA0' | has the bit pattern of an 80-bit extended floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_extF80UIToCommonNaN( struct softfloat_state *state, uint_fast16_t uiA64, uint_fast64_t uiA0, struct commonNaN *zPtr ) { if ( softfloat_isSigNaNExtF80UI( uiA64, uiA0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } zPtr->sign = uiA64>>15; zPtr->v64 = uiA0<<1; zPtr->v0 = 0; } ================================================ FILE: External/SoftFloat-3e/src/s_f128UIToCommonNaN.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitives.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Assuming the unsigned integer formed from concatenating `uiA64' and `uiA0' | has the bit pattern of a 128-bit floating-point NaN, converts this NaN to | the common NaN form, and stores the resulting common NaN at the location | pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid exception | is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f128UIToCommonNaN( struct softfloat_state *state, uint_fast64_t uiA64, uint_fast64_t uiA0, struct commonNaN *zPtr ) { struct uint128 NaNSig; if ( softfloat_isSigNaNF128UI( uiA64, uiA0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } NaNSig = softfloat_shortShiftLeft128( uiA64, uiA0, 16 ); zPtr->sign = uiA64>>63; zPtr->v64 = NaNSig.v64; zPtr->v0 = NaNSig.v0; } ================================================ FILE: External/SoftFloat-3e/src/s_f32UIToCommonNaN.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. 
Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Assuming `uiA' has the bit pattern of a 32-bit floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f32UIToCommonNaN( struct softfloat_state *state, uint_fast32_t uiA, struct commonNaN *zPtr ) { if ( softfloat_isSigNaNF32UI( uiA ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } zPtr->sign = uiA>>31; zPtr->v64 = (uint_fast64_t) uiA<<41; zPtr->v0 = 0; } ================================================ FILE: External/SoftFloat-3e/src/s_f64UIToCommonNaN.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Assuming `uiA' has the bit pattern of a 64-bit floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f64UIToCommonNaN( struct softfloat_state *state, uint_fast64_t uiA, struct commonNaN *zPtr ) { if ( softfloat_isSigNaNF64UI( uiA ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); } zPtr->sign = uiA>>63; zPtr->v64 = uiA<<12; zPtr->v0 = 0; } ================================================ FILE: External/SoftFloat-3e/src/s_le128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #ifndef softfloat_le128 FEXCORE_PRESERVE_ALL_ATTR bool softfloat_le128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { return (a64 < b64) || ((a64 == b64) && (a0 <= b0)); } #endif ================================================ FILE: External/SoftFloat-3e/src/s_lt128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #ifndef softfloat_lt128 FEXCORE_PRESERVE_ALL_ATTR bool softfloat_lt128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { return (a64 < b64) || ((a64 == b64) && (a0 < b0)); } #endif ================================================ FILE: External/SoftFloat-3e/src/s_mul64ByShifted32To128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_mul64ByShifted32To128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_mul64ByShifted32To128( uint64_t a, uint32_t b ) { uint_fast64_t mid; struct uint128 z; mid = (uint_fast64_t) (uint32_t) a * b; z.v0 = mid<<32; z.v64 = (uint_fast64_t) (uint32_t) (a>>32) * b + (mid>>32); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_mul64To128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_mul64To128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ) { uint32_t a32, a0, b32, b0; struct uint128 z; uint64_t mid1, mid; a32 = a>>32; a0 = a; b32 = b>>32; b0 = b; z.v0 = (uint_fast64_t) a0 * b0; mid1 = (uint_fast64_t) a32 * b0; mid = mid1 + (uint_fast64_t) a0 * b32; z.v64 = (uint_fast64_t) a32 * b32; z.v64 += (uint_fast64_t) (mid < mid1)<<32 | mid>>32; mid <<= 32; z.v0 += mid; z.v64 += (z.v0 < mid); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_normRoundPackToExtF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
=============================================================================*/

#include <stdbool.h>
#include <stdint.h>
#include "platform.h"
#include "internals.h"

/*----------------------------------------------------------------------------
| Left-justifies the 128-bit significand formed by `sig' and `sigExtra',
| lowers `exp' by the number of bit positions shifted, and passes the
| adjusted values together with `state', `sign' and `roundingPrecision' to
| `softfloat_roundPackToExtF80', whose result is returned unchanged.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t
 softfloat_normRoundPackToExtF80(
     struct softfloat_state *state,
     bool sign,
     int_fast32_t exp,
     uint_fast64_t sig,
     uint_fast64_t sigExtra,
     uint_fast8_t roundingPrecision
 )
{
    int_fast8_t shiftDist;
    struct uint128 sig128;

    /* An all-zero upper word is handled as a whole-word shift of 64 bits. */
    if ( ! sig ) {
        exp -= 64;
        sig = sigExtra;
        sigExtra = 0;
    }
    /* Shift out the remaining leading zeros of the upper word. */
    shiftDist = softfloat_countLeadingZeros64( sig );
    exp -= shiftDist;
    if ( shiftDist ) {
        sig128 = softfloat_shortShiftLeft128( sig, sigExtra, shiftDist );
        sig = sig128.v64;
        sigExtra = sig128.v0;
    }
    return
        softfloat_roundPackToExtF80(
            state, sign, exp, sig, sigExtra, roundingPrecision );

}

================================================
FILE: External/SoftFloat-3e/src/s_normRoundPackToF128.c
================================================
/*============================================================================
This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" float128_t softfloat_normRoundPackToF128( struct softfloat_state *state, bool sign, int_fast32_t exp, uint_fast64_t sig64, uint_fast64_t sig0 ) { int_fast8_t shiftDist; struct uint128 sig128; union ui128_f128 uZ; uint_fast64_t sigExtra; struct uint128_extra sig128Extra; if ( ! sig64 ) { exp -= 64; sig64 = sig0; sig0 = 0; } shiftDist = softfloat_countLeadingZeros64( sig64 ) - 15; exp -= shiftDist; if ( 0 <= shiftDist ) { if ( shiftDist ) { sig128 = softfloat_shortShiftLeft128( sig64, sig0, shiftDist ); sig64 = sig128.v64; sig0 = sig128.v0; } if ( (uint32_t) exp < 0x7FFD ) { uZ.ui.v64 = packToF128UI64( sign, sig64 | sig0 ? exp : 0, sig64 ); uZ.ui.v0 = sig0; return uZ.f; } sigExtra = 0; } else { sig128Extra = softfloat_shortShiftRightJam128Extra( sig64, sig0, 0, -shiftDist ); sig64 = sig128Extra.v.v64; sig0 = sig128Extra.v.v0; sigExtra = sig128Extra.extra; } return softfloat_roundPackToF128( state, sign, exp, sig64, sig0, sigExtra ); } ================================================ FILE: External/SoftFloat-3e/src/s_normSubnormalExtF80Sig.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. 
Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #include "internals.h" FEXCORE_PRESERVE_ALL_ATTR struct exp32_sig64 softfloat_normSubnormalExtF80Sig( uint_fast64_t sig ) { int_fast8_t shiftDist; struct exp32_sig64 z; shiftDist = softfloat_countLeadingZeros64( sig ); z.exp = -shiftDist; z.sig = sig< #include "platform.h" #include "internals.h" FEXCORE_PRESERVE_ALL_ATTR struct exp32_sig128 softfloat_normSubnormalF128Sig( uint_fast64_t sig64, uint_fast64_t sig0 ) { int_fast8_t shiftDist; struct exp32_sig128 z; if ( ! sig64 ) { shiftDist = softfloat_countLeadingZeros64( sig0 ) - 15; z.exp = -63 - shiftDist; if ( shiftDist < 0 ) { z.sig.v64 = sig0>>-shiftDist; z.sig.v0 = sig0<<(shiftDist & 63); } else { z.sig.v64 = sig0< #include "platform.h" #include "internals.h" int softfloat_normSubnormalF128SigM( uint32_t *sigPtr ) { const uint32_t *ptr; int_fast16_t shiftDist; uint32_t wordSig; ptr = sigPtr + indexWordHi( 4 ); shiftDist = 0; for (;;) { wordSig = *ptr; if ( wordSig ) break; shiftDist += 32; if ( 128 <= shiftDist ) return 1; ptr -= wordIncr; } shiftDist += softfloat_countLeadingZeros32( wordSig ) - 15; if ( shiftDist ) softfloat_shiftLeft128M( sigPtr, shiftDist, sigPtr ); return 1 - shiftDist; } ================================================ FILE: External/SoftFloat-3e/src/s_normSubnormalF32Sig.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #include "internals.h" FEXCORE_PRESERVE_ALL_ATTR struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t sig ) { int_fast8_t shiftDist; struct exp16_sig32 z; shiftDist = softfloat_countLeadingZeros32( sig ) - 8; z.exp = 1 - shiftDist; z.sig = sig< #include "platform.h" #include "internals.h" FEXCORE_PRESERVE_ALL_ATTR struct exp16_sig64 softfloat_normSubnormalF64Sig( uint_fast64_t sig ) { int_fast8_t shiftDist; struct exp16_sig64 z; shiftDist = softfloat_countLeadingZeros64( sig ) - 11; z.exp = 1 - shiftDist; z.sig = sig< #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Interpreting the unsigned integer formed from concatenating 'uiA64' and | 'uiA0' as an 80-bit extended floating-point value, and likewise interpreting | the unsigned integer formed from concatenating 'uiB64' and 'uiB0' as another | 80-bit extended floating-point value, and assuming at least on of these | floating-point values is a NaN, returns the bit pattern of the combined NaN | result. If either original floating-point value is a signaling NaN, the | invalid exception is raised. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_propagateNaNExtF80UI( struct softfloat_state *state, uint_fast16_t uiA64, uint_fast64_t uiA0, uint_fast16_t uiB64, uint_fast64_t uiB0 ) { bool isSigNaNA, isSigNaNB; uint_fast64_t uiNonsigA0, uiNonsigB0; uint_fast16_t uiMagA64, uiMagB64; struct uint128 uiZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ isSigNaNA = softfloat_isSigNaNExtF80UI( uiA64, uiA0 ); isSigNaNB = softfloat_isSigNaNExtF80UI( uiB64, uiB0 ); /*------------------------------------------------------------------------ | Make NaNs non-signaling. *------------------------------------------------------------------------*/ uiNonsigA0 = uiA0 | UINT64_C( 0xC000000000000000 ); uiNonsigB0 = uiB0 | UINT64_C( 0xC000000000000000 ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( isSigNaNA | isSigNaNB ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); if ( isSigNaNA ) { if ( isSigNaNB ) goto returnLargerMag; if ( isNaNExtF80UI( uiB64, uiB0 ) ) goto returnB; goto returnA; } else { if ( isNaNExtF80UI( uiA64, uiA0 ) ) goto returnA; goto returnB; } } returnLargerMag: uiMagA64 = uiA64 & 0x7FFF; uiMagB64 = uiB64 & 0x7FFF; if ( uiMagA64 < uiMagB64 ) goto returnB; if ( uiMagB64 < uiMagA64 ) goto returnA; if ( uiA0 < uiB0 ) goto returnB; if ( uiB0 < uiA0 ) goto returnA; if ( uiA64 < uiB64 ) goto returnA; returnB: uiZ.v64 = uiB64; uiZ.v0 = uiNonsigB0; return uiZ; returnA: uiZ.v64 = uiA64; uiZ.v0 = uiNonsigA0; return uiZ; } ================================================ FILE: External/SoftFloat-3e/src/s_propagateNaNF128UI.c ================================================ /*============================================================================ This C source file is part 
of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Interpreting the unsigned integer formed from concatenating `uiA64' and | `uiA0' as a 128-bit floating-point value, and likewise interpreting the | unsigned integer formed from concatenating `uiB64' and `uiB0' as another | 128-bit floating-point value, and assuming at least on of these floating- | point values is a NaN, returns the bit pattern of the combined NaN result. | If either original floating-point value is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ struct uint128 softfloat_propagateNaNF128UI( struct softfloat_state *state, uint_fast64_t uiA64, uint_fast64_t uiA0, uint_fast64_t uiB64, uint_fast64_t uiB0 ) { bool isSigNaNA; struct uint128 uiZ; isSigNaNA = softfloat_isSigNaNF128UI( uiA64, uiA0 ); if ( isSigNaNA || softfloat_isSigNaNF128UI( uiB64, uiB0 ) ) { softfloat_raiseFlags( state, softfloat_flag_invalid ); if ( isSigNaNA ) goto returnNonsigA; } if ( isNaNF128UI( uiA64, uiA0 ) ) { returnNonsigA: uiZ.v64 = uiA64; uiZ.v0 = uiA0; } else { uiZ.v64 = uiB64; uiZ.v0 = uiB0; } uiZ.v64 |= UINT64_C( 0x0000800000000000 ); return uiZ; } ================================================ FILE: External/SoftFloat-3e/src/s_roundPackToExtF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/

#include
#include "platform.h"
#include "internals.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Rounds the value described by `sign', `exp', `sig' and `sigExtra' and packs
| it into an `extFloat80_t'.  `sigExtra' holds the bits below `sig' that are
| only used to decide the rounding.
|
| The rounding mode and the tininess-detection mode are read from `state'.
| `roundingPrecision' selects how many significand bits are kept: 64 and 32
| take the reduced-precision path (the low 11 or 40 bits of `sig' become
| rounding bits), and every other value, including 80, takes the path that
| keeps all 64 bits of `sig'.
|
| Underflow, overflow and inexact conditions are recorded on `state'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
extFloat80_t
 softfloat_roundPackToExtF80(
     struct softfloat_state *state,
     bool sign,
     int_fast32_t exp,
     uint_fast64_t sig,
     uint_fast64_t sigExtra,
     uint_fast8_t roundingPrecision
 )
{
    uint_fast8_t roundingMode;
    bool roundNearEven;
    uint_fast64_t roundIncrement, roundMask, roundBits;
    bool isTiny, doIncrement;
    struct uint64_extra sig64Extra;
    union { struct extFloat80M s; extFloat80_t f; } uZ;

    /*------------------------------------------------------------------------
    | Pick the rounding path.  For the reduced precisions, `roundMask' covers
    | the bits of `sig' that will be discarded and `roundIncrement' is half
    | of the lowest kept bit.
    *------------------------------------------------------------------------*/
    roundingMode = state->roundingMode;
    roundNearEven = (roundingMode == softfloat_round_near_even);
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C( 0x0000000000000400 );
        roundMask = UINT64_C( 0x00000000000007FF );
    } else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C( 0x0000008000000000 );
        roundMask = UINT64_C( 0x000000FFFFFFFFFF );
    } else {
        goto precision80;
    }
    /* Fold any nonzero extra bits into the lowest bit of `sig' (sticky). */
    sig |= (sigExtra != 0);
    /* Directed modes: add the whole mask when rounding away from zero for
       this sign, otherwise add nothing. */
    if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) {
        roundIncrement =
            (roundingMode
                 == (sign ? softfloat_round_min : softfloat_round_max))
                ? roundMask
                : 0;
    }
    roundBits = sig & roundMask;
    /*------------------------------------------------------------------------
    | Reduced precision: the unsigned compare is true when `exp' is <= 0 or
    | >= 0x7FFE, i.e. for the exponents that need special handling.
    *------------------------------------------------------------------------*/
    if ( 0x7FFD <= (uint32_t) (exp - 1) ) {
        if ( exp <= 0 ) {
            /*----------------------------------------------------------------
            | Tininess must be decided before `sig' is shifted right.
            *----------------------------------------------------------------*/
            isTiny =
                   (state->detectTininess
                        == softfloat_tininess_beforeRounding)
                || (exp < 0)
                || (sig <= (uint64_t) (sig + roundIncrement));
            sig = softfloat_shiftRightJam64( sig, 1 - exp );
            roundBits = sig & roundMask;
            if ( roundBits ) {
                if ( isTiny ) softfloat_raiseFlags( state, softfloat_flag_underflow );
                state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
                if ( roundingMode == softfloat_round_odd ) {
                    sig |= roundMask + 1;
                }
#endif
            }
            sig += roundIncrement;
            /* The exponent becomes 1 only if rounding set the top bit. */
            exp = ((sig & UINT64_C( 0x8000000000000000 )) != 0);
            roundIncrement = roundMask + 1;
            /* Exact tie in nearest-even mode: also clear the lowest kept
               bit. */
            if ( roundNearEven && (roundBits<<1 == roundIncrement) ) {
                roundMask |= roundIncrement;
            }
            sig &= ~roundMask;
            goto packReturn;
        }
        /* Too large, or the rounding add would carry out of `sig' at the
           largest exponent. */
        if (
               (0x7FFE < exp)
            || ((exp == 0x7FFE) && ((uint64_t) (sig + roundIncrement) < sig))
        ) {
            goto overflow;
        }
    }
    /*------------------------------------------------------------------------
    | Reduced precision, ordinary exponent range.
    *------------------------------------------------------------------------*/
    if ( roundBits ) {
        state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
        if ( roundingMode == softfloat_round_odd ) {
            sig = (sig & ~roundMask) | (roundMask + 1);
            goto packReturn;
        }
#endif
    }
    sig = (uint64_t) (sig + roundIncrement);
    /* The add wrapped around: renormalise with a larger exponent. */
    if ( sig < roundIncrement ) {
        ++exp;
        sig = UINT64_C( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    if ( roundNearEven && (roundBits<<1 == roundIncrement) ) {
        roundMask |= roundIncrement;
    }
    sig &= ~roundMask;
    goto packReturn;
    /*------------------------------------------------------------------------
    | Full precision: all 64 bits of `sig' are kept and `sigExtra' alone
    | decides whether `sig' is incremented.
    *------------------------------------------------------------------------*/
 precision80:
    doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra);
    if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) {
        doIncrement =
            (roundingMode
                 == (sign ? softfloat_round_min : softfloat_round_max))
                && sigExtra;
    }
    /*------------------------------------------------------------------------
    | Same special-exponent test as on the reduced-precision path.
    *------------------------------------------------------------------------*/
    if ( 0x7FFD <= (uint32_t) (exp - 1) ) {
        if ( exp <= 0 ) {
            /*----------------------------------------------------------------
            | Tininess must be decided before the value is shifted right.
            *----------------------------------------------------------------*/
            isTiny =
                   (state->detectTininess
                        == softfloat_tininess_beforeRounding)
                || (exp < 0)
                || ! doIncrement
                || (sig < UINT64_C( 0xFFFFFFFFFFFFFFFF ));
            sig64Extra =
                softfloat_shiftRightJam64Extra( sig, sigExtra, 1 - exp );
            exp = 0;
            sig = sig64Extra.v;
            sigExtra = sig64Extra.extra;
            if ( sigExtra ) {
                if ( isTiny ) softfloat_raiseFlags( state, softfloat_flag_underflow );
                state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
                if ( roundingMode == softfloat_round_odd ) {
                    sig |= 1;
                    goto packReturn;
                }
#endif
            }
            /* Recompute the increment decision from the shifted extra
               bits. */
            doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra);
            if (
                ! roundNearEven
                    && (roundingMode != softfloat_round_near_maxMag)
            ) {
                doIncrement =
                    (roundingMode
                         == (sign ? softfloat_round_min : softfloat_round_max))
                        && sigExtra;
            }
            if ( doIncrement ) {
                ++sig;
                /* Exact tie in nearest-even mode: clear the low bit. */
                sig &=
                    ~(uint_fast64_t)
                         (! (sigExtra & UINT64_C( 0x7FFFFFFFFFFFFFFF ))
                              & roundNearEven);
                exp = ((sig & UINT64_C( 0x8000000000000000 )) != 0);
            }
            goto packReturn;
        }
        if (
               (0x7FFE < exp)
            || ((exp == 0x7FFE) && (sig == UINT64_C( 0xFFFFFFFFFFFFFFFF ))
                    && doIncrement)
        ) {
            /*----------------------------------------------------------------
            | Overflow.  The reduced-precision path jumps to `overflow' with
            | its own `roundMask'; the full-precision path uses a zero mask.
            | Depending on the rounding mode the result is either exponent
            | 0x7FFF with only the top significand bit set (presumably the
            | infinity encoding) or exponent 0x7FFE with every bit outside
            | `roundMask' set.
            *----------------------------------------------------------------*/
            roundMask = 0;
 overflow:
            softfloat_raiseFlags(
                state, softfloat_flag_overflow | softfloat_flag_inexact );
            if (
                   roundNearEven
                || (roundingMode == softfloat_round_near_maxMag)
                || (roundingMode
                        == (sign ? softfloat_round_min : softfloat_round_max))
            ) {
                exp = 0x7FFF;
                sig = UINT64_C( 0x8000000000000000 );
            } else {
                exp = 0x7FFE;
                sig = ~roundMask;
            }
            goto packReturn;
        }
    }
    /*------------------------------------------------------------------------
    | Full precision, ordinary exponent range.
    *------------------------------------------------------------------------*/
    if ( sigExtra ) {
        state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
        if ( roundingMode == softfloat_round_odd ) {
            sig |= 1;
            goto packReturn;
        }
#endif
    }
    if ( doIncrement ) {
        ++sig;
        if ( ! sig ) {
            /* The increment wrapped to zero: carry into the exponent. */
            ++exp;
            sig = UINT64_C( 0x8000000000000000 );
        } else {
            /* Exact tie in nearest-even mode: clear the low bit. */
            sig &=
                ~(uint_fast64_t)
                     (! (sigExtra & UINT64_C( 0x7FFFFFFFFFFFFFFF ))
                          & roundNearEven);
        }
    }
    /*------------------------------------------------------------------------
    | Assemble the sign/exponent word and the significand.
    *------------------------------------------------------------------------*/
 packReturn:
    uZ.s.signExp = packToExtF80UI64( sign, exp );
    uZ.s.signif = sig;
    return uZ.f;

}

================================================
FILE: External/SoftFloat-3e/src/s_roundPackToF128.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of
California. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions, and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions, and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

 3.
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "softfloat.h" float128_t softfloat_roundPackToF128( struct softfloat_state *state, bool sign, int_fast32_t exp, uint_fast64_t sig64, uint_fast64_t sig0, uint_fast64_t sigExtra ) { uint_fast8_t roundingMode; bool roundNearEven, doIncrement, isTiny; struct uint128_extra sig128Extra; uint_fast64_t uiZ64, uiZ0; struct uint128 sig128; union ui128_f128 uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ roundingMode = state->roundingMode; roundNearEven = (roundingMode == softfloat_round_near_even); doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra); if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) { doIncrement = (roundingMode == (sign ? 
softfloat_round_min : softfloat_round_max)) && sigExtra; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( 0x7FFD <= (uint32_t) exp ) { if ( exp < 0 ) { /*---------------------------------------------------------------- *----------------------------------------------------------------*/ isTiny = (state->detectTininess == softfloat_tininess_beforeRounding) || (exp < -1) || ! doIncrement || softfloat_lt128( sig64, sig0, UINT64_C( 0x0001FFFFFFFFFFFF ), UINT64_C( 0xFFFFFFFFFFFFFFFF ) ); sig128Extra = softfloat_shiftRightJam128Extra( sig64, sig0, sigExtra, -exp ); sig64 = sig128Extra.v.v64; sig0 = sig128Extra.v.v0; sigExtra = sig128Extra.extra; exp = 0; if ( isTiny && sigExtra ) { softfloat_raiseFlags( state, softfloat_flag_underflow ); } doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra); if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) { doIncrement = (roundingMode == (sign ? softfloat_round_min : softfloat_round_max)) && sigExtra; } } else if ( (0x7FFD < exp) || ((exp == 0x7FFD) && softfloat_eq128( sig64, sig0, UINT64_C( 0x0001FFFFFFFFFFFF ), UINT64_C( 0xFFFFFFFFFFFFFFFF ) ) && doIncrement) ) { /*---------------------------------------------------------------- *----------------------------------------------------------------*/ softfloat_raiseFlags( state, softfloat_flag_overflow | softfloat_flag_inexact ); if ( roundNearEven || (roundingMode == softfloat_round_near_maxMag) || (roundingMode == (sign ? 
softfloat_round_min : softfloat_round_max)) ) { uiZ64 = packToF128UI64( sign, 0x7FFF, 0 ); uiZ0 = 0; } else { uiZ64 = packToF128UI64( sign, 0x7FFE, UINT64_C( 0x0000FFFFFFFFFFFF ) ); uiZ0 = UINT64_C( 0xFFFFFFFFFFFFFFFF ); } goto uiZ; } } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( sigExtra ) { state->exceptionFlags |= softfloat_flag_inexact; #ifdef SOFTFLOAT_ROUND_ODD if ( roundingMode == softfloat_round_odd ) { sig0 |= 1; goto packReturn; } #endif } if ( doIncrement ) { sig128 = softfloat_add128( sig64, sig0, 0, 1 ); sig64 = sig128.v64; sig0 = sig128.v0 & ~(uint64_t) (! (sigExtra & UINT64_C( 0x7FFFFFFFFFFFFFFF )) & roundNearEven); } else { if ( ! (sig64 | sig0) ) exp = 0; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiZ64 = packToF128UI64( sign, exp, sig64 ); uiZ0 = sig0; uiZ: uZ.ui.v64 = uiZ64; uZ.ui.v0 = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/s_roundPackToF32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=============================================================================*/

#include
#include "platform.h"
#include "internals.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Rounds the value described by `sign', `exp' and `sig' and packs it into a
| `float32_t'.  The low 7 bits of `sig' are rounding bits: they are examined
| and then shifted out before packing.
|
| The rounding mode and the tininess-detection mode are read from `state'.
| Underflow, overflow and inexact conditions are recorded on `state'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
float32_t
 softfloat_roundPackToF32(
     struct softfloat_state *state,
     bool sign,
     int_fast16_t exp,
     uint_fast32_t sig
 )
{
    uint_fast8_t roundingMode;
    bool roundNearEven;
    uint_fast8_t roundIncrement, roundBits;
    bool isTiny;
    uint_fast32_t uiZ;
    union ui32_f32 uZ;

    /*------------------------------------------------------------------------
    | Choose the amount added before truncation: half of the lowest kept bit
    | for the nearest modes, all seven rounding bits when a directed mode
    | rounds away from zero for this sign, and zero otherwise.
    *------------------------------------------------------------------------*/
    roundingMode = state->roundingMode;
    roundNearEven = (roundingMode == softfloat_round_near_even);
    roundIncrement = 0x40;
    if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) {
        roundIncrement =
            (roundingMode
                 == (sign ? softfloat_round_min : softfloat_round_max))
                ? 0x7F
                : 0;
    }
    roundBits = sig & 0x7F;
    /*------------------------------------------------------------------------
    | The unsigned compare is true when `exp' is negative or >= 0xFD, i.e.
    | for the exponents that need special handling.
    *------------------------------------------------------------------------*/
    if ( 0xFD <= (unsigned int) exp ) {
        if ( exp < 0 ) {
            /*----------------------------------------------------------------
            | Tininess must be decided before `sig' is shifted right.
            *----------------------------------------------------------------*/
            isTiny =
                   (state->detectTininess
                        == softfloat_tininess_beforeRounding)
                || (exp < -1)
                || (sig + roundIncrement < 0x80000000);
            sig = softfloat_shiftRightJam32( sig, -exp );
            exp = 0;
            roundBits = sig & 0x7F;
            if ( isTiny && roundBits ) {
                softfloat_raiseFlags( state, softfloat_flag_underflow );
            }
        } else if ( (0xFD < exp) || (0x80000000 <= sig + roundIncrement) ) {
            /*----------------------------------------------------------------
            | Overflow.  The result is the exponent-0xFF, zero-fraction
            | pattern, minus one when the rounding increment is zero
            | (presumably infinity versus the largest finite value).
            *----------------------------------------------------------------*/
            softfloat_raiseFlags(
                state, softfloat_flag_overflow | softfloat_flag_inexact );
            uiZ = packToF32UI( sign, 0xFF, 0 ) - ! roundIncrement;
            goto uiZ;
        }
    }
    /*------------------------------------------------------------------------
    | Add the increment and drop the 7 rounding bits.
    *------------------------------------------------------------------------*/
    sig = (sig + roundIncrement)>>7;
    if ( roundBits ) {
        state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
        if ( roundingMode == softfloat_round_odd ) {
            sig |= 1;
            goto packReturn;
        }
#endif
    }
    /* Exact tie (rounding bits == 0x40) in nearest-even mode: clear the low
       bit. */
    sig &= ~(uint_fast32_t) (! (roundBits ^ 0x40) & roundNearEven);
    if ( ! sig ) exp = 0;
    /*------------------------------------------------------------------------
    | Pack the result.  The label exists only when the round-odd shortcut
    | that jumps to it is compiled in.
    *------------------------------------------------------------------------*/
#ifdef SOFTFLOAT_ROUND_ODD
 packReturn:
#endif
    uiZ = packToF32UI( sign, exp, sig );
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;

}

================================================
FILE: External/SoftFloat-3e/src/s_roundPackToF64.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2015, 2017 The Regents of the University of
California. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions, and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions, and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

 3. Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=============================================================================*/

#include
#include "platform.h"
#include "internals.h"
#include "softfloat.h"

/*----------------------------------------------------------------------------
| Rounds the value described by `sign', `exp' and `sig' and packs it into a
| `float64_t'.  The low 10 bits of `sig' are rounding bits: they are examined
| and then shifted out before packing.
|
| The rounding mode and the tininess-detection mode are read from `state'.
| Underflow, overflow and inexact conditions are recorded on `state'.
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
float64_t
 softfloat_roundPackToF64(
     struct softfloat_state *state,
     bool sign,
     int_fast16_t exp,
     uint_fast64_t sig
 )
{
    uint_fast8_t roundingMode;
    bool roundNearEven;
    uint_fast16_t roundIncrement, roundBits;
    bool isTiny;
    uint_fast64_t uiZ;
    union ui64_f64 uZ;

    /*------------------------------------------------------------------------
    | Choose the amount added before truncation: half of the lowest kept bit
    | for the nearest modes, all ten rounding bits when a directed mode
    | rounds away from zero for this sign, and zero otherwise.
    *------------------------------------------------------------------------*/
    roundingMode = state->roundingMode;
    roundNearEven = (roundingMode == softfloat_round_near_even);
    roundIncrement = 0x200;
    if ( ! roundNearEven && (roundingMode != softfloat_round_near_maxMag) ) {
        roundIncrement =
            (roundingMode
                 == (sign ? softfloat_round_min : softfloat_round_max))
                ? 0x3FF
                : 0;
    }
    roundBits = sig & 0x3FF;
    /*------------------------------------------------------------------------
    | The unsigned 16-bit compare is true when `exp' is negative or >= 0x7FD,
    | i.e. for the exponents that need special handling.
    *------------------------------------------------------------------------*/
    if ( 0x7FD <= (uint16_t) exp ) {
        if ( exp < 0 ) {
            /*----------------------------------------------------------------
            | Tininess must be decided before `sig' is shifted right.
            *----------------------------------------------------------------*/
            isTiny =
                   (state->detectTininess
                        == softfloat_tininess_beforeRounding)
                || (exp < -1)
                || (sig + roundIncrement < UINT64_C( 0x8000000000000000 ));
            sig = softfloat_shiftRightJam64( sig, -exp );
            exp = 0;
            roundBits = sig & 0x3FF;
            if ( isTiny && roundBits ) {
                softfloat_raiseFlags( state, softfloat_flag_underflow );
            }
        } else if (
               (0x7FD < exp)
            || (UINT64_C( 0x8000000000000000 ) <= sig + roundIncrement)
        ) {
            /*----------------------------------------------------------------
            | Overflow.  The result is the exponent-0x7FF, zero-fraction
            | pattern, minus one when the rounding increment is zero
            | (presumably infinity versus the largest finite value).
            *----------------------------------------------------------------*/
            softfloat_raiseFlags(
                state, softfloat_flag_overflow | softfloat_flag_inexact );
            uiZ = packToF64UI( sign, 0x7FF, 0 ) - ! roundIncrement;
            goto uiZ;
        }
    }
    /*------------------------------------------------------------------------
    | Add the increment and drop the 10 rounding bits.
    *------------------------------------------------------------------------*/
    sig = (sig + roundIncrement)>>10;
    if ( roundBits ) {
        state->exceptionFlags |= softfloat_flag_inexact;
#ifdef SOFTFLOAT_ROUND_ODD
        if ( roundingMode == softfloat_round_odd ) {
            sig |= 1;
            goto packReturn;
        }
#endif
    }
    /* Exact tie (rounding bits == 0x200) in nearest-even mode: clear the low
       bit. */
    sig &= ~(uint_fast64_t) (! (roundBits ^ 0x200) & roundNearEven);
    if ( ! sig ) exp = 0;
    /*------------------------------------------------------------------------
    | Pack the result.  The label exists only when the round-odd shortcut
    | that jumps to it is compiled in.
    *------------------------------------------------------------------------*/
#ifdef SOFTFLOAT_ROUND_ODD
 packReturn:
#endif
    uiZ = packToF64UI( sign, exp, sig );
 uiZ:
    uZ.ui = uiZ;
    return uZ.f;

}

================================================
FILE: External/SoftFloat-3e/src/s_roundToI32.c
================================================
/*============================================================================

This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic
Package, Release 3e, by John R. Hauser.

Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
University of California. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions, and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions, and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

 3. Neither the name of the University nor the names of its contributors may
    be used to endorse or promote products derived from this software without
    specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR int_fast32_t softfloat_roundToI32( struct softfloat_state *state, bool sign, uint_fast64_t sig, uint_fast8_t roundingMode, bool exact ) { uint_fast16_t roundIncrement, roundBits; uint_fast32_t sig32; union { uint32_t ui; int32_t i; } uZ; int_fast32_t z; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ roundIncrement = 0x800; if ( (roundingMode != softfloat_round_near_maxMag) && (roundingMode != softfloat_round_near_even) ) { roundIncrement = 0; if ( sign ? (roundingMode == softfloat_round_min) #ifdef SOFTFLOAT_ROUND_ODD || (roundingMode == softfloat_round_odd) #endif : (roundingMode == softfloat_round_max) ) { roundIncrement = 0xFFF; } } roundBits = sig & 0xFFF; sig += roundIncrement; if ( sig & UINT64_C( 0xFFFFF00000000000 ) ) goto invalid; sig32 = sig>>12; if ( (roundBits == 0x800) && (roundingMode == softfloat_round_near_even) ) { sig32 &= ~(uint_fast32_t) 1; } uZ.ui = sign ? 
-sig32 : sig32; z = uZ.i; if ( z && ((z < 0) ^ sign) ) goto invalid; if ( roundBits ) { #ifdef SOFTFLOAT_ROUND_ODD if ( roundingMode == softfloat_round_odd ) z |= 1; #endif if ( exact ) state->exceptionFlags |= softfloat_flag_inexact; } return z; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ invalid: softfloat_raiseFlags( state, softfloat_flag_invalid ); return sign ? i32_fromNegOverflow : i32_fromPosOverflow; } ================================================ FILE: External/SoftFloat-3e/src/s_roundToI64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR int_fast64_t softfloat_roundToI64( struct softfloat_state *state, bool sign, uint_fast64_t sig, uint_fast64_t sigExtra, uint_fast8_t roundingMode, bool exact ) { union { uint64_t ui; int64_t i; } uZ; int_fast64_t z; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( (roundingMode == softfloat_round_near_maxMag) || (roundingMode == softfloat_round_near_even) ) { if ( UINT64_C( 0x8000000000000000 ) <= sigExtra ) goto increment; } else { if ( sigExtra && (sign ? (roundingMode == softfloat_round_min) #ifdef SOFTFLOAT_ROUND_ODD || (roundingMode == softfloat_round_odd) #endif : (roundingMode == softfloat_round_max)) ) { increment: ++sig; if ( !sig ) goto invalid; if ( (sigExtra == UINT64_C( 0x8000000000000000 )) && (roundingMode == softfloat_round_near_even) ) { sig &= ~(uint_fast64_t) 1; } } } uZ.ui = sign ? 
-sig : sig; z = uZ.i; if ( z && ((z < 0) ^ sign) ) goto invalid; if ( sigExtra ) { #ifdef SOFTFLOAT_ROUND_ODD if ( roundingMode == softfloat_round_odd ) z |= 1; #endif if ( exact ) state->exceptionFlags |= softfloat_flag_inexact; } return z; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ invalid: softfloat_raiseFlags( state, softfloat_flag_invalid ); return sign ? i64_fromNegOverflow : i64_fromPosOverflow; } ================================================ FILE: External/SoftFloat-3e/src/s_roundToUI64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" uint_fast64_t softfloat_roundToUI64( struct softfloat_state *state, bool sign, uint_fast64_t sig, uint_fast64_t sigExtra, uint_fast8_t roundingMode, bool exact ) { /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ if ( (roundingMode == softfloat_round_near_maxMag) || (roundingMode == softfloat_round_near_even) ) { if ( UINT64_C( 0x8000000000000000 ) <= sigExtra ) goto increment; } else { if ( sign ) { if ( !(sig | sigExtra) ) return 0; if ( roundingMode == softfloat_round_min ) goto invalid; #ifdef SOFTFLOAT_ROUND_ODD if ( roundingMode == softfloat_round_odd ) goto invalid; #endif } else { if ( (roundingMode == softfloat_round_max) && sigExtra ) { increment: ++sig; if ( !sig ) goto invalid; if ( (sigExtra == UINT64_C( 0x8000000000000000 )) && (roundingMode == softfloat_round_near_even) ) { sig &= ~(uint_fast64_t) 1; } } } } if ( sign && sig ) goto invalid; if ( sigExtra ) { #ifdef SOFTFLOAT_ROUND_ODD if ( roundingMode == softfloat_round_odd ) sig |= 1; #endif if ( exact ) state->exceptionFlags |= softfloat_flag_inexact; } return sig; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ 
invalid: softfloat_raiseFlags( state, softfloat_flag_invalid ); return sign ? ui64_fromNegOverflow : ui64_fromPosOverflow; } ================================================ FILE: External/SoftFloat-3e/src/s_shiftRightJam128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shiftRightJam128 struct uint128 softfloat_shiftRightJam128( uint64_t a64, uint64_t a0, uint_fast32_t dist ) { uint_fast8_t u8NegDist; struct uint128 z; if ( dist < 64 ) { u8NegDist = -dist; z.v64 = a64>>dist; z.v0 = a64<<(u8NegDist & 63) | a0>>dist | ((uint64_t) (a0<<(u8NegDist & 63)) != 0); } else { z.v64 = 0; z.v0 = (dist < 127) ? a64>>(dist & 63) | (((a64 & (((uint_fast64_t) 1<<(dist & 63)) - 1)) | a0) != 0) : ((a64 | a0) != 0); } return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shiftRightJam128Extra.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shiftRightJam128Extra struct uint128_extra softfloat_shiftRightJam128Extra( uint64_t a64, uint64_t a0, uint64_t extra, uint_fast32_t dist ) { uint_fast8_t u8NegDist; struct uint128_extra z; u8NegDist = -dist; if ( dist < 64 ) { z.v.v64 = a64>>dist; z.v.v0 = a64<<(u8NegDist & 63) | a0>>dist; z.extra = a0<<(u8NegDist & 63); } else { z.v.v64 = 0; if ( dist == 64 ) { z.v.v0 = a64; z.extra = a0; } else { extra |= a0; if ( dist < 128 ) { z.v.v0 = a64>>(dist & 63); z.extra = a64<<(u8NegDist & 63); } else { z.v.v0 = 0; z.extra = (dist == 128) ? a64 : (a64 != 0); } } } z.extra |= (extra != 0); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shiftRightJam32.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #ifndef softfloat_shiftRightJam32 FEXCORE_PRESERVE_ALL_ATTR uint32_t softfloat_shiftRightJam32( uint32_t a, uint_fast16_t dist ) { return (dist < 31) ? a>>dist | ((uint32_t) (a<<(-dist & 31)) != 0) : (a != 0); } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shiftRightJam64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. 
Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #ifndef softfloat_shiftRightJam64 FEXCORE_PRESERVE_ALL_ATTR uint64_t softfloat_shiftRightJam64( uint64_t a, uint_fast32_t dist ) { return (dist < 63) ? 
a>>dist | ((uint64_t) (a<<(-dist & 63)) != 0) : (a != 0); } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shiftRightJam64Extra.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shiftRightJam64Extra FEXCORE_PRESERVE_ALL_ATTR struct uint64_extra softfloat_shiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast32_t dist ) { struct uint64_extra z; if ( dist < 64 ) { z.v = a>>dist; z.extra = a<<(-dist & 63); } else { z.v = 0; z.extra = (dist == 64) ? a : (a != 0); } z.extra |= (extra != 0); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shortShiftLeft128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shortShiftLeft128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_shortShiftLeft128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) { struct uint128 z; z.v64 = a64<>(-dist & 63); z.v0 = a0< #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shortShiftRight128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_shortShiftRight128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) { struct uint128 z; z.v64 = a64>>dist; z.v0 = a64<<(-dist & 63) | a0>>dist; return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_shortShiftRightJam64.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=============================================================================*/ #include #include "platform.h" #ifndef softfloat_shortShiftRightJam64 FEXCORE_PRESERVE_ALL_ATTR uint64_t softfloat_shortShiftRightJam64( uint64_t a, uint_fast8_t dist ) { return a>>dist | ((a & (((uint_fast64_t) 1< #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_shortShiftRightJam64Extra struct uint64_extra softfloat_shortShiftRightJam64Extra( uint64_t a, uint64_t extra, uint_fast8_t dist ) { struct uint64_extra z; z.v = a>>dist; z.extra = a<<(-dist & 63) | (extra != 0); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_sub128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "primitiveTypes.h" #ifndef softfloat_sub128 FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_sub128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 - b0; z.v64 = a64 - b64 - (a0 < b0); return z; } #endif ================================================ FILE: External/SoftFloat-3e/src/s_subMagsExtF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" extFloat80_t softfloat_subMagsExtF80( struct softfloat_state *state, uint_fast16_t uiA64, uint_fast64_t uiA0, uint_fast16_t uiB64, uint_fast64_t uiB0, bool signZ ) { int_fast32_t expA; uint_fast64_t sigA; int_fast32_t expB; uint_fast64_t sigB; int_fast32_t expDiff; uint_fast16_t uiZ64; uint_fast64_t uiZ0; int_fast32_t expZ; uint_fast64_t sigExtra; struct uint128 sig128, uiZ; union { struct extFloat80M s; extFloat80_t f; } uZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expA = expExtF80UI64( uiA64 ); sigA = uiA0; expB = expExtF80UI64( uiB64 ); sigB = uiB0; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expDiff = expA - expB; if ( 0 < expDiff ) goto expABigger; if ( expDiff < 0 ) goto expBBigger; if ( expA == 0x7FFF ) { if ( (sigA | sigB) & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) { goto propagateNaN; } softfloat_raiseFlags( state, softfloat_flag_invalid ); 
uiZ64 = defaultNaNExtF80UI64; uiZ0 = defaultNaNExtF80UI0; goto uiZ; } /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expZ = expA; if ( ! expZ ) expZ = 1; sigExtra = 0; if ( sigB < sigA ) goto aBigger; if ( sigA < sigB ) goto bBigger; uiZ64 = packToExtF80UI64( (state->roundingMode == softfloat_round_min), 0 ); uiZ0 = 0; goto uiZ; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expBBigger: if ( expB == 0x7FFF ) { if ( sigB & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN; uiZ64 = packToExtF80UI64( signZ ^ 1, 0x7FFF ); uiZ0 = UINT64_C( 0x8000000000000000 ); goto uiZ; } if ( ! expA ) { ++expDiff; sigExtra = 0; if ( ! expDiff ) goto newlyAlignedBBigger; } sig128 = softfloat_shiftRightJam128( sigA, 0, -expDiff ); sigA = sig128.v64; sigExtra = sig128.v0; newlyAlignedBBigger: expZ = expB; bBigger: signZ = ! signZ; sig128 = softfloat_sub128( sigB, 0, sigA, sigExtra ); goto normRoundPack; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ expABigger: if ( expA == 0x7FFF ) { if ( sigA & UINT64_C( 0x7FFFFFFFFFFFFFFF ) ) goto propagateNaN; uiZ64 = uiA64; uiZ0 = uiA0; goto uiZ; } if ( ! expB ) { --expDiff; sigExtra = 0; if ( ! 
expDiff ) goto newlyAlignedABigger; } sig128 = softfloat_shiftRightJam128( sigB, 0, expDiff ); sigB = sig128.v64; sigExtra = sig128.v0; newlyAlignedABigger: expZ = expA; aBigger: sig128 = softfloat_sub128( sigA, 0, sigB, sigExtra ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ normRoundPack: return softfloat_normRoundPackToExtF80( state, signZ, expZ, sig128.v64, sig128.v0, state->roundingPrecision ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ propagateNaN: uiZ = softfloat_propagateNaNExtF80UI( state, uiA64, uiA0, uiB64, uiB0 ); uiZ64 = uiZ.v64; uiZ0 = uiZ.v0; uiZ: uZ.s.signExp = uiZ64; uZ.s.signif = uiZ0; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/s_subMagsF128.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "platform.h" #include "internals.h" #include "specialize.h" #include "softfloat.h" float128_t softfloat_subMagsF128( struct softfloat_state *state, uint_fast64_t uiA64, uint_fast64_t uiA0, uint_fast64_t uiB64, uint_fast64_t uiB0, bool signZ ) { int_fast32_t expA; struct uint128 sigA; int_fast32_t expB; struct uint128 sigB, sigZ; int_fast32_t expDiff, expZ; struct uint128 uiZ; union ui128_f128 uZ; expA = expF128UI64( uiA64 ); sigA.v64 = fracF128UI64( uiA64 ); sigA.v0 = uiA0; expB = expF128UI64( uiB64 ); sigB.v64 = fracF128UI64( uiB64 ); sigB.v0 = uiB0; sigA = softfloat_shortShiftLeft128( sigA.v64, sigA.v0, 4 ); sigB = softfloat_shortShiftLeft128( sigB.v64, sigB.v0, 4 ); expDiff = expA - expB; if ( 0 < expDiff ) goto expABigger; if ( expDiff < 0 ) goto expBBigger; if ( expA == 0x7FFF ) { if ( sigA.v64 | sigA.v0 | sigB.v64 | sigB.v0 ) goto propagateNaN; softfloat_raiseFlags( state, softfloat_flag_invalid ); uiZ.v64 = defaultNaNF128UI64; uiZ.v0 = defaultNaNF128UI0; goto uiZ; } expZ = expA; if ( ! 
expZ ) expZ = 1; if ( sigB.v64 < sigA.v64 ) goto aBigger; if ( sigA.v64 < sigB.v64 ) goto bBigger; if ( sigB.v0 < sigA.v0 ) goto aBigger; if ( sigA.v0 < sigB.v0 ) goto bBigger; uiZ.v64 = packToF128UI64( (state->roundingMode == softfloat_round_min), 0, 0 ); uiZ.v0 = 0; goto uiZ; expBBigger: if ( expB == 0x7FFF ) { if ( sigB.v64 | sigB.v0 ) goto propagateNaN; uiZ.v64 = packToF128UI64( signZ ^ 1, 0x7FFF, 0 ); uiZ.v0 = 0; goto uiZ; } if ( expA ) { sigA.v64 |= UINT64_C( 0x0010000000000000 ); } else { ++expDiff; if ( ! expDiff ) goto newlyAlignedBBigger; } sigA = softfloat_shiftRightJam128( sigA.v64, sigA.v0, -expDiff ); newlyAlignedBBigger: expZ = expB; sigB.v64 |= UINT64_C( 0x0010000000000000 ); bBigger: signZ = ! signZ; sigZ = softfloat_sub128( sigB.v64, sigB.v0, sigA.v64, sigA.v0 ); goto normRoundPack; expABigger: if ( expA == 0x7FFF ) { if ( sigA.v64 | sigA.v0 ) goto propagateNaN; uiZ.v64 = uiA64; uiZ.v0 = uiA0; goto uiZ; } if ( expB ) { sigB.v64 |= UINT64_C( 0x0010000000000000 ); } else { --expDiff; if ( ! expDiff ) goto newlyAlignedABigger; } sigB = softfloat_shiftRightJam128( sigB.v64, sigB.v0, expDiff ); newlyAlignedABigger: expZ = expA; sigA.v64 |= UINT64_C( 0x0010000000000000 ); aBigger: sigZ = softfloat_sub128( sigA.v64, sigA.v0, sigB.v64, sigB.v0 ); normRoundPack: return softfloat_normRoundPackToF128( state, signZ, expZ - 5, sigZ.v64, sigZ.v0 ); propagateNaN: uiZ = softfloat_propagateNaNF128UI( state, uiA64, uiA0, uiB64, uiB0 ); uiZ: uZ.ui = uiZ; return uZ.f; } ================================================ FILE: External/SoftFloat-3e/src/softfloat_raiseFlags.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014 The Regents of the University of California. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include "platform.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Raises the exceptions specified by `flags'. Floating-point traps can be | defined here if desired. It is currently not possible for such a trap | to substitute a result value. If traps are not implemented, this routine | should be simply `softfloat_exceptionFlags |= flags;'. 
*----------------------------------------------------------------------------*/
FEXCORE_PRESERVE_ALL_ATTR
void softfloat_raiseFlags( struct softfloat_state *state, uint_fast8_t flags )
{

    /* Accumulate the requested bits into the caller-supplied state; no trap
       is taken and previously raised flags are kept. */
    state->exceptionFlags = state->exceptionFlags | flags;

}

================================================ FILE: External/SoftFloat-3e/src/specialize.h ================================================
/*============================================================================ This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2018 The Regents of the University of California. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #ifndef specialize_h #define specialize_h 1 #include #include #include "primitiveTypes.h" #include "softfloat.h" /*---------------------------------------------------------------------------- | Default value for 'softfloat_detectTininess'. *----------------------------------------------------------------------------*/ #define init_detectTininess softfloat_tininess_afterRounding /*---------------------------------------------------------------------------- | The values to return on conversions to 32-bit integer formats that raise an | invalid exception. *----------------------------------------------------------------------------*/ #define ui32_fromPosOverflow 0xFFFFFFFF #define ui32_fromNegOverflow 0xFFFFFFFF #define ui32_fromNaN 0xFFFFFFFF #define i32_fromPosOverflow (-0x7FFFFFFF - 1) #define i32_fromNegOverflow (-0x7FFFFFFF - 1) #define i32_fromNaN (-0x7FFFFFFF - 1) /*---------------------------------------------------------------------------- | The values to return on conversions to 64-bit integer formats that raise an | invalid exception. 
*----------------------------------------------------------------------------*/ #define ui64_fromPosOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF ) #define ui64_fromNegOverflow UINT64_C( 0xFFFFFFFFFFFFFFFF ) #define ui64_fromNaN UINT64_C( 0xFFFFFFFFFFFFFFFF ) #define i64_fromPosOverflow (-INT64_C( 0x7FFFFFFFFFFFFFFF ) - 1) #define i64_fromNegOverflow (-INT64_C( 0x7FFFFFFFFFFFFFFF ) - 1) #define i64_fromNaN (-INT64_C( 0x7FFFFFFFFFFFFFFF ) - 1) /*---------------------------------------------------------------------------- | "Common NaN" structure, used to transfer NaN representations from one format | to another. | The declaration order of the two 64-bit members 'v0' and 'v64' is selected | by whether 'LITTLEENDIAN' is defined; 'sign' is always declared first. *----------------------------------------------------------------------------*/ struct commonNaN { bool sign; #ifdef LITTLEENDIAN uint64_t v0, v64; #else uint64_t v64, v0; #endif }; /*---------------------------------------------------------------------------- | The bit pattern for a default generated 16-bit floating-point NaN. *----------------------------------------------------------------------------*/ #define defaultNaNF16UI 0xFE00 /*---------------------------------------------------------------------------- | Returns true when 16-bit unsigned integer 'uiA' has the bit pattern of a | 16-bit floating-point signaling NaN. | Note: This macro evaluates its argument more than once. *----------------------------------------------------------------------------*/ #define softfloat_isSigNaNF16UI( uiA ) ((((uiA) & 0x7E00) == 0x7C00) && ((uiA) & 0x01FF)) /*---------------------------------------------------------------------------- | Assuming 'uiA' has the bit pattern of a 16-bit floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by 'zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised.
*----------------------------------------------------------------------------*/ void softfloat_f16UIToCommonNaN( uint_fast16_t uiA, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into a 16-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. *----------------------------------------------------------------------------*/ uint_fast16_t softfloat_commonNaNToF16UI( const struct commonNaN *aPtr ); /*---------------------------------------------------------------------------- | Interpreting 'uiA' and 'uiB' as the bit patterns of two 16-bit floating- | point values, at least one of which is a NaN, returns the bit pattern of | the combined NaN result. If either 'uiA' or 'uiB' has the pattern of a | signaling NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ uint_fast16_t softfloat_propagateNaNF16UI( uint_fast16_t uiA, uint_fast16_t uiB ); /*---------------------------------------------------------------------------- | The bit pattern for a default generated 32-bit floating-point NaN. *----------------------------------------------------------------------------*/ #define defaultNaNF32UI 0xFFC00000 /*---------------------------------------------------------------------------- | Returns true when 32-bit unsigned integer 'uiA' has the bit pattern of a | 32-bit floating-point signaling NaN. | Note: This macro evaluates its argument more than once. 
*----------------------------------------------------------------------------*/ #define softfloat_isSigNaNF32UI( uiA ) ((((uiA) & 0x7FC00000) == 0x7F800000) && ((uiA) & 0x003FFFFF)) /*---------------------------------------------------------------------------- | Assuming 'uiA' has the bit pattern of a 32-bit floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by 'zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f32UIToCommonNaN( struct softfloat_state *, uint_fast32_t uiA, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into a 32-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint_fast32_t softfloat_commonNaNToF32UI( const struct commonNaN *aPtr ); /*---------------------------------------------------------------------------- | Interpreting 'uiA' and 'uiB' as the bit patterns of two 32-bit floating- | point values, at least one of which is a NaN, returns the bit pattern of | the combined NaN result. If either 'uiA' or 'uiB' has the pattern of a | signaling NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ uint_fast32_t softfloat_propagateNaNF32UI( uint_fast32_t uiA, uint_fast32_t uiB ); /*---------------------------------------------------------------------------- | The bit pattern for a default generated 64-bit floating-point NaN. 
*----------------------------------------------------------------------------*/ #define defaultNaNF64UI UINT64_C( 0xFFF8000000000000 ) /*---------------------------------------------------------------------------- | Returns true when 64-bit unsigned integer 'uiA' has the bit pattern of a | 64-bit floating-point signaling NaN. | Note: This macro evaluates its argument more than once. *----------------------------------------------------------------------------*/ #define softfloat_isSigNaNF64UI( uiA ) ((((uiA) & UINT64_C( 0x7FF8000000000000 )) == UINT64_C( 0x7FF0000000000000 )) && ((uiA) & UINT64_C( 0x0007FFFFFFFFFFFF ))) /*---------------------------------------------------------------------------- | Assuming 'uiA' has the bit pattern of a 64-bit floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by 'zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f64UIToCommonNaN( struct softfloat_state *, uint_fast64_t uiA, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into a 64-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR uint_fast64_t softfloat_commonNaNToF64UI( const struct commonNaN *aPtr ); /*---------------------------------------------------------------------------- | Interpreting 'uiA' and 'uiB' as the bit patterns of two 64-bit floating- | point values, at least one of which is a NaN, returns the bit pattern of | the combined NaN result. If either 'uiA' or 'uiB' has the pattern of a | signaling NaN, the invalid exception is raised. 
*----------------------------------------------------------------------------*/ uint_fast64_t softfloat_propagateNaNF64UI( uint_fast64_t uiA, uint_fast64_t uiB ); /*---------------------------------------------------------------------------- | The bit pattern for a default generated 80-bit extended floating-point NaN. *----------------------------------------------------------------------------*/ #define defaultNaNExtF80UI64 0xFFFF #define defaultNaNExtF80UI0 UINT64_C( 0xC000000000000000 ) /*---------------------------------------------------------------------------- | Returns true when the 80-bit unsigned integer formed from concatenating | 16-bit 'uiA64' and 64-bit 'uiA0' has the bit pattern of an 80-bit extended | floating-point signaling NaN. | Note: This macro evaluates its arguments more than once. *----------------------------------------------------------------------------*/ #define softfloat_isSigNaNExtF80UI( uiA64, uiA0 ) ((((uiA64) & 0x7FFF) == 0x7FFF) && ! ((uiA0) & UINT64_C( 0x4000000000000000 )) && ((uiA0) & UINT64_C( 0x3FFFFFFFFFFFFFFF ))) #ifdef SOFTFLOAT_FAST_INT64 /*---------------------------------------------------------------------------- | The following functions are needed only when 'SOFTFLOAT_FAST_INT64' is | defined. *----------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------- | Assuming the unsigned integer formed from concatenating 'uiA64' and 'uiA0' | has the bit pattern of an 80-bit extended floating-point NaN, converts | this NaN to the common NaN form, and stores the resulting common NaN at the | location pointed to by 'zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_extF80UIToCommonNaN( struct softfloat_state *, uint_fast16_t uiA64, uint_fast64_t uiA0, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into an 80-bit extended | floating-point NaN, and returns the bit pattern of this value as an unsigned | integer. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_commonNaNToExtF80UI( const struct commonNaN *aPtr ); /*---------------------------------------------------------------------------- | Interpreting the unsigned integer formed from concatenating 'uiA64' and | 'uiA0' as an 80-bit extended floating-point value, and likewise interpreting | the unsigned integer formed from concatenating 'uiB64' and 'uiB0' as another | 80-bit extended floating-point value, and assuming at least one of these | floating-point values is a NaN, returns the bit pattern of the combined NaN | result. If either original floating-point value is a signaling NaN, the | invalid exception is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_propagateNaNExtF80UI( struct softfloat_state *, uint_fast16_t uiA64, uint_fast64_t uiA0, uint_fast16_t uiB64, uint_fast64_t uiB0 ); /*---------------------------------------------------------------------------- | The bit pattern for a default generated 128-bit floating-point NaN.
*----------------------------------------------------------------------------*/ #define defaultNaNF128UI64 UINT64_C( 0xFFFF800000000000 ) #define defaultNaNF128UI0 UINT64_C( 0 ) /*---------------------------------------------------------------------------- | Returns true when the 128-bit unsigned integer formed from concatenating | 64-bit 'uiA64' and 64-bit 'uiA0' has the bit pattern of a 128-bit floating- | point signaling NaN. | Note: This macro evaluates its arguments more than once. *----------------------------------------------------------------------------*/ #define softfloat_isSigNaNF128UI( uiA64, uiA0 ) ((((uiA64) & UINT64_C( 0x7FFF800000000000 )) == UINT64_C( 0x7FFF000000000000 )) && ((uiA0) || ((uiA64) & UINT64_C( 0x00007FFFFFFFFFFF )))) /*---------------------------------------------------------------------------- | Assuming the unsigned integer formed from concatenating 'uiA64' and 'uiA0' | has the bit pattern of a 128-bit floating-point NaN, converts this NaN to | the common NaN form, and stores the resulting common NaN at the location | pointed to by 'zPtr'. If the NaN is a signaling NaN, the invalid exception | is raised. *----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR void softfloat_f128UIToCommonNaN( struct softfloat_state *, uint_fast64_t uiA64, uint_fast64_t uiA0, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into a 128-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. 
*----------------------------------------------------------------------------*/ FEXCORE_PRESERVE_ALL_ATTR struct uint128 softfloat_commonNaNToF128UI( const struct commonNaN * ); /*---------------------------------------------------------------------------- | Interpreting the unsigned integer formed from concatenating 'uiA64' and | 'uiA0' as a 128-bit floating-point value, and likewise interpreting the | unsigned integer formed from concatenating 'uiB64' and 'uiB0' as another | 128-bit floating-point value, and assuming at least one of these floating- | point values is a NaN, returns the bit pattern of the combined NaN result. | If either original floating-point value is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ struct uint128 softfloat_propagateNaNF128UI( struct softfloat_state *, uint_fast64_t uiA64, uint_fast64_t uiA0, uint_fast64_t uiB64, uint_fast64_t uiB0 ); #else /*---------------------------------------------------------------------------- | The following functions are needed only when 'SOFTFLOAT_FAST_INT64' is not | defined. *----------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------- | Assuming the 80-bit extended floating-point value pointed to by 'aSPtr' is | a NaN, converts this NaN to the common NaN form, and stores the resulting | common NaN at the location pointed to by 'zPtr'. If the NaN is a signaling | NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ void softfloat_extF80MToCommonNaN( const struct extFloat80M *aSPtr, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into an 80-bit extended | floating-point NaN, and stores this NaN at the location pointed to by | 'zSPtr'.
*----------------------------------------------------------------------------*/ void softfloat_commonNaNToExtF80M( const struct commonNaN *aPtr, struct extFloat80M *zSPtr ); /*---------------------------------------------------------------------------- | Assuming at least one of the two 80-bit extended floating-point values | pointed to by 'aSPtr' and 'bSPtr' is a NaN, stores the combined NaN result | at the location pointed to by 'zSPtr'. If either original floating-point | value is a signaling NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ void softfloat_propagateNaNExtF80M( const struct extFloat80M *aSPtr, const struct extFloat80M *bSPtr, struct extFloat80M *zSPtr ); /*---------------------------------------------------------------------------- | The bit pattern for a default generated 128-bit floating-point NaN. *----------------------------------------------------------------------------*/ #define defaultNaNF128UI96 0xFFFF8000 #define defaultNaNF128UI64 0 #define defaultNaNF128UI32 0 #define defaultNaNF128UI0 0 /*---------------------------------------------------------------------------- | Assuming the 128-bit floating-point value pointed to by 'aWPtr' is a NaN, | converts this NaN to the common NaN form, and stores the resulting common | NaN at the location pointed to by 'zPtr'. If the NaN is a signaling NaN, | the invalid exception is raised. Argument 'aWPtr' points to an array of | four 32-bit elements that concatenate in the platform's normal endian order | to form a 128-bit floating-point value. *----------------------------------------------------------------------------*/ void softfloat_f128MToCommonNaN( const uint32_t *aWPtr, struct commonNaN *zPtr ); /*---------------------------------------------------------------------------- | Converts the common NaN pointed to by 'aPtr' into a 128-bit floating-point | NaN, and stores this NaN at the location pointed to by 'zWPtr'. 
Argument | 'zWPtr' points to an array of four 32-bit elements that concatenate in the | platform's normal endian order to form a 128-bit floating-point value. *----------------------------------------------------------------------------*/ void softfloat_commonNaNToF128M( const struct commonNaN *aPtr, uint32_t *zWPtr ); /*---------------------------------------------------------------------------- | Assuming at least one of the two 128-bit floating-point values pointed to by | 'aWPtr' and 'bWPtr' is a NaN, stores the combined NaN result at the location | pointed to by 'zWPtr'. If either original floating-point value is a | signaling NaN, the invalid exception is raised. Each of 'aWPtr', 'bWPtr', | and 'zWPtr' points to an array of four 32-bit elements that concatenate in | the platform's normal endian order to form a 128-bit floating-point value. *----------------------------------------------------------------------------*/ void softfloat_propagateNaNF128M( const uint32_t *aWPtr, const uint32_t *bWPtr, uint32_t *zWPtr ); #endif #endif ================================================ FILE: External/SoftFloat-3e/src/ui64_to_extF80.c ================================================ /*============================================================================ This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic Package, Release 3e, by John R. Hauser. Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of California. All Rights Reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include "platform.h" #include "internals.h" #include "softfloat.h" FEXCORE_PRESERVE_ALL_ATTR extFloat80_t ui64_to_extF80( uint64_t a ) { uint_fast16_t uiZ64; int_fast8_t shiftDist; union { struct extFloat80M s; extFloat80_t f; } uZ; uiZ64 = 0; if ( a ) { shiftDist = softfloat_countLeadingZeros64( a ); uiZ64 = 0x403E - shiftDist; a <<= shiftDist; } uZ.s.signExp = uiZ64; uZ.s.signif = a; return uZ.f; } ================================================ FILE: External/cephes/CMakeLists.txt ================================================ add_library(cephes_128bit STATIC src/128bit/Impl.cpp src/128bit/atanll.c src/128bit/constll.c src/128bit/exp2ll.c src/128bit/floorll.c src/128bit/log2ll.c src/128bit/mtherr.c src/128bit/polevll.c src/128bit/sinll.c src/128bit/tanll.c) # 128-bit library target_link_libraries(cephes_128bit softfloat_3e) target_include_directories(cephes_128bit PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/) target_compile_options(cephes_128bit PRIVATE -fno-builtin) 
================================================ FILE: External/cephes/LICENSE ================================================ The cephes math library is BSD licensed. The source can be accessed from https://www.netlib.org/cephes/ Original license from https://www.netlib.org/cephes/readme : > Some software in this archive may be from the book _Methods and > Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster > International, 1989) or from the Cephes Mathematical Library, a > commercial product. In either event, it is copyrighted by the author. > What you see here may be used freely but it comes with no support or > guarantee. > > The two known misprints in the book are repaired here in the > source listings for the gamma function and the incomplete beta > integral. > > > Stephen L. Moshier > moshier@na-net.ornl.gov The author was e-mailed and they allowed it to be relicensed under BSD. Resources: https://bugs.gentoo.org/687276 https://lists.debian.org/debian-legal/2004/12/msg00295.html https://github.com/deepmind/torch-cephes/blob/master/LICENSE.txt https://github.com/nearform/node-cephes/blob/master/LICENSE E-mail snippit from torch-cephes source: Return-Path: X-Original-To: julien@cornebise.com Delivered-To: julien@cornebise.com Received: from atl4mhob11.myregisteredsite.com (atl4mhob11.myregisteredsite.com [209.17.115.49]) by cornebise.com (Postfix) with ESMTP id D47B139FC0 for ; Fri, 25 Oct 2013 16:32:40 +0200 (CEST) Received: from mailpod1.hostingplatform.com ([10.30.71.116]) by atl4mhob11.myregisteredsite.com (8.14.4/8.14.4) with ESMTP id r9PEWcwQ003543 for ; Fri, 25 Oct 2013 10:32:38 -0400 Received: (qmail 11948 invoked by uid 0); 25 Oct 2013 12:36:20 -0000 X-TCPREMOTEIP: 76.24.25.74 X-Authenticated-UID: steve@moshier.net Received: from unknown (HELO d510.local) (steve@moshier.net@76.24.25.74) by 0 with ESMTPA; 25 Oct 2013 12:36:20 -0000 Date: Fri, 25 Oct 2013 08:36:19 -0400 (EDT) From: Stephen Moshier X-X-Sender: steve@d510 To: Julien 
Cornebise Subject: Re: Cephes: permission to wrap+distribute for Lua In-Reply-To: <52653AD3.1010004@cornebise.com> Message-ID: References: <52653AD3.1010004@cornebise.com> User-Agent: Alpine 2.02 (DEB 1266 2009-07-14) MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII; format=flowed Julien, thank you for writing. BSD license is fine, modification is OK. There are more build scripts available in the web site distributions than there are on the Netlib. I think there is an update to Planck's radiation function that I haven't sent to Netlib yet. But Netlib is a more stable site, so it is better to cite that as a reference. On Mon, 21 Oct 2013, Julien Cornebise wrote: > -----BEGIN PGP SIGNED MESSAGE----- > Hash: SHA1 > > Dear Mr Moshier > > I am a researcher in mathematics and machine learning in London, and > am writing about your awesome Cephes library, whom I found at the > heart of Scipy. > > It is so useful that, with your permission, I would like to wrap it > for Lua and Torch (a machine learning overlay to Lua, specialized in > neural nets, see http://www.torch.ch). I would like to distribute it > as a package for Torch, including your source code along the wrapping > code. > This wouldbe a public package, distributed under BSD License. I have > put a first draft on github: > https://github.com/jucor/torch-cephes > > Hence my three questions, please: > > 1/ How would you like to be acknowledged, beyond the comments that are > already in your code? Do you have any standard header/disclaimer that > I could add to the documentation? > > 2/ At the moment, your code is left untouched. However, if I ever need > to modify bits of the code, what are the conditions/restrictions? > Nothing huge -- I definitely do not want to mess with it: I was > planning to use the natural completion of some functions on the > completed real line (e.g. 
CDF returing 1 when called with "infinity", > or quantiles returning -Infinity when called with 0), either natively > if supported, or by setting a specific flag via mtherr(). > > 3/ I am currently using the source from Netlib. Do you recommend using > the source from your website instead ? > > Thank you very much for your attention, > and, more importantly, for the time and effort your poured into Cephes. > > Best regards, > > Julien Cornebise, Ph.D. > London, UK > http://www.cornebise.com/julien > -----BEGIN PGP SIGNATURE----- > Version: GnuPG v1.4.14 (Darwin) > Comment: GPGTools - http://gpgtools.org > Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/ > > iEYEARECAAYFAlJlOtEACgkQKYR3gC0rw/gIpQCfZKu6+iDh9ghhm6QfsLXnldKN > BuIAn2zZHu1c/IrRAevhjM7N7xGg0LHO > =WeP5 > -----END PGP SIGNATURE----- ================================================ FILE: External/cephes/include/cephes_128bit.h ================================================ #pragma once extern "C" { #include "SoftFloat-3e/platform.h" #include "SoftFloat-3e/softfloat.h" } /* C++ entry points for the 128-bit cephes routines. All arguments and results are SoftFloat float128_t values; atan2l takes its arguments in (y, x) order. */ namespace FEXCore::cephes_128bit { float128_t atan2l(float128_t y, float128_t x); float128_t cosl(float128_t x); float128_t exp2l(float128_t x); float128_t log2l(float128_t x); float128_t sinl(float128_t x); float128_t tanl(float128_t x); } ================================================ FILE: External/cephes/src/128bit/Impl.cpp ================================================ #include "cephes_128bit.h" extern "C" { // cephes_128bit functions float128_t cephes_f128_atan2l(float128_t y, float128_t x); float128_t cephes_f128_cosl(float128_t x); float128_t cephes_f128_exp2l(float128_t x); float128_t cephes_f128_log2l(float128_t x); float128_t cephes_f128_sinl(float128_t x); float128_t cephes_f128_tanl(float128_t x); } /* Each wrapper below passes its arguments unchanged to the extern "C" cephes_f128_* function with the matching suffix and returns that function's result. */ namespace FEXCore::cephes_128bit { float128_t atan2l(float128_t y, float128_t x) { return cephes_f128_atan2l(y, x); } float128_t cosl(float128_t x) { return cephes_f128_cosl(x); } float128_t exp2l(float128_t x)
{ return cephes_f128_exp2l(x); } float128_t log2l(float128_t x) { return cephes_f128_log2l(x); } float128_t sinl(float128_t x) { return cephes_f128_sinl(x); } float128_t tanl(float128_t x) { return cephes_f128_tanl(x); } } ================================================ FILE: External/cephes/src/128bit/atanll.c ================================================ /* atanl.c * * Inverse circular tangent, 128-bit float128_t precision * (arctangent) * * * * SYNOPSIS: * * float128_t x, y, atanl(); * * y = atanl( x ); * * * * DESCRIPTION: * * Returns radian angle between -pi/2 and +pi/2 whose tangent * is x. * * Range reduction is from four intervals into the interval * from zero to tan( pi/8 ). The approximant uses a rational * function of degree 3/4 of the form x + x**3 P(x)/Q(x). * * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE -10, 10 100,000 2.6e-34 6.5e-35 * */ /* atan2l() * * Quadrant correct inverse circular tangent, * float128_t precision * * * * SYNOPSIS: * * float128_t x, y, z, atan2l(); * * z = atan2l( y, x ); * * * * DESCRIPTION: * * Returns radian angle whose tangent is y/x. * Define compile time symbol ANSIC = 1 for ANSI standard, * range -PI < z <= +PI, args (y,x); else ANSIC = 0 for range * 0 to 2PI, args (x,y). * * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE -10, 10 100,000 3.2e-34 5.9e-35 * See atan.c. * */ /* atan.c */ /* Cephes Math Library Release 2.2: December, 1990 Copyright 1984, 1990 by Stephen L.
Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" /* arctan(x) = x + x^3 P(x^2) * Theoretical peak relative error = 3.0e-36 * relative peak error spread = 6.6e-8 */ /* P: nine numerator coefficients, stored as raw float128_t bit patterns with the decimal value noted beside each entry. cephes_f128_atanl evaluates it as cephes_f128_polevll( z, P, 8 ) with z = x*x. */ static float128_t P[9] = { {0xf3f0105b1dae46bfULL, 0xbff45be85838aa26ULL}, // -6.635810778635296712545011270011752799963E-4L, {0x529a2bf25f15874bULL, 0xbffec0f17ae68a18ULL}, // -8.768423468036849091777415076702113400070E-1L, {0x3054a2e7144e265cULL, 0xc00397b0dc1f4d10ULL}, // -2.548067867495502632615671450650071218995E1L, {0x1e19d6b8c5cd9e65ULL, 0xc006f38d4e47779aULL}, // -2.497759878476618348858065206895055957104E2L, {0x69dcb1e41a413bddULL, 0xc0091f0a8586c642ULL}, // -1.148164399808514330375280133523543970854E3L, {0x501d0f5157516744ULL, 0xc00a5d08ba650145ULL}, // -2.792272753241044941703278827346430350236E3L, {0x16f18bf3f5b4b987ULL, 0xc00ace087656cfbeULL}, // -3.696264445691821235400930243493001671932E3L, {0x2966de608cbf9696ULL, 0xc00a3a5a8d629fc7ULL}, // -2.514829758941713674909996882101723647996E3L, {0xeb77db69572ecd22ULL, 0xc0085807a6c98431ULL}, // -6.880597774405940432145577545328795037141E2L }; /* Q: eight denominator coefficients; the leading 1.0 entry is commented out rather than stored. cephes_f128_atanl evaluates it as cephes_f128_p1evll( z, Q, 8 ) with z = x*x. */ static float128_t Q[8] = { /* 1.000000000000000000000000000000000000000E0L, */ {0x0cc994a760137543ULL, 0x40041d4c974b22bcULL}, // 3.566239794444800849656497338030115886153E1L, {0xa5b186c10b6a065eULL, 0x4007aed5b7e20c37ULL}, // 4.308348370818927353321556740027020068897E2L, {0x8711ebf202296129ULL, 0x400a37d5c6fdd0cdULL}, // 2.494680540950601626662048893678584497900E3L, {0x02d59339ee4eee21ULL, 0x400bef892855649eULL}, // 7.928572347062145288093560392463784743935E3L, {0xd9b903b0950fefb3ULL, 0x400cc7c8d1c45b09ULL}, // 1.458510242529987155225086911411015961174E4L, {0x174d6e0dae833752ULL, 0x400ce38f8ba0a897ULL}, // 1.547394317752562611786521896296215170819E4L, {0xfcbdd5dddcf7c68cULL, 0x400c1277f99a3d1aULL}, // 8.782996876218210302516194604424986107121E3L, {0x7099e48f01631a53ULL, 0x400a0205bd172325ULL}, // 2.064179332321782129643673263598686441900E3L }; /* tan( 3*pi/8 ) */ static
float128_t T3P8 = {0x6484597d89b3754bULL, 0x40003504f333f9deULL}; /* tan( pi/8 ) */ static float128_t TP8 = {0x2422cbec4d9baa56ULL, 0x3ffda827999fcef3ULL}; static const float128_t zero = {0, 0}; static const float128_t one = {0, 0x3fff000000000000ULL}; __attribute__((unused)) static const float128_t f_2_p0 = {0x0000000000000000ULL, 0x4000000000000000ULL}; __attribute__((unused)) static const float128_t f_3_p0 = {0x0000000000000000ULL, 0x4000800000000000ULL}; float128_t cephes_f128_atanl(float128_t x) { struct softfloat_state state = {}; float128_t y, z; short sign; /* make argument positive and save the sign */ sign = 1; if( f128_lt(&state, x, zero) ) { sign = -1; x = f128_complement_sign(x); } /* range reduction */ // if( x > T3P8 ) if( f128_lt(&state, T3P8, x) ) { y = F128_PIO2L; x = f128_complement_sign( f128_div(&state, one, x)); } else if( f128_lt(&state, TP8, x) ) { y = F128_PIO4L; x = f128_div(&state, f128_sub(&state, x, one), f128_add(&state, x, one)); } else y = zero; /* rational form in x**2 */ z = f128_mul(&state, x, x); y = f128_add(&state, f128_add(&state, y, f128_mul(&state, f128_mul(&state, f128_div(&state, cephes_f128_polevll( z, P, 8 ), cephes_f128_p1evll( z, Q, 8 ) ), z), x)), x); if( sign < 0 ) y = f128_complement_sign(y); return(y); } /* atan2 */ #if ANSIC float128_t cephes_f128_atan2l( float128_t y, float128_t x ) #else float128_t cephes_f128_atan2l( float128_t x, float128_t y ) #endif { struct softfloat_state state = {}; float128_t z, w; short code; code = 0; w = zero; if( f128_lt(&state, x, zero) ) code = 2; if( f128_lt(&state, y, zero) ) code |= 1; if( f128_eq(&state, x, zero) ) { if( code & 1 ) { #if ANSIC return( f128_complement_sign(F128_PIO2L) ); #else return( f128_mul(&state, f_3_p0, F128_PIO2L) ); #endif } if( f128_eq(&state, y, zero) ) return zero; return( F128_PIO2L ); } if( f128_eq(&state, y, zero) ) { if( code & 2 ) return( F128_PIL ); return zero; } switch( code ) { #if ANSIC case 0: case 1: w = zero; break; case 2: w = F128_PIL; 
break; case 3: w = f128_complement_sign(F128_PIL); break; #else case 0: w = zero; break; case 1: w = f128_mul(&state, f_2_p0, F128_PIL); break; case 2: case 3: w = F128_PIL; break; #endif } z = cephes_f128_atanl( f128_div(&state, y, x) ); return f128_add(&state, w, z ); } ================================================ FILE: External/cephes/src/128bit/constll.c ================================================ #include "mconf.h" /* (1 - 2^-113) 2^16384 */ float128_t F128_MAXNUML = {0xffffffffffffffffULL, 0x7ffeffffffffffffULL}; //1.189731495357231765085759326628007016196469e4932L; /* 2^-113 */ float128_t F128_MACHEPL = {0x0000000000000000ULL, 0x3f8e000000000000ULL}; // 9.629649721936179265279889712924636592690508e-35L; /* (1 + 2^-112) 2^-16382 */ float128_t F128_UFTHRESHL = {0x0000000000000001ULL, 0x0001000000000000ULL}; // 3.362103143112093506262677817321753250115591e-4932L; /* 2^-16494 */ float128_t F128_MINNUML = {0x0000000000000001ULL, 0x0000000000000000ULL}; // 6.475175119438025110924438958227646552499569e-4966L; /* ln(MAXNUM) */ float128_t F128_MAXLOGL = {0xf35793c7673007e6ULL, 0x400c62e42fefa39eULL}; // 1.1356523406294143949491931077970764891253E4L; /* ln(MINNUM) */ float128_t F128_MINLOGL = {0x2c89d24d65e96274ULL, 0xc00c654628220780ULL}; // -1.143276959615573793352782661133116431383730e4L; /* ln(UFTHRESH) */ /* float128_t F128_MINLOGL = -1.135513711193302405887309661372784853802025e4L; */ float128_t F128_PIL = {0x8469898cc51701b8ULL, 0x4000921fb54442d1ULL}; // 3.141592653589793238462643383279502884197169L; float128_t F128_PIO2L = {0x8469898cc51701b8ULL, 0x3fff921fb54442d1ULL}; // 1.570796326794896619231321691639751442098585L; float128_t F128_PIO4L = {0x8469898cc51701b8ULL, 0x3ffe921fb54442d1ULL}; // 0.7853981633974483096156608458198757210492923L; float128_t F128_LOGE2L = {0xf35793c7673007e6ULL, 0x3ffe62e42fefa39eULL}; // 0.6931471805599453094172321214581765680755001L; float128_t F128_LOG2EL = {0xe1777d0ffda0d23aULL, 0x3fff71547652b82fULL}; // 
1.442695040888963407359924681001892137426646L; float128_t F128_INFINITYL = {0x0000000000000000ULL, 0x7fff000000000000ULL}; // 1.0L / 0.0L; ================================================ FILE: External/cephes/src/128bit/exp2ll.c ================================================ /* exp2l.c * * Base 2 exponential function, 128-bit float128_t precision * * * * SYNOPSIS: * * float128_t x, y, exp2l(); * * y = exp2l( x ); * * * * DESCRIPTION: * * Returns 2 raised to the x power. * * Range reduction is accomplished by separating the argument * into an integer k and fraction f such that * x k f * 2 = 2 2. * * A Pade' form * * 1 + 2x P(x**2) / (Q(x**2) - x P(x**2) ) * * approximates 2**x in the basic range [-0.5, 0.5]. * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE +-16300 100,000 2.0e-34 4.8e-35 * * * See exp.c for comments on error amplification. * * * ERROR MESSAGES: * * message condition value returned * exp2l underflow x < -16382 0.0 * exp2l overflow x >= 16384 MAXNUM * */ /* Cephes Math Library Release 2.2: January, 1991 Copyright 1984, 1991 by Stephen L. 
Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" static char fname[] = {"exp2l"}; /* Pade' coefficients for 2^x - 1 Theoretical peak relative error = 1.4e-40, relative peak error spread = 6.8e-14 */ static float128_t P[5] = { {0x3008ca100ca13471ULL, 0x40063d6f2f556577ULL}, // 1.587171580015525194694938306936721666031E2L, {0x9fac10fe43d72769ULL, 0x40122e00e88b4606ULL}, // 6.185032670011643762127954396427045467506E5L, {0x4c22cf0c6c7a8fc7ULL, 0x401c0eb996d98ba4ULL}, // 5.677513871931844661829755443994214173883E8L, {0x4acd9b1339dda08aULL, 0x40241d19e728a6beULL}, // 1.530625323728429161131811299626419117557E11L, {0xae406b996488ba7aULL, 0x402a0840400c1c84ULL}, // 9.079594442980146270952372234833529694788E12L }; static float128_t Q[5] = { /* 1.000000000000000000000000000000000000000E0L, */ {0xcf48c9db239c2189ULL, 0x400c827029417a6aULL}, // 1.236602014442099053716561665053645270207E4L, {0xb20f61f9a3c778b9ULL, 0x40174d9860120d5dULL}, // 2.186249607051644894762167991800811827835E7L, {0x9f361a3e85f209ceULL, 0x4020457bc8296e4eULL}, // 1.092141473886177435056423606755843616331E10L, {0x2dcf78c66f0a65ddULL, 0x40275b0c5bcbd7a7ULL}, // 1.490560994263653042761789432690793026977E12L, {0x4e4a9905cf9c9235ULL, 0x402b7d3bcb89794eULL}, // 2.619817175234089411411070339065679229869E13L }; static const float128_t MAXL2 = {0x0000000000000000ULL, 0x400d000000000000ULL}; static const float128_t MINL2 = {0x0000000000000000ULL, 0xc00cfff000000000ULL}; static const float128_t zero = {0, 0}; static const float128_t f_0_p5 = {0, 0x3ffe000000000000ULL}; static const float128_t one = {0, 0x3fff000000000000ULL}; extern float128_t F128_MAXNUML; float128_t cephes_f128_exp2l(float128_t x) { struct softfloat_state state = {}; float128_t px, xx; int n; if( f128_le(&state, MAXL2, x)) { mtherr( fname, OVERFLOW ); return( F128_MAXNUML ); } if(f128_lt(&state, x, MINL2) ) { mtherr( fname, UNDERFLOW ); return zero; } xx = x; /* save x */ /* separate into integer and 
fractional parts */ px = cephes_f128_floorl(f128_add(&state, x, f_0_p5)); n = f128_to_i32(&state, px, softfloat_round_near_even, true); x = f128_sub(&state, x, px); /* rational approximation * exp2(x) = 1.0 + 2xP(xx)/(Q(xx) - P(xx)) * where xx = x**2 */ xx = f128_mul(&state, x, x); px = f128_mul(&state, x, cephes_f128_polevll( xx, P, 4 )); x = f128_div(&state, px, f128_sub(&state, cephes_f128_p1evll( xx, Q, 5 ), px)); x = f128_add(&state, one, cephes_f128_ldexpl( x, 1 )); /* scale by power of 2 */ x = cephes_f128_ldexpl( x, n ); return(x); } ================================================ FILE: External/cephes/src/128bit/floorll.c ================================================ /* ceill() * floorl() * frexpl() * ldexpl() * fabsl() * signbitl() * isnanl() * isfinitel() * * Floating point numeric utilities * * * * SYNOPSIS: * * float128_t x, y; * float128_t ceill(), floorl(), frexpl(), ldexpl(), fabsl(); * int signbitl(), isnanl(), isfinitel(); * int expnt, n; * * y = floorl(x); * y = ceill(x); * y = frexpl( x, &expnt ); * y = ldexpl( x, n ); * y = fabsl( x ); * * * * DESCRIPTION: * * All four routines return a float128_t precision floating point * result. * * floorl() returns the largest integer less than or equal to x. * It truncates toward minus infinity. * * ceill() returns the smallest integer greater than or equal * to x. It truncates toward plus infinity. * * frexpl() extracts the exponent from x. It returns an integer * power of two to expnt and the significand between 0.5 and 1 * to y. Thus x = y * 2**expn. * * ldexpl() multiplies x by 2**n. * * fabsl() returns the absolute value of its argument. * * signbitl(x) returns 1 if the sign bit of x is 1, else 0. * * These functions are part of the standard C run time library * for some but not all C compilers. The ones supplied are * written in C for IEEE arithmetic. They should * be used only if your compiler library does not already have * them. 
* * The IEEE versions assume that denormal numbers are implemented * in the arithmetic. Some modifications will be required if * the arithmetic has abrupt rather than gradual underflow. */ /* Cephes Math Library Release 2.2: July, 1992 Copyright 1984, 1987, 1988, 1992 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" #define DENORMAL 1 #ifdef UNK char *unkmsg = "ceill(), floorl(), frexpl(), ldexpl() must be rewritten!\n"; #undef UNK #define MIEEE 1 #define EXPOFS 0 #endif #ifdef IBMPC #define NBITS 113 #define EXPOFS 7 #endif #ifdef MIEEE #define NBITS 113 #define EXPOFS 0 #endif extern float128_t F128_MAXNUML; static const float128_t zero = {0, 0}; static const float128_t f_0_p5 = {0, 0x3ffe000000000000ULL}; static const float128_t one = {0, 0x3fff000000000000ULL}; static const float128_t neg_one = {0, 0xbfff000000000000ULL}; static const float128_t f_2_p0 = {0, 0x4000000000000000ULL}; float128_t cephes_f128_fabsl(float128_t x) { struct softfloat_state state = {}; if( f128_lt(&state, x, zero) ) return f128_sub(&state, zero, x ); else return( x ); } float128_t cephes_f128_ceill(float128_t x) { float128_t y; #ifdef UNK mtherr( "ceill", DOMAIN ); return(0.0L); #endif struct softfloat_state state = {}; y = cephes_f128_floorl(x); if( f128_lt(&state, y, x) ) y = f128_add(&state, y, one); return(y); } /* Bit clearing masks: */ static unsigned short bmask[] = { 0xffff, 0xfffe, 0xfffc, 0xfff8, 0xfff0, 0xffe0, 0xffc0, 0xff80, 0xff00, 0xfe00, 0xfc00, 0xf800, 0xf000, 0xe000, 0xc000, 0x8000, 0x0000, }; float128_t cephes_f128_floorl(float128_t x) { union { float128_t y; unsigned short sh[8]; } u; int e, j; #ifdef UNK mtherr( "floor", DOMAIN ); return(0.0L); #endif struct softfloat_state state = {}; u.y = x; /* find the exponent (power of 2) */ e = (u.sh[EXPOFS] & 0x7fff) - 0x3fff; if( e < 0 ) { if( f128_lt(&state, u.y, zero) ) return neg_one; else return zero; } #ifdef IBMPC j = 0; #endif #ifdef MIEEE j = 7; #endif e = (NBITS - 
1) - e; /* clean out 16 bits at a time */ while( e >= 16 ) { #ifdef IBMPC u.sh[j++] = 0; #endif #ifdef MIEEE u.sh[j--] = 0; #endif e -= 16; } /* clear the remaining bits */ if( e > 0 ) u.sh[j] &= bmask[e]; if( f128_lt(&state, x, zero) && !f128_eq(&state, u.y, x) ) u.y = f128_sub(&state, u.y, one);; return(u.y); } float128_t cephes_f128_frexpl( float128_t x, int *pw2 ) { union { float128_t y; unsigned short sh[8]; } u; int i, k; struct softfloat_state state = {}; u.y = x; #ifdef UNK mtherr( "frexp", DOMAIN ); return(0.0L); #endif /* find the exponent (power of 2) */ i = u.sh[EXPOFS] & 0x7fff; if( i == 0 ) { if( f128_eq(&state, u.y, zero)) { *pw2 = 0; return zero; } /* Number is denormal or zero */ #if DENORMAL /* Handle denormal number. */ do { u.y = f128_mul(&state, u.y, f_2_p0); i -= 1; k = u.sh[EXPOFS] & 0x7fff; } while( (k == 0) && (i > -115) ); i = i + k; #else *pw2 = 0; return(0.0L); #endif /* DENORMAL */ } *pw2 = i - 0x3ffe; u.sh[EXPOFS] = 0x3ffe; return( u.y ); } float128_t cephes_f128_ldexpl( float128_t x, int pw2 ) { union { float128_t y; unsigned short sh[8]; } u; long e; #ifdef UNK mtherr( "ldexp", DOMAIN ); return zero; #endif struct softfloat_state state = {}; u.y = x; while( (e = (u.sh[EXPOFS] & 0x7fffL)) == 0 ) { #if DENORMAL if( f128_eq(&state, u.y, zero)) { return zero; } /* Input is denormal. 
*/ if( pw2 > 0 ) { u.y = f128_mul(&state, u.y, f_2_p0); pw2 -= 1; } if( pw2 < 0 ) { if( pw2 < -113 ) return zero; u.y = f128_sub(&state, u.y, f_0_p5); pw2 += 1; } if( pw2 == 0 ) return(u.y); #else return zero; #endif } e = e + pw2; /* Handle overflow */ if( e > 0x7ffeL ) { e = u.sh[EXPOFS]; u.y = zero; u.sh[EXPOFS] = e | 0x7fff; return( u.y ); } u.sh[EXPOFS] &= 0x8000; /* Handle denormalized results */ if( e < 1 ) { #if DENORMAL if( e < -113 ) return zero; u.sh[EXPOFS] |= 1; while( e < 1 ) { u.y = f128_sub(&state, u.y, f_0_p5); e += 1; } e = 0; #else return zero; #endif } u.sh[EXPOFS] |= e & 0x7fff; return(u.y); } /* Return 1 if x is a number that is Not a Number, else return 0. */ int cephes_f128_isnanl(float128_t x) { #ifdef NANS union { float128_t d; unsigned short s[8]; unsigned int i[4]; } u; u.d = x; if( sizeof(int) == 4 ) { #ifdef IBMPC if( ((u.s[7] & 0x7fff) == 0x7fff) && ((u.i[3] & 0x7fff) | u.i[2] | u.i[1] | u.i[0])) return 1; #endif #ifdef MIEEE if( ((u.i[0] & 0x7fff0000) == 0x7fff0000) && ((u.i[0] & 0x7fff) | u.i[1] | u.i[2] | u.i[3])) return 1; #endif return(0); } else { /* size int not 4 */ #ifdef IBMPC if( (u.s[7] & 0x7fff) == 0x7fff) { if((u.s[6] & 0x7fff) | u.s[5] | u.s[4] | u.s[3] | u.s[2] | u.s[1] | u.s[0]) return(1); } #endif #ifdef MIEEE if( (u.s[0] & 0x7fff) == 0x7fff) { if((u.s[1] & 0x7fff) | (u.s[2] & 0x7fff) | u.s[3] | u.s[4] | u.s[5] | u.s[6] | u.s[7]) return(1); } #endif return(0); } /* size int not 4 */ #else /* No NANS. */ return(0); #endif } /* Return 1 if x is not infinite and is not a NaN. 
*/ int cephes_f128_isfinitel(float128_t x) { #ifdef INFINITIES union { float128_t d; unsigned short s[8]; unsigned int i[4]; } u; u.d = x; if( sizeof(int) == 4 ) { #ifdef IBMPC if( (u.s[7] & 0x7fff) != 0x7fff) return 1; #endif #ifdef MIEEE if( (u.i[0] & 0x7fff0000) != 0x7fff0000) return 1; #endif return(0); } else { #ifdef IBMPC if( (u.s[7] & 0x7fff) != 0x7fff) return 1; #endif #ifdef MIEEE if( (u.s[0] & 0x7fff) != 0x7fff) return 1; #endif return(0); } #else /* No INFINITY. */ return(1); #endif } /* Return 1 if the sign bit of x is 1, else 0. */ int cephes_f128_signbitl(float128_t x) { union { float128_t d; short s[8]; int i[4]; } u; u.d = x; if( sizeof(int) == 4 ) { #ifdef IBMPC return( u.s[7] < 0 ); #endif #ifdef DEC error no such DEC format #endif #ifdef MIEEE return( u.i[0] < 0 ); #endif } else { #ifdef IBMPC return( u.s[7] < 0 ); #endif #ifdef DEC error no such DEC format #endif #ifdef MIEEE return( u.s[0] < 0 ); #endif } } ================================================ FILE: External/cephes/src/128bit/log2ll.c ================================================ /* cephes_f128_log2l.c * * Base 2 logarithm, float128_t precision * * * * SYNOPSIS: * * float128_t x, y, cephes_f128_log2l(); * * y = cephes_f128_log2l( x ); * * * * DESCRIPTION: * * Returns the base 2 logarithm of x. * * The argument is separated into its exponent and fractional * parts. If the exponent is between -1 and +1, the (natural) * logarithm of the fraction is approximated by * * log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x). * * Otherwise, setting z = 2(x-1)/x+1), * * log(x) = z + z**3 P(z)/Q(z). * * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE 0.5, 2.0 100,000 1.3e-34 4.5e-35 * IEEE exp(+-10000) 100,000 9.6e-35 4.0e-35 * * In the tests over the interval exp(+-10000), the logarithms * of the random arguments were uniformly distributed over * [-10000, +10000]. 
* * ERROR MESSAGES: * * log singularity: x = 0; returns MINLOG * log domain: x < 0; returns MINLOG */ /* Cephes Math Library Release 2.2: January, 1991 Copyright 1984, 1991 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" static char fname[] = {"cephes_f128_log2l"}; /* Coefficients for ln(1+x) = x - x**2/2 + x**3 P(x)/Q(x) * 1/sqrt(2) <= x < sqrt(2) * Theoretical peak relative error = 5.3e-37, * relative peak error spread = 2.3e-14 */ static float128_t P[13] = { {0x95434922008560fcULL, 0x3feb9d04a0d6ed82ULL}, // 1.538612243596254322971797716843006400388E-6L {0x2e9cb5e91a8c2fa0ULL, 0x3ffdffd7e21347ccULL}, // 4.998469661968096229986658302195402690910E-1L {0x674c43ea62a592e7ULL, 0x400373615178fe96ULL}, // 2.321125933898420063925789532045674660756E1L {0xfa539715d5fd0560ULL, 0x40079b73a8639c28ULL}, // 4.114517881637811823002128927449878962058E2L {0x5ec5c60d38b7fa2aULL, 0x400ade1e79b3ae12ULL}, // 3.824952356185897735160588078446136783779E3L {0x6369f0cada64eeecULL, 0x400d4ca24f0550cfULL}, // 2.128857716871515081352991964243375186031E4L {0x115104b644c1f464ULL, 0x400f28a791822d40ULL}, // 7.594356839258970405033155585486712125861E4L {0x95ec43488121aff8ULL, 0x40105f196a49f171ULL}, // 1.797628303815655343403735250238293741397E5L {0xa2484b7171ab5034ULL, 0x401116caba9f2757ULL}, // 2.854829159639697837788887080758954924001E5L {0xe49b2bf8646a8a1eULL, 0x401125a72eb05ba7ULL}, // 3.007007295140399532324943111654767187848E5L {0x17ac5c737d1b8ad4ULL, 0x4010897ca319418dULL}, // 2.014652742082537582487669938141683759923E5L {0x9ff15925da76d408ULL, 0x400f2f8f8bfbf9a1ULL}, // 7.771154681358524243729929227226708890930E4L {0xe740b8544d79077cULL, 0x400c9a7dcad5d0efULL}, // 1.313572404063446165910279910527789794488E4L }; static float128_t Q[12] = { /* 1.000000000000000000000000000000000000000E0L, */ {0x4a2113daac8d7fa5ULL,0x40048322fbda4d3fULL}, // 4.839208193348159620282142911143429644326E1L, {0x9efb2fe2c778f56fULL,0x4008c73f14777e56ULL}, // 
9.104928120962988414618126155557301584078E2L, {0xf23a98d434d3a705ULL,0x400c1dd933ea5565ULL}, // 9.147150349299596453976674231612674085381E3L, {0x4b44059a3b76f461ULL,0x400eb5f4d77aed02ULL}, // 5.605842085972455027590989944010492125825E4L, {0x2962234d48fff0bcULL,0x4010b71bb67f5effULL}, // 2.248234257620569139969141618556349415120E5L, {0xe673c713bcf24ee3ULL,0x40122b6c5ddac3b8ULL}, // 6.132189329546557743179177159925690841200E5L, {0x34d8d36e8de37c71ULL,0x40131ab83fa3b03bULL}, // 1.158019977462989115839826904108208787040E6L, {0x061338bb0e95b314ULL,0x401371d8273f762aULL}, // 1.514882452993549494932585972882995548426E6L, {0xe379b5d8e7071d74ULL,0x401348fbe89d38e2ULL}, // 1.347518538384329112529391120390701166528E6L, {0x412eafafea233277ULL,0x40127bc5211688c1ULL}, // 7.777690340007566932935753241556479363645E5L, {0x16378fd2514ba129ULL,0x40110088814003eaULL}, // 2.626900195321832660448791748036714883242E5L, {0xed708a3f3a1ac5caULL,0x400e33de58205cb3ULL}, // 3.940717212190338497730839731583397586124E4L }; /* Coefficients for log(x) = z + z^3 P(z^2)/Q(z^2), * where z = 2(x-1)/(x+1) * 1/sqrt(2) <= x < sqrt(2) * Theoretical peak relative error = 1.1e-35, * relative peak error spread 1.1e-9 */ static float128_t R[6] = { {0x68479d54e4ced708ULL, 0xbffec40a1c874f5aULL}, // -8.828896441624934385266096344596648080902E-1L, {0x565b5611a30df628ULL, 0x40054247b533971eULL}, // 8.057002716646055371965756206836056074715E1L, {0xb690eddd457e03b0ULL, 0xc009fa1350a9210eULL}, // -2.024301798136027039250415126250455056397E3L, {0xea1230d4dc2a41c8ULL, 0x400d4020cbb3c4edULL}, // 2.048819892795278657810231591630928516206E4L, {0x388e5d3ae806c32aULL, 0xc00f5eac94780e23ULL}, // -8.977257995689735303686582344659576526998E4L, {0x6802a6fb3250b4fdULL, 0x401014fab5e2e8c1ULL}, // 1.418134209872192732479751274970992665513E5L }; static float128_t S[6] = { /* 1.000000000000000000000000000000000000000E0L, */ {0x2575cd7cadd52c63ULL, 0xc005da8b34108b63ULL}, // -1.186359407982897997337150403816839480438E2L, 
{0x9022bf51e9d20aecULL, 0x400af3d0db24df08ULL}, // 3.998526750980007367835804959888064681098E3L, {0xeb27fc1032bb267dULL, 0xc00ec11ad77cc51cULL}, // -5.748542087379434595104154610899551484314E4L, {0xaeec5bd6a5211cbdULL, 0x401186c6f13df72eULL}, // 4.001557694070773974936904547424676279307E5L, {0xee9e91e4b3020178ULL, 0xc013455371e04bc5ULL}, // -1.332535117259762928288745111081235577029E6L, {0x1c03fa78cb791730ULL, 0x40139f7810d45d22ULL}, // 1.701761051846631278975701529965589676574E6L }; /* log2(e) - 1 */ static const float128_t LOG2EA = {0x85ddf43ff68348eaULL, 0x3ffdc551d94ae0bfULL}; static const float128_t SQRTH = {0xc908b2fb1366ea95ULL, 0x3ffe6a09e667f3bcULL}; static const float128_t zero = {0, 0}; static const float128_t f_0_p5 = {0, 0x3ffe000000000000ULL}; static const float128_t one = {0, 0x3fff000000000000ULL}; static const float128_t indeterminate = {0x0000000000000000ULL, 0xc00d000000000000ULL}; float128_t cephes_f128_log2l(float128_t x) { VOLATILE float128_t z; float128_t y; int e; struct softfloat_state state = {}; /* Test for domain */ if( f128_le(&state, x, zero) ) { if( f128_eq(&state, x, zero) ) mtherr( fname, SING ); else mtherr( fname, DOMAIN ); return indeterminate; } /* separate mantissa from exponent */ /* Note, frexp is used so that denormal numbers * will be handled properly. 
*/ x = cephes_f128_frexpl( x, &e ); /* logarithm using log(x) = z + z**3 P(z)/Q(z), * where z = 2(x-1)/x+1) */ if( (e > 2) || (e < -2) ) { if( f128_lt(&state, x, SQRTH) ) { /* 2( 2x-1 )/( 2x+1 ) */ e -= 1; z = f128_sub(&state, x, f_0_p5); y = f128_add(&state, f128_mul(&state, f_0_p5, z), f_0_p5); } else { /* 2 (x-1)/(x+1) */ z = f128_sub(&state, x, f_0_p5); z = f128_sub(&state, z, f_0_p5); y = f128_add(&state, f128_mul(&state, f_0_p5, x), f_0_p5); } x = f128_div(&state, z, y); z = f128_mul(&state, x, x); y = f128_mul(&state, x, f128_div(&state, f128_mul(&state, z, cephes_f128_polevll( z, R, 5 )), cephes_f128_p1evll( z, S, 6 ) )); goto done; } /* logarithm using log(1+x) = x - .5x**2 + x**3 P(x)/Q(x) */ if( f128_lt(&state, x, SQRTH) ) { e -= 1; x = f128_sub(&state, cephes_f128_ldexpl( x, 1 ), one); /* 2x - 1 */ } else { x = f128_sub(&state, x, one); } z = f128_mul(&state, x, x); y = f128_mul(&state, x, f128_div(&state, f128_mul(&state, z, cephes_f128_polevll( x, P, 12 )), cephes_f128_p1evll( x, Q, 12 ))); y = f128_sub(&state, y, cephes_f128_ldexpl( z, -1 )); /* -0.5x^2 + ... */ done: /* Multiply log of fraction by log2(e) * and base 2 exponent by 1 * * ***CAUTION*** * * This sequence of operations is critical and it may * be horribly defeated by some compiler optimizers. */ z = f128_mul(&state, y, LOG2EA); z = f128_add(&state, z, f128_mul(&state, x, LOG2EA)); z = f128_add(&state, z, y); z = f128_add(&state, z, x); z = f128_add(&state, z, i32_to_f128(e)); return( z ); } ================================================ FILE: External/cephes/src/128bit/mconf.h ================================================ /* mconf.h * * Common include file for math routines * * * * SYNOPSIS: * * #include "mconf.h" * * * * DESCRIPTION: * * This file contains definitions for error codes that are * passed to the common error handling routine mtherr() * (which see). 
* * The file also includes a conditional assembly definition * for the type of computer arithmetic (IEEE, DEC, Motorola * IEEE, or UNKnown). * * For Digital Equipment PDP-11 and VAX computers, certain * IBM systems, and others that use numbers with a 56-bit * significand, the symbol DEC should be defined. In this * mode, most floating point constants are given as arrays * of octal integers to eliminate decimal to binary conversion * errors that might be introduced by the compiler. * * For little-endian computers, such as IBM PC, that follow the * IEEE Standard for Binary Floating Point Arithmetic (ANSI/IEEE * Std 754-1985), the symbol IBMPC should be defined. These * numbers have 53-bit significands. In this mode, constants * are provided as arrays of hexadecimal 16 bit integers. * * Big-endian IEEE format is denoted MIEEE. On some RISC * systems such as Sun SPARC, double precision constants * must be stored on 8-byte address boundaries. Since integer * arrays may be aligned differently, the MIEEE configuration * may fail on such machines. * * To accommodate other types of computer arithmetic, all * constants are also provided in a normal decimal radix * which one can hope are correctly converted to a suitable * format by the available C language compiler. To invoke * this mode, define the symbol UNK. * * An important difference among these modes is a predefined * set of machine arithmetic constants for each. The numbers * MACHEP (the machine roundoff error), MAXNUM (largest number * represented), and several other parameters are preset by * the configuration symbol. Check the file const.c to * ensure that these values are correct for your computer. * * Configurations NANS, INFINITIES, MINUSZERO, and DENORMAL * may fail on many systems. Verify that they are supposed * to work on your computer. */ /* Cephes Math Library Release 2.3: June, 1995 Copyright 1984, 1987, 1989, 1995 by Stephen L. 
Moshier
*/

/* Constant definitions for math error conditions */

/* SoftFloat supplies float128_t and the f128_* arithmetic used by every
 * routine in this directory. */
#include "SoftFloat-3e/platform.h"
#include "SoftFloat-3e/softfloat.h"

/* Codes passed as the second argument of mtherr(). */
#define DOMAIN 1 /* argument domain error */
#define SING 2 /* argument singularity */
#define OVERFLOW 3 /* overflow range error */
#define UNDERFLOW 4 /* underflow range error */
#define TLOSS 5 /* total loss of precision */
#define PLOSS 6 /* partial loss of precision */

#define EDOM 33
#define ERANGE 34

/* Complex numeral. */
typedef struct { double r; double i; } cmplx;
typedef struct { float r; float i; } cmplxf;
/* Long double complex numeral. */
typedef struct { float128_t r; float128_t i; } cmplxl;

/* Type of computer arithmetic */

/* PDP-11, Pro350, VAX: */
/* #define DEC 1 */

/* Intel IEEE, low order words come first: */
/* This is the configuration selected here; the bit-manipulating routines
 * key their word indices off it. */
#define IBMPC 1

/* Motorola IEEE, high order words come first
 * (Sun 680x0 workstation):
 */
/* #define MIEEE 1 */

/* UNKnown arithmetic, invokes coefficients given in
 * normal decimal format.  Beware of range boundary
 * problems (MACHEP, MAXLOG, etc. in const.c) and
 * roundoff problems in pow.c:
 * (Sun SPARCstation)
 */
/* #define UNK 1 */

/* If you define UNK, then be sure to set BIGENDIAN properly. */
/* #define BIGENDIAN 1 */

/* Define this `volatile' if your compiler thinks
 * that floating point arithmetic obeys the associative
 * and distributive laws.  It will defeat some optimizations
 * (but probably not enough of them).
 *
 * #define VOLATILE volatile
 */
/* Defined empty here, so declarations written as `VOLATILE T v` are plain. */
#define VOLATILE

/* For 12-byte long doubles on an i386, pad a 16-bit short 0
 * to the end of real constants initialized by integer arrays.
 *
 * #define XPD 0,
 *
 * Otherwise, the type is 10 bytes long and XPD should be
 * defined blank (e.g., Microsoft C).
 *
 * #define XPD
 */
#define XPD 0,

/* Define to support tiny denormal numbers, else undefine. */
#define DENORMAL 1

/* Define to ask for infinity support, else undefine. */
#define INFINITIES 1

/* Define to ask for support of numbers that are Not-a-Number, else undefine.
   This may automatically define INFINITIES in some files. */
#define NANS 1

/* Define to distinguish between -0.0 and +0.0. */
#define MINUSZERO 1

/* Define 1 for ANSI C atan2() function and ANSI prototypes for float arguments.
   See atan.c and clog.c. */
/* With ANSIC non-zero, cephes_f128_atan2l is declared below as (y, x). */
#define ANSIC 1

/* Variable for error reporting.  See mtherr.c. */
extern int merror;

/* Forward declarations */
/* Shared constants. */
extern float128_t F128_MINLOGL;
extern float128_t F128_MAXNUML;
extern float128_t F128_PIL;
extern float128_t F128_PIO2L, F128_PIO4L;

float128_t cephes_f128_atanl(float128_t x);
#if ANSIC
float128_t cephes_f128_atan2l( float128_t y, float128_t x );
#else
float128_t cephes_f128_atan2l( float128_t x, float128_t y );
#endif
float128_t cephes_f128_ceill(float128_t x);
float128_t cephes_f128_cosl(float128_t x);
float128_t cephes_f128_fabsl(float128_t x);
float128_t cephes_f128_floorl(float128_t x);
/* Returns the significand and stores the power of two in *pw2. */
float128_t cephes_f128_frexpl( float128_t x, int *pw2 );
int cephes_f128_isfinitel(float128_t x);
int cephes_f128_isnanl(float128_t x);
/* Returns x scaled by 2**pw2. */
float128_t cephes_f128_ldexpl( float128_t x, int pw2 );
/* Polynomial evaluators: PP points at an array of float128_t coefficients,
 * n is the degree. */
float128_t cephes_f128_polevll( float128_t x, void *PP, int n );
float128_t cephes_f128_p1evll( float128_t x, void *PP, int n );
int cephes_f128_signbitl(float128_t x);
float128_t cephes_f128_sinl(float128_t x);
/* Error hook called by the math routines with one of the codes above. */
int mtherr( char *name, int code );

/* Public symbol declarations */
float128_t cephes_f128_log2l(float128_t x);

================================================
FILE: External/cephes/src/128bit/mtherr.c
================================================
/*							mtherr.c
 *
 *	Library common error handling routine
 *
 *
 *
 * SYNOPSIS:
 *
 * char *fctnam;
 * int code;
 * int mtherr();
 *
 * mtherr( fctnam, code );
 *
 *
 *
 * DESCRIPTION:
 *
 * This routine may be called to report one of the following
 * error conditions (in the include file mconf.h).
*
 *     Mnemonic        Value          Significance
 *
 *      DOMAIN            1       argument domain error
 *      SING              2       function singularity
 *      OVERFLOW          3       overflow range error
 *      UNDERFLOW         4       underflow range error
 *      TLOSS             5       total loss of precision
 *      PLOSS             6       partial loss of precision
 *      EDOM             33       Unix domain error code
 *      ERANGE           34       Unix range error code
 *
 * The default version of the file prints the function name,
 * passed to it by the pointer fctnam, followed by the
 * error condition.  The display is directed to the standard
 * output device.  The routine then returns to the calling
 * program.  Users may wish to modify the program to abort by
 * calling exit() under severe error conditions such as domain
 * errors.
 *
 * Since all error conditions pass control to this function,
 * the display may be easily changed, eliminated, or directed
 * to an error logging device.
 *
 * SEE ALSO:
 *
 * mconf.h
 *
 */

/*
Cephes Math Library Release 2.0:  April, 1987
Copyright 1984, 1987 by Stephen L. Moshier
Direct inquiries to 30 Frost Street, Cambridge, MA 02140
*/

#include "mconf.h"

/* Error word declared in mconf.h.  Its only assignment in this file sits
 * inside the disabled `#if 0` section of mtherr(), so here it keeps its
 * initial value of 0. */
int merror = 0;

/*
 * Common error hook for the math routines.
 *
 * name: name of the function reporting the error.
 * code: one of the error codes from mconf.h.
 *
 * The printing and merror bookkeeping are compiled out with `#if 0`, so as
 * built this function ignores both arguments, has no side effects, and
 * always returns 0.
 */
int mtherr( char *name, int code )
{
#if 0
  /* Display string passed by calling program,
   * which is supposed to be the name of the
   * function in which the error occurred:
   */
  printf( "\n%s ", name );

  /* Set global error message word */
  merror = code;

  /* Display error message defined
   * by the code argument.
   */
  if( (code <= 0) || (code >= 7) )
    code = 0;
  printf( "%s error\n", ermsg[code] );
#endif
  /* Return to calling
   * program
   */
  return( 0 );
}

================================================
FILE: External/cephes/src/128bit/polevll.c
================================================
/*							polevll.c
 *							p1evll.c
 *
 *	Evaluate polynomial
 *
 *
 *
 * SYNOPSIS:
 *
 * int N;
 * float128_t x, y, coef[N+1], polevl[];
 *
 * y = polevll( x, coef, N );
 *
 *
 *
 * DESCRIPTION:
 *
 * Evaluates polynomial of degree N:
 *
 *                     2          N
 * y  =  C  + C x + C x  +...+ C x
 *        0    1     2          N
 *
 * Coefficients are stored in reverse order:
 *
 * coef[0] = C  , ..., coef[N] = C  .
* N 0 * * The function p1evll() assumes that coef[N] = 1.0 and is * omitted from the array. Its calling arguments are * otherwise the same as polevll(). * * * SPEED: * * In the interest of speed, there are no checks for out * of bounds arithmetic. This routine is used by most of * the functions in the library. Depending on available * equipment features, the user may wish to rewrite the * program in microcode or assembly language. * */ /* Cephes Math Library Release 2.2: July, 1992 Copyright 1984, 1987, 1988, 1992 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" /* Polynomial evaluator: * P[0] x^n + P[1] x^(n-1) + ... + P[n] */ float128_t cephes_f128_polevll( float128_t x, void *PP, int n ) { struct softfloat_state state = {}; register float128_t y; float128_t *P; P = (float128_t *) PP; y = *P++; do { y = f128_add(&state, f128_mul(&state, y, x), *P++); } while( --n ); return(y); } /* Polynomial evaluator: * x^n + P[0] x^(n-1) + P[1] x^(n-2) + ... + P[n] */ float128_t cephes_f128_p1evll( float128_t x, void *PP, int n ) { struct softfloat_state state = {}; register float128_t y; float128_t *P; P = (float128_t *) PP; n -= 1; y = f128_add(&state, x, *P++); do { y = f128_add(&state, f128_mul(&state, y, x), *P++); } while( --n ); return( y ); } ================================================ FILE: External/cephes/src/128bit/sinll.c ================================================ /* sinl.c * * Circular sine, float128_t precision * * * * SYNOPSIS: * * float128_t x, y, sinl(); * * y = sinl( x ); * * * * DESCRIPTION: * * Range reduction is into intervals of pi/4. The reduction * error is nearly eliminated by contriving an extended precision * modular arithmetic. * * Two polynomial approximating functions are employed. * Between 0 and pi/4 the sine is approximated by the Cody * and Waite polynomial form * x + x^3 P(x^2) . * Between pi/4 and pi/2 the cosine is represented as * 1 - .5 x^2 + x^4 Q(x^2) . 
* * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE +-3.6e16 100,000 2.0e-34 5.3e-35 * * ERROR MESSAGES: * * message condition value returned * sin total loss x > 2^55 0.0 * */ /* cosl.c * * Circular cosine, float128_t precision * * * * SYNOPSIS: * * float128_t x, y, cosl(); * * y = cosl( x ); * * * * DESCRIPTION: * * Range reduction is into intervals of pi/4. The reduction * error is nearly eliminated by contriving an extended precision * modular arithmetic. * * Two polynomial approximating functions are employed. * Between 0 and pi/4 the cosine is approximated by * 1 - .5 x^2 + x^4 Q(x^2) . * Between pi/4 and pi/2 the sine is represented by the Cody * and Waite polynomial form * x + x^3 P(x^2) . * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE +-3.6e16 100,000 2.0e-34 5.2e-35 * * ERROR MESSAGES: * * message condition value returned * cos total loss x > 2^55 0.0 */ /* sin.c */ /* Cephes Math Library Release 2.2: December, 1990 Copyright 1985, 1990 by Stephen L. 
Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" /* sin(x) = x + x^3 P(x^2) * Theoretical peak relative error = 5.6e-39 * relative peak error spread = 1.7e-9 */ static float128_t sincof[12] = { {0x07424c0cc240ddd5ULL, 0x3fab3d6c15b6d187ULL}, // 6.410290407010279602425714995528976754871E-26L, {0x0f48760e659301d0ULL, 0xbfb47619a65f0be7ULL}, // -3.868105354403065333804959405965295962871E-23L, {0xcb791f8ea7c13184ULL, 0x3fbd71b8ee9a64e1ULL}, // 1.957294039628045847156851410307133941611E-20L, {0x0b420eabbeb9d9bcULL, 0xbfc62f49b467cdf7ULL}, // -8.220635246181818130416407184286068307901E-18L, {0x4be70cee4054eef9ULL, 0x3fce952c77030ab5ULL}, // 2.811457254345322887443598804951004537784E-15L, {0xe782874b38cbd281ULL, 0xbfd6ae7f3e733b81ULL}, // -7.647163731819815869711749952353081768709E-13L, {0x97c83627668fe57cULL, 0x3fde6124613a86d0ULL}, // 1.605904383682161459812515654720205050216E-10L, {0x38fe73eef2ec94cdULL, 0xbfe5ae64567f544eULL}, // -2.505210838544171877505034150892770940116E-8L, {0x38faac1c6f6fa52aULL, 0x3fec71de3a556c73ULL}, // 2.755731922398589065255731765498970284004E-6L, {0xa01a01a019fc52ccULL, 0xbff2a01a01a01a01ULL}, // -1.984126984126984126984126984045294307281E-4L, {0x1111111111111083ULL, 0x3ff8111111111111ULL}, // 8.333333333333333333333333333333119885283E-3L, {0x5555555555555555ULL, 0xbffc555555555555ULL}, // -1.666666666666666666666666666666666647199E-1L }; /* cos(x) = 1 - .5 x^2 + x^2 (x^2 P(x^2)) * Theoretical peak relative error = 2.1e-37, * relative peak error spread = 1.4e-8 */ static float128_t coscof[11] = { {0x86919a6fdf15a4b3ULL, 0x3fafefc8801eb0a1ULL}, // 1.601961934248327059668321782499768648351E-24L, {0x902367b3281c9510ULL, 0xbfb90ce245980e11ULL}, // -8.896621117922334603659240022184527001401E-22L, {0xcf5102d043ad399aULL, 0x3fc1e542b8eb4f0dULL}, // 4.110317451243694098169570731967589555498E-19L, {0xa8272970c73ab5ffULL, 0xbfca6827863b2960ULL}, // -1.561920696747074515985647487260202922160E-16L, 
{0xf9016edb75d1fb52ULL, 0x3fd2ae7f3e733b51ULL}, // 4.779477332386900932514186378501779328195E-14L, {0xc3e862188c1c1f15ULL, 0xbfda93974a8c07c9ULL}, // -1.147074559772972328629102981460088437917E-11L, {0x7b517ff3abf58399ULL, 0x3fe21eed8eff8d89ULL}, // 2.087675698786809897637922200570559726116E-9L, {0xc72eef5d4453f45cULL, 0xbfe927e4fb7789f5ULL}, // -2.755731922398589065255365968070684102298E-7L, {0xa01a019fdf56450dULL, 0x3fefa01a01a01a01ULL}, // 2.480158730158730158730158440896461945271E-5L, {0x6c16c16c16b76e10ULL, 0xbff56c16c16c16c1ULL}, // -1.388888888888888888888888888765724370132E-3L, {0x55555555555553fdULL, 0x3ffa555555555555ULL}, // 4.166666666666666666666666666666459301466E-2L }; /* static float128_t DP1 = 7.853981554508209228515625E-1L; static float128_t DP2 = 7.94662735614792836713604629039764404296875E-9L; static float128_t DP3 = 3.0616169978683829430651648306875026455243736148E-17L; static float128_t lossth = 5.49755813888e11L; */ static float128_t DP1 = {0x8400000000000000ULL, 0x3ffe921fb54442d1ULL}; //7.853981633974483067550664827649598009884357452392578125E-1L; static float128_t DP2 = {0xe000000000000000ULL, 0x3fc4a62633145c06ULL}; //2.8605943630549158983813312792950660807511260829685741796657E-18L; static float128_t DP3 = {0xa67cc74020bbea64ULL, 0x3f8bcd129024e088ULL}; //2.1679525325309452561992610065108379921905808E-35L; static const float128_t lossth = {0x0000000000000000ULL, 0x4036000000000000ULL}; // 3.6028797018963968E16L; /* 2^55 */ static const float128_t zero = {0, 0}; static const float128_t one = {0, 0x3fff000000000000ULL}; float128_t cephes_f128_sinl(float128_t x) { struct softfloat_state state = {}; float128_t y, z, zz; int j, sign; /* make argument positive but save the sign */ sign = 1; if( f128_lt(&state, x, zero) ) { x = f128_complement_sign(x); sign = -1; } if( f128_lt(&state, lossth, x)) { mtherr( "sinl", TLOSS ); return zero; } y = cephes_f128_floorl( f128_div(&state, x, F128_PIO4L) ); /* integer part of x/PIO4 */ /* strip high bits 
of integer part to prevent integer overflow */ z = cephes_f128_ldexpl( y, -4 ); z = cephes_f128_floorl(z); /* integer part of y/8 */ z = f128_sub(&state, y, cephes_f128_ldexpl( z, 4 )); /* y - 16 * (y/16) */ j = f128_to_i32(&state, z, softfloat_round_near_even, true); /* convert to integer for tests on the phase angle */ /* map zeros to origin */ if( j & 1 ) { j += 1; y = f128_add(&state, y, one); } j = j & 07; /* octant modulo 360 degrees */ /* reflect in x axis */ if( j > 3) { sign = -sign; j -= 4; } /* Extended precision modular arithmetic */ // z = ((x - y * DP1) - y * DP2) - y * DP3; { float128_t tmp1 = f128_mul(&state, y, DP1); float128_t tmp2 = f128_mul(&state, y, DP2); float128_t tmp3 = f128_mul(&state, y, DP3); float128_t tmp4 = f128_sub(&state, x, tmp1); float128_t tmp5 = f128_sub(&state, tmp4, tmp2); z = f128_sub(&state, tmp5, tmp3); } z = f128_sub(&state, f128_sub(&state, f128_sub(&state, x, f128_mul(&state, y, DP1)), f128_mul(&state, y, DP2)), f128_mul(&state, y, DP3)); zz = f128_mul(&state, z, z); if( (j==1) || (j==2) ) { // y = 1.0L - ldexpl(zz,-1) + zz * zz * polevll( zz, coscof, 10 ); float128_t tmp1 = f128_mul(&state, zz, zz); float128_t tmp2 = f128_mul(&state, tmp1, cephes_f128_polevll( zz, coscof, 10 )); float128_t tmp3 = f128_sub(&state, one, cephes_f128_ldexpl(zz,-1)); y = f128_add(&state, tmp3, tmp2); } else { // y = z + z * (zz * polevll( zz, sincof, 11 )); float128_t tmp1 = f128_mul(&state, zz, cephes_f128_polevll( zz, sincof, 11 )); float128_t tmp2 = f128_mul(&state, z, tmp1); y = f128_add(&state, z, tmp2); } if(sign < 0) y = f128_complement_sign(y); return(y); } float128_t cephes_f128_cosl(float128_t x) { struct softfloat_state state = {}; float128_t y, z, zz; long i; int j, sign; /* make argument positive */ sign = 1; if( f128_lt(&state, x, zero) ) x = f128_complement_sign(x); if( f128_lt(&state, lossth, x)) { mtherr( "cosl", TLOSS ); return zero; } y = cephes_f128_floorl( f128_div(&state, x, F128_PIO4L)); z = cephes_f128_ldexpl( y, -4 
); z = cephes_f128_floorl(z); /* integer part of y/8 */ z = f128_sub(&state, y, cephes_f128_ldexpl( z, 4 )); /* y - 16 * (y/16) */ /* integer and fractional part modulo one octant */ i = f128_to_i32(&state, z, softfloat_round_near_even, true); if( i & 1 ) /* map zeros to origin */ { i += 1; y = f128_add(&state, y, one); } j = i & 07; if( j > 3) { j -=4; sign = -sign; } if( j > 1 ) sign = -sign; /* Extended precision modular arithmetic */ // z = ((x - y * DP1) - y * DP2) - y * DP3; { float128_t tmp1 = f128_mul(&state, y, DP1); float128_t tmp2 = f128_mul(&state, y, DP2); float128_t tmp3 = f128_mul(&state, y, DP3); float128_t tmp4 = f128_sub(&state, x, tmp1); float128_t tmp5 = f128_sub(&state, tmp4, tmp2); z = f128_sub(&state, tmp5, tmp3); } zz = f128_mul(&state, z, z); if( (j==1) || (j==2) ) { // y = z + z * (zz * polevll( zz, sincof, 11 )); float128_t tmp1 = f128_mul(&state, zz, cephes_f128_polevll( zz, sincof, 11 )); float128_t tmp2 = f128_mul(&state, z, tmp1); y = f128_add(&state, z, tmp2); } else { // y = 1.0L - ldexpl(zz,-1) + zz * zz * polevll( zz, coscof, 10 ); float128_t tmp1 = f128_mul(&state, zz, zz); float128_t tmp2 = f128_mul(&state, tmp1, cephes_f128_polevll( zz, coscof, 10 )); float128_t tmp3 = f128_sub(&state, one, cephes_f128_ldexpl(zz,-1)); y = f128_add(&state, tmp3, tmp2); } if(sign < 0) y = f128_complement_sign(y); return(y); } ================================================ FILE: External/cephes/src/128bit/tanll.c ================================================ /* tanl.c * * Circular tangent, 128-bit float128_t precision * * * * SYNOPSIS: * * float128_t x, y, tanl(); * * y = tanl( x ); * * * * DESCRIPTION: * * Returns the circular tangent of the radian argument x. * * Range reduction is modulo pi/4. A rational function * x + x**3 P(x**2)/Q(x**2) * is employed in the basic interval [0, pi/4]. 
* * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE +-3.6e16 100,000 3.0e-34 7.2e-35 * * ERROR MESSAGES: * * message condition value returned * tan total loss x > 2^55 0.0 * */ /* cotl.c * * Circular cotangent, float128_t precision * * * * SYNOPSIS: * * float128_t x, y, cotl(); * * y = cotl( x ); * * * * DESCRIPTION: * * Returns the circular cotangent of the radian argument x. * * Range reduction is modulo pi/4. A rational function * x + x**3 P(x**2)/Q(x**2) * is employed in the basic interval [0, pi/4]. * * * * ACCURACY: * * Relative error: * arithmetic domain # trials peak rms * IEEE +-3.6e16 100,000 2.9e-34 7.2e-35 * * * ERROR MESSAGES: * * message condition value returned * cot total loss x > 2^55 0.0 * cot singularity x = 0 MAXNUM * */ /* Cephes Math Library Release 2.2: December, 1990 Copyright 1984, 1990 by Stephen L. Moshier Direct inquiries to 30 Frost Street, Cambridge, MA 02140 */ #include "mconf.h" /* tan(x) = x + x^3 P(x^2) * 0 <= |x| <= pi/4 * Theoretical peak relative error = 4.3e-38 * relative peak error spread = 6.1e-11 */ static float128_t P[6] = { {0x09978dc7ae2a2f4bULL, 0xbffefa5d486820e2ULL}, // -9.889929415807650724957118893791829849557E-1L, {0x52a017b1ca7c4799ULL, 0x40093e130edd1294ULL}, // 1.272297782199996882828849455156962260810E3L, {0x8857161b398b3c53ULL, 0xc0119f024bdcc6c3ULL}, // -4.249691853501233575668486667664718192660E5L, {0xcc299261a6616b83ULL, 0x401889b0ed404622ULL}, // 5.160188250214037865511600561074819366815E7L, {0x37d9311de4cdbf04ULL, 0xc01e1304fe4d6331ULL}, // -2.307030822693734879744223131873392503321E9L, {0x6e9f0eac6b638a9aULL, 0x4021ada98af62f83ULL}, // 2.883414728874239697964612246732416606301E10L }; static float128_t Q[6] = { /* 1.000000000000000000000000000000000000000E0L, */ {0xeb01d728f7d3bb04ULL, 0xc009494f98d3c1caULL}, // -1.317243702830553658702531997959756728291E3L, {0xcdd312b4ac46a6cdULL, 0x4011ba538d331a98ULL}, // 4.529422062441341616231663543669583527923E5L, 
{0x2a1a6372eebd73a1ULL, 0xc018b57281a9f10bULL}, // -5.733709132766856723608447733926138506824E7L, {0x3e9defb0e348fbe5ULL, 0x401e48d6025d9b41ULL}, // 2.758476078803232151774723646710890525496E9L, {0x7cd82869db5580d1ULL, 0xc022355d0fdbd24eULL}, // -4.152206921457208101480801635640958361612E10L, {0x92f74b01508aa7f3ULL, 0x4023423f2838a3a2ULL}, // 8.650244186622719093893836740197250197602E10L }; static float128_t DP1 = {0x8400000000000000ULL, 0x3ffe921fb54442d1ULL}; //7.853981633974483067550664827649598009884357452392578125E-1L; static float128_t DP2 = {0xe000000000000000ULL, 0x3fc4a62633145c06ULL}; //2.8605943630549158983813312792950660807511260829685741796657E-18L; static float128_t DP3 = {0xa67cc74020bbea64ULL, 0x3f8bcd129024e088ULL}; // 2.1679525325309452561992610065108379921905808E-35L; static const float128_t lossth = {0x0000000000000000ULL, 0x4036000000000000ULL}; // 3.6028797018963968E16L; /* 2^55 */ static const float128_t zero = {0, 0}; static const float128_t one = {0, 0x3fff000000000000ULL}; static const float128_t neg_one = {0, 0xbfff000000000000ULL}; static const float128_t max_quad = {0x35d511e976394d7aULL, 0x3fbc79ca10c92422ULL}; static float128_t tancotl( struct softfloat_state *state, float128_t xx, int cotflg ); float128_t cephes_f128_tanl(float128_t x) { struct softfloat_state state = {}; return( tancotl(&state, x,0) ); } float128_t cotl(float128_t x) { struct softfloat_state state = {}; if( f128_eq(&state, x, zero) ) { mtherr( "cotl", SING ); return( F128_MAXNUML ); } return( tancotl(&state, x,1) ); } static float128_t tancotl( struct softfloat_state *state, float128_t xx, int cotflg ) { float128_t x, y, z, zz; int j, sign; /* make argument positive but save the sign */ // if (xx < 0.0L) if( f128_lt(state, xx, zero) ) { x = f128_sub(state, zero, xx); sign = -1; } else { x = xx; sign = 1; } //if (x > lossth) if (f128_lt(state, lossth, x)) { if( cotflg ) mtherr( "cotl", TLOSS ); else mtherr( "tanl", TLOSS ); return zero; } /* compute x mod PIO4 */ y = 
cephes_f128_floorl( f128_div(state, x, F128_PIO4L)); /* strip high bits of integer part */ z = cephes_f128_ldexpl( y, -4 ); z = cephes_f128_floorl(z); /* integer part of y/16 */ z = f128_sub(state, y, cephes_f128_ldexpl( z, 4 )); /* y - 16 * (y/16) */ /* integer and fractional part modulo one octant */ j = f128_to_i32(state, z, softfloat_round_near_even, true); /* map zeros and singularities to origin */ if( j & 1 ) { j += 1; y = f128_add(state, y, one); } z = f128_sub(state, f128_sub(state, f128_sub(state, x, f128_mul(state, y, DP1)), f128_mul(state, y, DP2)), f128_mul(state, y, DP3)); zz = f128_mul(state, z, z); // if( zz > 1.0e-20L ) if (f128_lt(state, max_quad, zz)) { y = f128_add(state, z, f128_mul(state, z, f128_div(state, f128_mul(state, zz, cephes_f128_polevll( zz, P, 5 )), cephes_f128_p1evll(zz, Q, 6)))); } else { y = z; } if( j & 2 ) { if( cotflg ) y = f128_complement_sign(y); else y = f128_div(state, neg_one, y); } else { if( cotflg ) y = f128_div(state, one, y); } if( sign < 0 ) y = f128_complement_sign(y); return( y ); } ================================================ FILE: External/code-format-helper/code-format-helper.py ================================================ #!/usr/bin/env python3 # # ====- code-format-helper, runs code formatters from the ci or in a hook --*- python -*--==# # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # ==--------------------------------------------------------------------------------------==# import argparse import os import subprocess import sys from typing import List, Optional """ This script is run by GitHub actions to ensure that the code in PR's conform to the coding style of LLVM. It can also be installed as a pre-commit git hook to check the coding style before submitting it. 
class FormatArgs:
    """Options shared by every formatter run.

    The class-level values are the defaults; passing a parsed
    ``argparse.Namespace`` overrides the revision range, repository, token,
    changed-file list, issue number and comment output path.  ``verbose`` is
    never taken from the namespace and keeps its default unless the caller
    assigns it afterwards.
    """

    start_rev: Optional[str] = None
    end_rev: Optional[str] = None
    repo: Optional[str] = None
    changed_files: List[str] = []
    token: Optional[str] = None
    verbose: bool = True
    issue_number: int = 0
    write_comment_to_file: Optional[str] = None

    def __init__(self, args: Optional[argparse.Namespace] = None) -> None:
        # Give each instance its own list.  Without this, every instance built
        # without a namespace would append into the single class-level list.
        self.changed_files = []
        if args is not None:
            self.start_rev = args.start_rev
            self.end_rev = args.end_rev
            self.repo = args.repo
            self.token = args.token
            # Stored exactly as parsed; the command-line driver splits it.
            self.changed_files = args.changed_files
            self.issue_number = args.issue_number
            self.write_comment_to_file = args.write_comment_to_file
You can test this locally with the following command: ``````````bash {self.instructions} ``````````
View the diff from {self.name} here. ``````````diff {diff} ``````````
""" # TODO: any type should be replaced with the correct github type, but it requires refactoring to # not require the github module to be installed everywhere. def find_comment(self, pr: any) -> any: for comment in pr.as_issue().get_comments(): if self.comment_tag in comment.body: return comment return None def update_pr(self, comment_text: str, args: FormatArgs, create_new: bool) -> None: import github from github import IssueComment, PullRequest repo = github.Github(args.token).get_repo(args.repo) pr = repo.get_issue(args.issue_number).as_pull_request() comment_text = self.comment_tag + "\n\n" + comment_text existing_comment = self.find_comment(pr) if args.write_comment_to_file: if create_new or existing_comment: self.comment = {"body": comment_text} if existing_comment: self.comment["id"] = existing_comment.id return if existing_comment: existing_comment.edit(comment_text) elif create_new: pr.as_issue().create_comment(comment_text) def run(self, changed_files: List[str], args: FormatArgs) -> bool: changed_files = [arg for arg in changed_files if "third-party" not in arg] diff = self.format_run(changed_files, args) should_update_gh = args.token is not None and args.repo is not None if diff is None: if should_update_gh: comment_text = ( ":white_check_mark: With the latest revision " f"this PR passed the {self.friendly_name}." ) self.update_pr(comment_text, args, create_new=False) return True elif len(diff) > 0: if should_update_gh: comment_text = self.pr_comment_text_for_diff(diff) self.update_pr(comment_text, args, create_new=True) else: print( f"Warning: {self.friendly_name}, {self.name} detected " "some issues with your code formatting..." ) return False else: # The formatter failed but didn't output a diff (e.g. some sort of # infrastructure failure). comment_text = ( f":warning: The {self.friendly_name} failed without printing " "a diff. Check the logs for stderr output. 
class ClangFormatHelper(FormatHelper):
    """Formatter that runs git-clang-format over the changed C/C++ files."""

    name = "git-clang-format"
    friendly_name = "C/C++ code formatter"

    @property
    def instructions(self) -> str:
        """The command line of the last format_run(), for the PR comment."""
        return " ".join(self.cf_cmd)

    def should_include_extensionless_file(self, path: str) -> bool:
        """Extensionless files are only checked under libcxx/include."""
        return path.startswith("libcxx/include")

    def filter_changed_files(self, changed_files: List[str]) -> List[str]:
        """Keep only the paths that should be passed to clang-format."""
        filtered_files = []
        for path in changed_files:
            _, ext = os.path.splitext(path)
            if ext in (".cpp", ".c", ".h", ".hpp", ".hxx", ".cxx", ".inc", ".cppm"):
                filtered_files.append(path)
            elif ext == "" and self.should_include_extensionless_file(path):
                filtered_files.append(path)
        return filtered_files

    @property
    def clang_fmt_path(self) -> str:
        """Path of git-clang-format; $CLANG_FORMAT_PATH overrides the default."""
        if "CLANG_FORMAT_PATH" in os.environ:
            return os.environ["CLANG_FORMAT_PATH"]
        return "git-clang-format-19"

    def has_tool(self) -> bool:
        """Return True when `<tool> -h` can be started and exits with 0."""
        cmd = [self.clang_fmt_path, "-h"]
        try:
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (OSError, subprocess.SubprocessError):
            # The tool is missing or could not be started.
            return False
        return proc.returncode == 0

    def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str]:
        """Run git-clang-format in --diff mode.

        Returns None when there is nothing to check or the tool exits with 0;
        otherwise returns the tool's stdout (the diff, possibly empty).
        """
        cpp_files = self.filter_changed_files(changed_files)
        if not cpp_files:
            return None

        cf_cmd = [
            self.clang_fmt_path,
            "--binary=clang-format-19",
            "--diff",
        ]

        if args.start_rev and args.end_rev:
            cf_cmd.append(args.start_rev)
            cf_cmd.append(args.end_rev)

        cf_cmd.append("--")
        cf_cmd += cpp_files

        if args.verbose:
            print(f"Running: {' '.join(cf_cmd)}")
        # Remembered so that `instructions` can show the exact command.
        self.cf_cmd = cf_cmd
        proc = subprocess.run(cf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        sys.stdout.write(proc.stderr.decode("utf-8"))

        if proc.returncode != 0:
            # formatting needed, or the command otherwise failed
            if args.verbose:
                print(f"error: {self.name} exited with code {proc.returncode}")
                # Print the diff in the log so that it is viewable there
                print(proc.stdout.decode("utf-8"))
            return proc.stdout.decode("utf-8")
        else:
            return None


ALL_FORMATTERS = [ClangFormatHelper()]


def hook_main():
    """Pre-commit entry point: check the staged files and exit 1 on failure."""
    # fill out args
    args = FormatArgs()
    args.verbose = False

    # find the changed files
    cmd = ["git", "diff", "--cached", "--name-only", "--diff-filter=d"]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output = proc.stdout.decode("utf-8")
    # Bind a fresh list instead of appending to whatever list the attribute
    # currently refers to.
    args.changed_files = output.splitlines()

    failed_fmts = []
    for fmt in ALL_FORMATTERS:
        if fmt.has_tool():
            # No PR comments are collected here: nothing consumes them in hook
            # mode, and this function has no `comments` list to append to.
            if not fmt.run(args.changed_files, args):
                failed_fmts.append(fmt.name)
        else:
            print(f"Couldn't find {fmt.name}, can't check " + fmt.friendly_name.lower())

    if len(failed_fmts) > 0:
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    script_path = os.path.abspath(__file__)
    if ".git/hooks" in script_path:
        hook_main()
        sys.exit(0)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--token", type=str, required=False, help="GitHub authentication token"
    )
    parser.add_argument(
        "--repo",
        type=str,
        default=os.getenv("GITHUB_REPOSITORY", "llvm/llvm-project"),
        help="The GitHub repository that we are working with in the form of "
        "OWNER/REPO (e.g. llvm/llvm-project)",
    )
    parser.add_argument("--issue-number", type=int, required=True)
    parser.add_argument(
        "--start-rev",
        type=str,
        required=True,
        help="Compute changes from this revision.",
    )
    parser.add_argument(
        "--end-rev", type=str, required=True, help="Compute changes to this revision"
    )
    parser.add_argument(
        "--changed-files",
        type=str,
        help="Comma separated list of files that has been changed",
    )
    parser.add_argument(
        "--write-comment-to-file",
        type=str,
        help="Don't post comments on the PR, instead write the comments and metadata to a file",
    )

    args = FormatArgs(parser.parse_args())

    changed_files = []
    if args.changed_files:
        changed_files = args.changed_files.split(",")

    failed_formatters = []
    comments = []
    for fmt in ALL_FORMATTERS:
        if not fmt.run(changed_files, args):
            failed_formatters.append(fmt.name)
        if fmt.comment:
            comments.append(fmt.comment)

    if len(comments):
        import json

        with open(args.write_comment_to_file, "w") as f:
            json.dump(comments, f)

    if len(failed_formatters) > 0:
        print(f"error: some formatters failed: {' '.join(failed_formatters)}")
        sys.exit(1)
--hash=sha256:41cd2012d35b47d589cb8a16faf8a32ef7a336f56356babd9fcf70939ad1897f \ --hash=sha256:474c27574d6d7037c1bc875a81d9be0a9a4f9ee95e62800dab3cfaadbf75acd5 \ --hash=sha256:5602bdb96d52d2d0672f24f6ffe5218795736dd34807fd0fd55ccd6bf206168b \ --hash=sha256:5e9d0d86df21f2e1677cc4bd090cd0e446278bcbbe49bf3659c308c3e402843e \ --hash=sha256:5ed0ca58586c8d9a487352a96b15272b7fa55d139fc8496b519e78023a8dab0a \ --hash=sha256:6c54a4a82e291a1fee5137371ab488866b7c86a3305af4026bdd4dc78642e1ac \ --hash=sha256:6e131579c243c98f35bce64a7e08e87fb2d610544754675d4a0e73a070a5aa3a \ --hash=sha256:855822d90f884905362f602880ed8b5df1b7e3ee7d0db2502d4388a954cc8c54 \ --hash=sha256:86a8b5035fce64f5dcd1b794cf8ec4d31fe458cf6ce3986a30deb434df82a1d2 \ --hash=sha256:8a33d657f3276328ce00e4d37fe70361e1ec7614da5d7b6e78de5426cb56332f \ --hash=sha256:92c0ec1f2cc149551a2b7b47efc32c866406b6891b0ee4625e95967c8f4acfb1 \ --hash=sha256:9a5e9f45e5d5e1c5b5c29b3bd4265dcc90e8b92cf4534520896ed77f791f4da5 \ --hash=sha256:afc622538b430aa4c8c853f7f63bc582b3b8030fd8c80b70fb5fa5b834e575c2 \ --hash=sha256:b07fc0dab849d24a80a29cfab8d8a19187d1c4685d8a5e6385a5ce323c1f015f \ --hash=sha256:b5e6f89631eb88a7302d416594a32faeee9fb8fb848290da9d0a5f2903519fc1 \ --hash=sha256:bf9bf162ed91a26f1adba8efda0b573bc6924ec1408a52cc6f82cb73ec2b142c \ --hash=sha256:c7e72339f841b5a237ff14f7d3880ddd0fc7f98a1199e8c4327f9a4f478c1839 \ --hash=sha256:ddb113db38838eb9f043623ba274cfaf7d51d5b0c22ecb30afe58b1bb8322983 \ --hash=sha256:dfdd51fc3e64ea4f35873d1b3fb25326773d55d2329ff8449139ebaad7357efb \ --hash=sha256:f1cd08e99d2f9317292a311dfe578fd2a24b15dbce97792f9c4d752275c1fa56 \ --hash=sha256:f89f2ab047c76a9c03f78d0d66ca519e389519902fa27e7a91117ef7611c0568 # via # -r requirements_formatting.txt.in # darker certifi==2025.7.14 \ --hash=sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2 \ --hash=sha256:8ea99dbdfaaf2ba2f9bac77b9249ef62ec5218e7c2b2e903378ed5fccf765995 # via # -r requirements_formatting.txt.in # requests cffi==2.0.0 \ 
--hash=sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb \ --hash=sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b \ --hash=sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f \ --hash=sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9 \ --hash=sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44 \ --hash=sha256:0f6084a0ea23d05d20c3edcda20c3d006f9b6f3fefeac38f59262e10cef47ee2 \ --hash=sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c \ --hash=sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75 \ --hash=sha256:1cd13c99ce269b3ed80b417dcd591415d3372bcac067009b6e0f59c7d4015e65 \ --hash=sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e \ --hash=sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a \ --hash=sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e \ --hash=sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25 \ --hash=sha256:2081580ebb843f759b9f617314a24ed5738c51d2aee65d31e02f6f7a2b97707a \ --hash=sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe \ --hash=sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b \ --hash=sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91 \ --hash=sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592 \ --hash=sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187 \ --hash=sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c \ --hash=sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1 \ --hash=sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94 \ --hash=sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba \ --hash=sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb \ 
--hash=sha256:3f4d46d8b35698056ec29bca21546e1551a205058ae1a181d871e278b0b28165 \ --hash=sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529 \ --hash=sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca \ --hash=sha256:4647afc2f90d1ddd33441e5b0e85b16b12ddec4fca55f0d9671fef036ecca27c \ --hash=sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6 \ --hash=sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c \ --hash=sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0 \ --hash=sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743 \ --hash=sha256:61d028e90346df14fedc3d1e5441df818d095f3b87d286825dfcbd6459b7ef63 \ --hash=sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5 \ --hash=sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5 \ --hash=sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4 \ --hash=sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d \ --hash=sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b \ --hash=sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93 \ --hash=sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205 \ --hash=sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27 \ --hash=sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512 \ --hash=sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d \ --hash=sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c \ --hash=sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037 \ --hash=sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26 \ --hash=sha256:89472c9762729b5ae1ad974b777416bfda4ac5642423fa93bd57a09204712322 \ --hash=sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb \ 
--hash=sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c \ --hash=sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8 \ --hash=sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4 \ --hash=sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414 \ --hash=sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9 \ --hash=sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664 \ --hash=sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9 \ --hash=sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775 \ --hash=sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739 \ --hash=sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc \ --hash=sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062 \ --hash=sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe \ --hash=sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9 \ --hash=sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92 \ --hash=sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5 \ --hash=sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13 \ --hash=sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d \ --hash=sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26 \ --hash=sha256:cb527a79772e5ef98fb1d700678fe031e353e765d1ca2d409c92263c6d43e09f \ --hash=sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495 \ --hash=sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b \ --hash=sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6 \ --hash=sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c \ --hash=sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef \ 
--hash=sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5 \ --hash=sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18 \ --hash=sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad \ --hash=sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3 \ --hash=sha256:de8dad4425a6ca6e4e5e297b27b5c824ecc7581910bf9aee86cb6835e6812aa7 \ --hash=sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5 \ --hash=sha256:e6e73b9e02893c764e7e8d5bb5ce277f1a009cd5243f8228f75f842bf937c534 \ --hash=sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49 \ --hash=sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2 \ --hash=sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5 \ --hash=sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453 \ --hash=sha256:fe562eb1a64e67dd297ccc4f5addea2501664954f2692b69a76449ec7913ecbf # via # cryptography # pynacl charset-normalizer==3.2.0 \ --hash=sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96 \ --hash=sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c \ --hash=sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710 \ --hash=sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706 \ --hash=sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020 \ --hash=sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252 \ --hash=sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad \ --hash=sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329 \ --hash=sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a \ --hash=sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f \ --hash=sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6 \ --hash=sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4 \ 
--hash=sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a \ --hash=sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46 \ --hash=sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2 \ --hash=sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23 \ --hash=sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace \ --hash=sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd \ --hash=sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982 \ --hash=sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10 \ --hash=sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2 \ --hash=sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea \ --hash=sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09 \ --hash=sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5 \ --hash=sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149 \ --hash=sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489 \ --hash=sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9 \ --hash=sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80 \ --hash=sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592 \ --hash=sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3 \ --hash=sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6 \ --hash=sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed \ --hash=sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c \ --hash=sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200 \ --hash=sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a \ --hash=sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e \ 
--hash=sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d \ --hash=sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6 \ --hash=sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623 \ --hash=sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669 \ --hash=sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3 \ --hash=sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa \ --hash=sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9 \ --hash=sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2 \ --hash=sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f \ --hash=sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1 \ --hash=sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4 \ --hash=sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a \ --hash=sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8 \ --hash=sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3 \ --hash=sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029 \ --hash=sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f \ --hash=sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959 \ --hash=sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22 \ --hash=sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7 \ --hash=sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952 \ --hash=sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346 \ --hash=sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e \ --hash=sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d \ --hash=sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299 \ 
--hash=sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd \ --hash=sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a \ --hash=sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3 \ --hash=sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037 \ --hash=sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94 \ --hash=sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c \ --hash=sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858 \ --hash=sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a \ --hash=sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449 \ --hash=sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c \ --hash=sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918 \ --hash=sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1 \ --hash=sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c \ --hash=sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac \ --hash=sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa # via requests click==8.1.7 \ --hash=sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28 \ --hash=sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de # via black cryptography==46.0.5 \ --hash=sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72 \ --hash=sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235 \ --hash=sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9 \ --hash=sha256:2ae6971afd6246710480e3f15824ed3029a60fc16991db250034efd0b9fb4356 \ --hash=sha256:2b7a67c9cd56372f3249b39699f2ad479f6991e62ea15800973b956f4b73e257 \ --hash=sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad \ 
--hash=sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4 \ --hash=sha256:3b4995dc971c9fb83c25aa44cf45f02ba86f71ee600d81091c2f0cbae116b06c \ --hash=sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614 \ --hash=sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed \ --hash=sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31 \ --hash=sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229 \ --hash=sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0 \ --hash=sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731 \ --hash=sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b \ --hash=sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4 \ --hash=sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4 \ --hash=sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263 \ --hash=sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595 \ --hash=sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1 \ --hash=sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678 \ --hash=sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48 \ --hash=sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76 \ --hash=sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0 \ --hash=sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18 \ --hash=sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d \ --hash=sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d \ --hash=sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1 \ --hash=sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981 \ --hash=sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7 \ 
--hash=sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82 \ --hash=sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2 \ --hash=sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4 \ --hash=sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663 \ --hash=sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c \ --hash=sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d \ --hash=sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a \ --hash=sha256:bc84e875994c3b445871ea7181d424588171efec3e185dced958dad9e001950a \ --hash=sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d \ --hash=sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b \ --hash=sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a \ --hash=sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826 \ --hash=sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee \ --hash=sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9 \ --hash=sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648 \ --hash=sha256:d861ee9e76ace6cf36a6a89b959ec08e7bc2493ee39d07ffe5acb23ef46d27da \ --hash=sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2 \ --hash=sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2 \ --hash=sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87 # via # -r requirements_formatting.txt.in # pyjwt darker==2.1.1 \ --hash=sha256:a6e6a682c0604e76fe9aec7650e96a944f517563c69b28fcc076db9d957d98ea \ --hash=sha256:ead701414c45359fc0312bc285614d3285fc135476d43f3bc08d989ee19d9020 # via -r requirements_formatting.txt.in darkgraylib==1.2.1 \ --hash=sha256:60c59de69842367ce0c78c32c451fa8e9d29500e681312d9864a7416bcdb7792 \ --hash=sha256:a5dd6a2015a470d9047278cdd01a91ccb1d746675f8fd4562b3b5f6b8cbda930 # via # darker 
# graylint deprecated==1.2.14 \ --hash=sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c \ --hash=sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3 # via pygithub graylint==1.1.1 \ --hash=sha256:0fd8e02972ca03d0ef2bf0adea76b5343efcd492d7afb5f658f3e3a724f55a36 \ --hash=sha256:b7e0eab6c159684dbf5ef84e942c3340f6a6549b02a3d11b1a1763cc4f8f0593 # via darker idna==3.10 \ --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 # via # -r requirements_formatting.txt.in # requests mypy-extensions==1.0.0 \ --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ --hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 # via black packaging==23.1 \ --hash=sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61 \ --hash=sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f # via black pathspec==1.0.4 \ --hash=sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645 \ --hash=sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723 # via black platformdirs==3.10.0 \ --hash=sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d \ --hash=sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d # via black pycparser==2.21 \ --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 # via cffi pygithub==2.6.1 \ --hash=sha256:6f2fa6d076ccae475f9fc392cc6cdbd54db985d4f69b8833a28397de75ed6ca3 \ --hash=sha256:b5c035392991cca63959e9453286b41b54d83bf2de2daa7d7ff7e4312cebf3bf # via -r requirements_formatting.txt.in pyjwt==2.12.1 \ --hash=sha256:28ca37c070cad8ba8cd9790cd940535d40274d22f80ab87f3ac6a713e6e8454c \ --hash=sha256:c74a7a2adf861c04d002db713dd85f84beb242228e671280bf709d765b03672b # via # -r 
requirements_formatting.txt.in # pygithub pynacl==1.6.2 \ --hash=sha256:018494d6d696ae03c7e656e5e74cdfd8ea1326962cc401bcf018f1ed8436811c \ --hash=sha256:04316d1fc625d860b6c162fff704eb8426b1a8bcd3abacea11142cbd99a6b574 \ --hash=sha256:22de65bb9010a725b0dac248f353bb072969c94fa8d6b1f34b87d7953cf7bbe4 \ --hash=sha256:26bfcd00dcf2cf160f122186af731ae30ab120c18e8375684ec2670dccd28130 \ --hash=sha256:2fef529ef3ee487ad8113d287a593fa26f48ee3620d92ecc6f1d09ea38e0709b \ --hash=sha256:320ef68a41c87547c91a8b58903c9caa641ab01e8512ce291085b5fe2fcb7590 \ --hash=sha256:3bffb6d0f6becacb6526f8f42adfb5efb26337056ee0831fb9a7044d1a964444 \ --hash=sha256:44081faff368d6c5553ccf55322ef2819abb40e25afaec7e740f159f74813634 \ --hash=sha256:46065496ab748469cdd999246d17e301b2c24ae2fdf739132e580a0e94c94a87 \ --hash=sha256:5811c72b473b2f38f7e2a3dc4f8642e3a3e9b5e7317266e4ced1fba85cae41aa \ --hash=sha256:622d7b07cc5c02c666795792931b50c91f3ce3c2649762efb1ef0d5684c81594 \ --hash=sha256:62985f233210dee6548c223301b6c25440852e13d59a8b81490203c3227c5ba0 \ --hash=sha256:68be3a09455743ff9505491220b64440ced8973fe930f270c8e07ccfa25b1f9e \ --hash=sha256:834a43af110f743a754448463e8fd61259cd4ab5bbedcf70f9dabad1d28a394c \ --hash=sha256:8845c0631c0be43abdd865511c41eab235e0be69c81dc66a50911594198679b0 \ --hash=sha256:8a66d6fb6ae7661c58995f9c6435bda2b1e68b54b598a6a10247bfcdadac996c \ --hash=sha256:8b097553b380236d51ed11356c953bf8ce36a29a3e596e934ecabe76c985a577 \ --hash=sha256:a84bf1c20339d06dc0c85d9aea9637a24f718f375d861b2668b2f9f96fa51145 \ --hash=sha256:a9f9932d8d2811ce1a8ffa79dcbdf3970e7355b5c8eb0c1a881a57e7f7d96e88 \ --hash=sha256:bc4a36b28dd72fb4845e5d8f9760610588a96d5a51f01d84d8c6ff9849968c14 \ --hash=sha256:c8a231e36ec2cab018c4ad4358c386e36eede0319a0c41fed24f840b1dac59f6 \ --hash=sha256:c949ea47e4206af7c8f604b8278093b674f7c79ed0d4719cc836902bf4517465 \ --hash=sha256:d071c6a9a4c94d79eb665db4ce5cedc537faf74f2355e4d502591d850d3913c0 \ --hash=sha256:d29bfe37e20e015a7d8b23cfc8bd6aa7909c92a1b8f41ee416bbb3e79ef182b2 
\ --hash=sha256:fe9847ca47d287af41e82be1dd5e23023d3c31a951da134121ab02e42ac218c9 # via # -r requirements_formatting.txt.in # pygithub pytokens==0.4.1 \ --hash=sha256:0fc71786e629cef478cbf29d7ea1923299181d0699dbe7c3c0f4a583811d9fc1 \ --hash=sha256:11edda0942da80ff58c4408407616a310adecae1ddd22eef8c692fe266fa5009 \ --hash=sha256:140709331e846b728475786df8aeb27d24f48cbcf7bcd449f8de75cae7a45083 \ --hash=sha256:24afde1f53d95348b5a0eb19488661147285ca4dd7ed752bbc3e1c6242a304d1 \ --hash=sha256:26cef14744a8385f35d0e095dc8b3a7583f6c953c2e3d269c7f82484bf5ad2de \ --hash=sha256:27b83ad28825978742beef057bfe406ad6ed524b2d28c252c5de7b4a6dd48fa2 \ --hash=sha256:292052fe80923aae2260c073f822ceba21f3872ced9a68bb7953b348e561179a \ --hash=sha256:29d1d8fb1030af4d231789959f21821ab6325e463f0503a61d204343c9b355d1 \ --hash=sha256:2a44ed93ea23415c54f3face3b65ef2b844d96aeb3455b8a69b3df6beab6acc5 \ --hash=sha256:30f51edd9bb7f85c748979384165601d028b84f7bd13fe14d3e065304093916a \ --hash=sha256:34bcc734bd2f2d5fe3b34e7b3c0116bfb2397f2d9666139988e7a3eb5f7400e3 \ --hash=sha256:3ad72b851e781478366288743198101e5eb34a414f1d5627cdd585ca3b25f1db \ --hash=sha256:3f901fe783e06e48e8cbdc82d631fca8f118333798193e026a50ce1b3757ea68 \ --hash=sha256:42f144f3aafa5d92bad964d471a581651e28b24434d184871bd02e3a0d956037 \ --hash=sha256:4a14d5f5fc78ce85e426aa159489e2d5961acf0e47575e08f35584009178e321 \ --hash=sha256:4a58d057208cb9075c144950d789511220b07636dd2e4708d5645d24de666bdc \ --hash=sha256:4e691d7f5186bd2842c14813f79f8884bb03f5995f0575272009982c5ac6c0f7 \ --hash=sha256:5502408cab1cb18e128570f8d598981c68a50d0cbd7c61312a90507cd3a1276f \ --hash=sha256:584c80c24b078eec1e227079d56dc22ff755e0ba8654d8383b2c549107528918 \ --hash=sha256:5ad948d085ed6c16413eb5fec6b3e02fa00dc29a2534f088d3302c47eb59adf9 \ --hash=sha256:670d286910b531c7b7e3c0b453fd8156f250adb140146d234a82219459b9640c \ --hash=sha256:682fa37ff4d8e95f7df6fe6fe6a431e8ed8e788023c6bcc0f0880a12eab80ad1 \ 
--hash=sha256:6d6c4268598f762bc8e91f5dbf2ab2f61f7b95bdc07953b602db879b3c8c18e1 \ --hash=sha256:79fc6b8699564e1f9b521582c35435f1bd32dd06822322ec44afdeba666d8cb3 \ --hash=sha256:8bdb9d0ce90cbf99c525e75a2fa415144fd570a1ba987380190e8b786bc6ef9b \ --hash=sha256:8fcb9ba3709ff77e77f1c7022ff11d13553f3c30299a9fe246a166903e9091eb \ --hash=sha256:941d4343bf27b605e9213b26bfa1c4bf197c9c599a9627eb7305b0defcfe40c1 \ --hash=sha256:967cf6e3fd4adf7de8fc73cd3043754ae79c36475c1c11d514fc72cf5490094a \ --hash=sha256:970b08dd6b86058b6dc07efe9e98414f5102974716232d10f32ff39701e841c4 \ --hash=sha256:97f50fd18543be72da51dd505e2ed20d2228c74e0464e4262e4899797803d7fa \ --hash=sha256:9bd7d7f544d362576be74f9d5901a22f317efc20046efe2034dced238cbbfe78 \ --hash=sha256:add8bf86b71a5d9fb5b89f023a80b791e04fba57960aa790cc6125f7f1d39dfe \ --hash=sha256:b35d7e5ad269804f6697727702da3c517bb8a5228afa450ab0fa787732055fc9 \ --hash=sha256:b49750419d300e2b5a3813cf229d4e5a4c728dae470bcc89867a9ad6f25a722d \ --hash=sha256:d31b97b3de0f61571a124a00ffe9a81fb9939146c122c11060725bd5aea79975 \ --hash=sha256:d70e77c55ae8380c91c0c18dea05951482e263982911fc7410b1ffd1dadd3440 \ --hash=sha256:d9907d61f15bf7261d7e775bd5d7ee4d2930e04424bab1972591918497623a16 \ --hash=sha256:da5baeaf7116dced9c6bb76dc31ba04a2dc3695f3d9f74741d7910122b456edc \ --hash=sha256:dc74c035f9bfca0255c1af77ddd2d6ae8419012805453e4b0e7513e17904545d \ --hash=sha256:dcafc12c30dbaf1e2af0490978352e0c4041a7cde31f4f81435c2a5e8b9cabb6 \ --hash=sha256:ee44d0f85b803321710f9239f335aafe16553b39106384cef8e6de40cb4ef2f6 \ --hash=sha256:f66a6bbe741bd431f6d741e617e0f39ec7257ca1f89089593479347cc4d13324 # via black requests==2.32.4 \ --hash=sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c \ --hash=sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422 # via # -r requirements_formatting.txt.in # pygithub toml==0.10.2 \ --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ 
--hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f # via # darker # darkgraylib typing-extensions==4.14.1 \ --hash=sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36 \ --hash=sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76 # via pygithub urllib3==2.6.3 \ --hash=sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed \ --hash=sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4 # via # -r requirements_formatting.txt.in # pygithub # requests wrapt==1.15.0 \ --hash=sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0 \ --hash=sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420 \ --hash=sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a \ --hash=sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c \ --hash=sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079 \ --hash=sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923 \ --hash=sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f \ --hash=sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1 \ --hash=sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8 \ --hash=sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86 \ --hash=sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0 \ --hash=sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364 \ --hash=sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e \ --hash=sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c \ --hash=sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e \ --hash=sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c \ --hash=sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727 \ 
--hash=sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff \ --hash=sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e \ --hash=sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29 \ --hash=sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7 \ --hash=sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72 \ --hash=sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475 \ --hash=sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a \ --hash=sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317 \ --hash=sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2 \ --hash=sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd \ --hash=sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640 \ --hash=sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98 \ --hash=sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248 \ --hash=sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e \ --hash=sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d \ --hash=sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec \ --hash=sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1 \ --hash=sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e \ --hash=sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9 \ --hash=sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92 \ --hash=sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb \ --hash=sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094 \ --hash=sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46 \ --hash=sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29 \ 
--hash=sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd \ --hash=sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705 \ --hash=sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8 \ --hash=sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975 \ --hash=sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb \ --hash=sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e \ --hash=sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b \ --hash=sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418 \ --hash=sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019 \ --hash=sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1 \ --hash=sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba \ --hash=sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6 \ --hash=sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2 \ --hash=sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3 \ --hash=sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7 \ --hash=sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752 \ --hash=sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416 \ --hash=sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f \ --hash=sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1 \ --hash=sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc \ --hash=sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145 \ --hash=sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee \ --hash=sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a \ --hash=sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7 \ 
--hash=sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b \ --hash=sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653 \ --hash=sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0 \ --hash=sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90 \ --hash=sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29 \ --hash=sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6 \ --hash=sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034 \ --hash=sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09 \ --hash=sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559 \ --hash=sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639 # via deprecated ================================================ FILE: External/code-format-helper/requirements_formatting.txt.in ================================================ black>=26.3.1 darker==2.1.1 PyGithub==2.6.1 cryptography>=46.0.5 urllib3>=2.6.3 requests>=2.32.4 idna>=3.7 certifi>=2024.7.4 PyNaCl>=1.6.2 PyJWT>=2.12.1 ================================================ FILE: External/tiny-json/CMakeLists.txt ================================================ set(NAME tiny-json) set(SRCS tiny-json.c) add_library(${NAME} STATIC ${SRCS}) target_include_directories(${NAME} PUBLIC ${CMAKE_CURRENT_LIST_DIR}) add_library(${NAME}::${NAME} ALIAS ${NAME}) ================================================ FILE: External/tiny-json/LICENSE ================================================ MIT License Copyright (c) 2018 Rafa Garcia Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to 
whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: External/tiny-json/tiny-json.c ================================================ /* Licensed under the MIT License . SPDX-License-Identifier: MIT Copyright (c) 2016-2018 Rafa Garcia . Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include // For NULL #include "tiny-json.h" /** Structure to handle a heap of JSON properties. */ typedef struct jsonStaticPool_s { json_t* const mem; /**< Pointer to array of json properties. */ unsigned int const qty; /**< Length of the array of json properties. */ unsigned int nextFree; /**< The index of the next free json property. */ jsonPool_t pool; } jsonStaticPool_t; /* Search a property by its name in a JSON object. */ json_t const* json_getProperty( json_t const* obj, char const* property ) { json_t const* sibling; for( sibling = obj->u.c.child; sibling; sibling = sibling->sibling ) if ( sibling->name && !strcmp( sibling->name, property ) ) return sibling; return 0; } /* Search a property by its name in a JSON object and return its value. */ char const* json_getPropertyValue( json_t const* obj, char const* property ) { json_t const* field = json_getProperty( obj, property ); if ( !field ) return 0; jsonType_t type = json_getType( field ); if ( JSON_ARRAY >= type ) return 0; return json_getValue( field ); } /* Internal prototypes: */ static char* goBlank( char* str ); static char* goNum( char* str ); static json_t* poolInit( jsonPool_t* pool ); static json_t* poolAlloc( jsonPool_t* pool ); static char* objValue( char* ptr, json_t* obj, jsonPool_t* pool ); static char* setToNull( char* ch ); static bool isEndOfPrimitive( char ch ); /* Parse a string to get a json. */ json_t const* json_createWithPool( char *str, jsonPool_t *pool ) { char* ptr = goBlank( str ); if ( !ptr || *ptr != '{' ) return 0; json_t* obj = pool->init( pool ); obj->name = 0; obj->sibling = 0; obj->u.c.child = 0; ptr = objValue( ptr, obj, pool ); if ( !ptr ) return 0; return obj; } /* Parse a string to get a json. 
*/
json_t const* json_create( char* str, json_t mem[], unsigned int qty ) {
    /* Wrap the caller's array in a static pool; nextFree starts at zero
     * because it is omitted from the initializer. */
    jsonStaticPool_t spool = {
        .mem  = mem,
        .qty  = qty,
        .pool = { .init = poolInit, .alloc = poolAlloc }
    };
    json_t const* const root = json_createWithPool( str, &spool.pool );
    return root;
}

/** Translate the character that follows a backslash into the character it
  * stands for. Examples: 'b' -> '\b', 'n' -> '\n', 't' -> '\t'
  * @param ch The escape selector (the character after the backslash).
  * @return The decoded character, or '\0' when ch is not a known selector. */
static char getEscape( char ch ) {
    switch( ch ) {
        case '\"': return '\"';
        case '\\': return '\\';
        case '/':  return '/';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        default:   return '\0';
    }
}

/** Validate the four characters of a unicode escape.
  * The code point itself is not decoded; a placeholder is returned instead.
  * @param str Pointer to first digit.
  * @retval '?' If the four characters are hexadecimal digits.
  * @retval '\0' In other cases. */
static unsigned char getCharFromUnicode( unsigned char const* str ) {
    unsigned int remaining = 4;
    while( remaining-- ) {
        if ( !isxdigit( *str++ ) )
            return '\0';
    }
    return '?';
}

/** Decode a quoted string in place, replacing escape sequences by the
  * characters they represent. Decoding stops at the closing '\"', which is
  * replaced by '\0'.
  * @param str Pointer to first character (just after the opening quote).
  * @retval Pointer to the character after the closing quote. If success.
  * @retval Null pointer on an unknown escape, a malformed unicode escape or
  *         a character below ' ' (which includes the end of the buffer). */
static char* parseString( char* str ) {
    unsigned char* src = (unsigned char*)str; /* read cursor  */
    unsigned char* dst = (unsigned char*)str; /* write cursor */
    while( *src >= ' ' ) {
        unsigned char const cur = *src;
        if ( cur == '\"' ) {
            *dst = '\0';
            return (char*)( src + 1 );
        }
        if ( cur != '\\' ) {
            *dst = cur;
        }
        else {
            ++src; /* step onto the escape selector */
            if ( *src == 'u' ) {
                char const decoded = getCharFromUnicode( src + 1 );
                if ( decoded == '\0' ) return 0;
                *dst = decoded;
                src += 4; /* leave the cursor on the last hex digit */
            }
            else {
                char const decoded = getEscape( *src );
                if ( decoded == '\0' ) return 0;
                *dst = decoded;
            }
        }
        ++src;
        ++dst;
    }
    return 0;
}

/** Parse a string to get the name of a property.
  * @param str Pointer to first character.
* @param property The property to assign the name. * @retval Pointer to first of property value. If success. * @retval Null pointer if any error occur. */ static char* propertyName( char* ptr, json_t* property ) { property->name = ++ptr; ptr = parseString( ptr ); if ( !ptr ) return 0; ptr = goBlank( ptr ); if ( !ptr ) return 0; if ( *ptr++ != ':' ) return 0; return goBlank( ptr ); } /** Parse a string to get the value of a property when its type is JSON_TEXT. * @param str Pointer to first character ('\"'). * @param property The property to assign the name. * @retval Pointer to first non white space after the string. If success. * @retval Null pointer if any error occur. */ static char* textValue( char* ptr, json_t* property ) { ++property->u.value; ptr = parseString( ++ptr ); if ( !ptr ) return 0; property->type = JSON_TEXT; return ptr; } /** Compare two strings until get the null character in the second one. * @param ptr sub string * @param str main string * @retval Pointer to next character. * @retval Null pointer if any error occur. */ static char* checkStr( char* ptr, char const* str ) { while( *str ) if ( *ptr++ != *str++ ) return 0; return ptr; } /** Parser a string to get a primitive value. * If the first character after the value is different of '}' or ']' is set to '\0'. * @param str Pointer to first character. * @param property Property handler to set the value and the type, (true, false or null). * @param value String with the primitive literal. * @param type The code of the type. ( JSON_BOOLEAN or JSON_NULL ) * @retval Pointer to first non white space after the string. If success. * @retval Null pointer if any error occur. */ static char* primitiveValue( char* ptr, json_t* property, char const* value, jsonType_t type ) { ptr = checkStr( ptr, value ); if ( !ptr || !isEndOfPrimitive( *ptr ) ) return 0; ptr = setToNull( ptr ); property->type = type; return ptr; } /** Parser a string to get a true value. 
* If the first character after the value is different of '}' or ']' is set to '\0'. * @param str Pointer to first character. * @param property Property handler to set the value and the type, (true, false or null). * @retval Pointer to first non white space after the string. If success. * @retval Null pointer if any error occur. */ static char* trueValue( char* ptr, json_t* property ) { return primitiveValue( ptr, property, "true", JSON_BOOLEAN ); } /** Parser a string to get a false value. * If the first character after the value is different of '}' or ']' is set to '\0'. * @param str Pointer to first character. * @param property Property handler to set the value and the type, (true, false or null). * @retval Pointer to first non white space after the string. If success. * @retval Null pointer if any error occur. */ static char* falseValue( char* ptr, json_t* property ) { return primitiveValue( ptr, property, "false", JSON_BOOLEAN ); } /** Parser a string to get a null value. * If the first character after the value is different of '}' or ']' is set to '\0'. * @param str Pointer to first character. * @param property Property handler to set the value and the type, (true, false or null). * @retval Pointer to first non white space after the string. If success. * @retval Null pointer if any error occur. */ static char* nullValue( char* ptr, json_t* property ) { return primitiveValue( ptr, property, "null", JSON_NULL ); } /** Analyze the exponential part of a real number. * @param str Pointer to first character. * @retval Pointer to first non numerical after the string. If success. * @retval Null pointer if any error occur. */ static char* expValue( char* ptr ) { if ( *ptr == '-' || *ptr == '+' ) ++ptr; if ( !isdigit( *ptr ) ) return 0; ptr = goNum( ++ptr ); return ptr; } /** Analyze the decimal part of a real number. * @param str Pointer to first character. * @retval Pointer to first non numerical after the string. If success. * @retval Null pointer if any error occur. 
*/
static char* fraqValue( char* ptr ) {
    /* Cast first: isdigit() on a negative plain char is undefined behaviour. */
    if ( !isdigit( (unsigned char)*ptr ) ) return 0;
    /* goNum() returns a null pointer when the string ends; pass that through. */
    return goNum( ++ptr );
}

/** Parse a string to get a numerical value.
 * If the first character after the value is not '}' or ']', it is set to '\0'.
 * Leading zeros are rejected, and an integer that does not fit in a signed
 * 64-bit value is treated as an error.
 * @param ptr Pointer to first character.
 * @param property Property handler to set the type: JSON_REAL or JSON_INTEGER.
 *                 Its u.value must already point to the first character of the
 *                 number; it is used as the start of the token for the range check.
 * @retval Pointer to first non white space after the string. If success.
 * @retval Null pointer if any error occurs.
 */
static char* numValue( char* ptr, json_t* property ) {
    if ( *ptr == '-' ) ++ptr;
    /* Casts: <ctype.h> classifiers are undefined for negative plain char values. */
    if ( !isdigit( (unsigned char)*ptr ) ) return 0;
    if ( *ptr != '0' ) {
        ptr = goNum( ptr );
        if ( !ptr ) return 0;
    }
    else if ( isdigit( (unsigned char)*++ptr ) ) return 0; /* "0" followed by a digit */
    property->type = JSON_INTEGER;
    if ( *ptr == '.' ) {
        ptr = fraqValue( ++ptr );
        if ( !ptr ) return 0;
        property->type = JSON_REAL;
    }
    if ( *ptr == 'e' || *ptr == 'E' ) {
        ptr = expValue( ++ptr );
        if ( !ptr ) return 0;
        property->type = JSON_REAL;
    }
    if ( !isEndOfPrimitive( *ptr ) ) return 0;
    if ( JSON_INTEGER == property->type ) {
        char const* value = property->u.value;
        bool const negative = *value == '-';
        static char const min[] = "-9223372036854775808";
        static char const max[] = "9223372036854775807";
        unsigned int const maxdigits = ( negative? sizeof min: sizeof max ) - 1;
        unsigned int const len = ptr - value;
        if ( len > maxdigits ) return 0;
        if ( len == maxdigits ) {
            /* Same length as the limit: a lexicographic comparison of the digits
             * is a numeric comparison. strncmp() bounds the comparison to the
             * token, so the input does not have to be temporarily terminated
             * (and is never left modified on the error path). */
            char const* const threshold = negative ? min: max;
            if ( 0 > strncmp( threshold, value, len ) ) return 0;
        }
    }
    ptr = setToNull( ptr );
    return ptr;
}

/** Add a property to a JSON object or array.
 * @param obj The handler of the JSON object or array.
 * @param property The handler of the property to be added.
*/
static void add( json_t* obj, json_t* property ) {
    /* The new property becomes the tail of the child list. */
    property->sibling = 0;
    if ( !obj->u.c.child ){
        /* First child: it is both head and tail. */
        obj->u.c.child = property;
        obj->u.c.last_child = property;
    }
    else {
        /* Append after the current tail, then move the tail. */
        obj->u.c.last_child->sibling = property;
        obj->u.c.last_child = property;
    }
}

/** Parse a string to get a json object value.
 * The parser is iterative: nested objects and arrays are handled in the same
 * loop. While a nested container is open, its own sibling field temporarily
 * holds the pointer to its parent container; the field is reset to null when
 * the container is closed.
 * @param ptr Pointer to first character (the opening brace of the root object).
 * @param obj The handler that receives the root object.
 * @param pool The handler of a json pool for creating json instances.
 * @retval Pointer to first character after the value. If success.
 * @retval Null pointer if any error occurs.
 */
static char* objValue( char* ptr, json_t* obj, jsonPool_t* pool ) {
    obj->type    = JSON_OBJ;
    obj->u.c.child = 0;
    obj->sibling = 0;   /* null parent link marks the root container */
    ptr++;              /* step over the opening character */
    for(;;) {
        ptr = goBlank( ptr );
        if ( !ptr ) return 0;
        /* Commas are skipped wherever they appear between values. */
        if ( *ptr == ',' ) {
            ++ptr;
            continue;
        }
        char const endchar = ( obj->type == JSON_OBJ )? '}': ']';
        if ( *ptr == endchar ) {
            /* Overwrite the closing character so a preceding primitive is terminated. */
            *ptr = '\0';
            json_t* parentObj = obj->sibling;
            /* No parent: the root object was closed, parsing is finished. */
            if ( !parentObj ) return ++ptr;
            /* Drop the temporary parent link and continue in the parent. */
            obj->sibling = 0;
            obj = parentObj;
            ++ptr;
            continue;
        }
        json_t* property = pool->alloc( pool );
        if ( !property ) return 0;
        if( obj->type != JSON_ARRAY ) {
            /* Members of an object must start with a quoted name. */
            if ( *ptr != '\"' ) return 0;
            ptr = propertyName( ptr, property );
            if ( !ptr ) return 0;
        }
        else property->name = 0;    /* array elements are unnamed */
        add( obj, property );
        /* Start of the value text; overwritten below for containers (same union). */
        property->u.value = ptr;
        switch( *ptr ) {
            case '{':
                property->type    = JSON_OBJ;
                property->u.c.child = 0;
                property->sibling = obj;    /* temporary parent link */
                obj = property;             /* descend into the new object */
                ++ptr;
                break;
            case '[':
                property->type    = JSON_ARRAY;
                property->u.c.child = 0;
                property->sibling = obj;    /* temporary parent link */
                obj = property;             /* descend into the new array */
                ++ptr;
                break;
            case '\"': ptr = textValue( ptr, property );  break;
            case 't':  ptr = trueValue( ptr, property );  break;
            case 'f':  ptr = falseValue( ptr, property ); break;
            case 'n':  ptr = nullValue( ptr, property );  break;
            default:   ptr = numValue( ptr, property );   break;
        }
        if ( !ptr ) return 0;
    }
}

/** Initialize a json pool.
 * @param pool The handler of the pool.
 * @return a instance of a json.
*/
static json_t* poolInit( jsonPool_t* pool ) {
    jsonStaticPool_t* const owner = json_containerOf( pool, jsonStaticPool_t, pool );
    /* Element 0 is handed out here, so the next free slot is index 1. */
    owner->nextFree = 1;
    return owner->mem;
}

/** Take the next unused json instance from a static pool.
 * @param pool The handler of the pool.
 * @retval The handler of the new instance if success.
 * @retval Null pointer if every element of the pool is already in use.
 */
static json_t* poolAlloc( jsonPool_t* pool ) {
    jsonStaticPool_t* const owner = json_containerOf( pool, jsonStaticPool_t, pool );
    if ( owner->nextFree < owner->qty )
        return owner->mem + owner->nextFree++;
    return 0;
}

/** Tell whether a character is a member of a set.
 * @param ch Character value to be checked.
 * @param set Set of characters. It is just a null-terminated string.
 * @return true if ch is found in set, false otherwise. */
static bool isOneOfThem( char ch, char const* set ) {
    for( char const* member = set; *member != '\0'; ++member )
        if ( *member == ch )
            return true;
    return false;
}

/** Advance a pointer past every character that belongs to a set.
 * @param str The initial pointer value.
 * @param set Set of characters. It is just a null-terminated string.
 * @return Pointer to the first character not in the set, or null pointer
 *         if the null character was reached first. */
static char* goWhile( char* str, char const* set ) {
    while( *str != '\0' ) {
        if ( !isOneOfThem( *str, set ) )
            return str;
        ++str;
    }
    return 0;
}

/** Characters that are treated as white space. */
static char const* const blank = " \n\r\t\f";

/** Advance a pointer past white space characters.
 * @param str The initial pointer value.
 * @return Pointer to the first non-blank character, or null pointer
 *         if the null character was reached first. */
static char* goBlank( char* str ) {
    return goWhile( str, blank );
}

/** Increases a pointer while it points to a decimal digit character.
 * @param str The initial pointer value.
 * @return The final pointer value or null pointer if the null character was found.
*/ static char* goNum( char* str ) { for( ; *str != '\0'; ++str ) { if ( !isdigit( *str ) ) return str; } return 0; } /** Set of characters that defines the end of an array or a JSON object. */ static char const* const endofblock = "}]"; /** Set a char to '\0' and increase its pointer if the char is different to '}' or ']'. * @param ch Pointer to character. * @return Final value pointer. */ static char* setToNull( char* ch ) { if ( !isOneOfThem( *ch, endofblock ) ) *ch++ = '\0'; return ch; } /** Indicate if a character is the end of a primitive value. */ static bool isEndOfPrimitive( char ch ) { return ch == ',' || isOneOfThem( ch, blank ) || isOneOfThem( ch, endofblock ); } /** Add a character at the end of a string. * @param dest Pointer to the null character of the string * @param ch Value to be added. * @return Pointer to the null character of the destination string. */ static char* chtoa( char* dest, char ch ) { *dest = ch; *++dest = '\0'; return dest; } /** Copy a null-terminated string. * @param dest Destination memory block. * @param src Source string. * @return Pointer to the null character of the destination string. */ static char* atoa( char* dest, char const* src ) { for( ; *src != '\0'; ++dest, ++src ) *dest = *src; *dest = '\0'; return dest; } /* Open a JSON object in a JSON string. */ char* json_objOpen( char* dest, char const* name ) { if ( NULL == name ) dest = chtoa( dest, '{' ); else { dest = chtoa( dest, '\"' ); dest = atoa( dest, name ); dest = atoa( dest, "\":{" ); } return dest; } /* Close a JSON object in a JSON string. */ char* json_objClose( char* dest ) { if ( dest[-1] == ',' ) --dest; return atoa( dest, "}," ); } /* Open an array in a JSON string. */ char* json_arrOpen( char* dest, char const* name ) { if ( NULL == name ) dest = chtoa( dest, '[' ); else { dest = chtoa( dest, '\"' ); dest = atoa( dest, name ); dest = atoa( dest, "\":[" ); } return dest; } /* Close an array in a JSON string. 
*/ char* json_arrClose( char* dest ) { if ( dest[-1] == ',' ) --dest; return atoa( dest, "]," ); } /** Add the name of a text property. * @param dest Destination memory. * @param name The name of the property. * @return Pointer to the next char. */ static char* strname( char* dest, char const* name ) { dest = chtoa( dest, '\"' ); if ( NULL != name ) { dest = atoa( dest, name ); dest = atoa( dest, "\":\"" ); } return dest; } /** Get the hexadecimal digit of the least significant nibble of a integer. */ static int nibbletoch( int nibble ) { return "0123456789ABCDEF"[ nibble % 16u ]; } /** Get the escape character of a non-printable. * @param ch Character source. * @return The escape character or null character if error. */ static int escape( int ch ) { static struct { char code; char ch; } const pair[] = { { '\"', '\"' }, { '\\', '\\' }, { '/', '/' }, { 'b', '\b' }, { 'f', '\f' }, { 'n', '\n' }, { 'r', '\r' }, { 't', '\t' }, }; for( int i = 0; i < sizeof pair / sizeof *pair; ++i ) if ( ch == pair[i].ch ) return pair[i].code; return '\0'; } /** Copy a null-terminated string inserting escape characters if needed. * @param dest Destination memory block. * @param src Source string. * @return Pointer to the null character of the destination string. */ static char* atoesc( char* dest, char const* src ) { for( ; *src != '\0'; ++dest, ++src ) { if ( *src >= ' ' && *src != '\"' && *src != '\\' && *src != '/' ) *dest = *src; else { *dest++ = '\\'; int const esc = escape( *src ); if ( esc ) *dest = esc; else { *dest++ = 'u'; *dest++ = '0'; *dest++ = '0'; *dest++ = nibbletoch( *src / 16 ); *dest++ = nibbletoch( *src ); } } } *dest = '\0'; return dest; } /* Add a text property in a JSON string. */ char* json_str( char* dest, char const* name, char const* value ) { dest = strname( dest, name ); dest = atoesc( dest, value ); dest = atoa( dest, "\"," ); return dest; } /** Add the name of a primitive property. * @param dest Destination memory. * @param name The name of the property. 
* @return Pointer to the next char. */ static char* primitivename( char* dest, char const* name ) { if( NULL == name ) return dest; dest = chtoa( dest, '\"' ); dest = atoa( dest, name ); dest = atoa( dest, "\":" ); return dest; } /* Add a boolean property in a JSON string. */ char* json_bool( char* dest, char const* name, int value ) { dest = primitivename( dest, name ); dest = atoa( dest, value ? "true," : "false," ); return dest; } /* Add a null property in a JSON string. */ char* json_null( char* dest, char const* name ) { dest = primitivename( dest, name ); dest = atoa( dest, "null," ); return dest; } /* Used to finish the root JSON object. After call json_objClose(). */ char* json_end( char* dest ) { if ( ',' == dest[-1] ) { dest[-1] = '\0'; --dest; } return dest; } #define ALL_TYPES \ X( json_int, int, "%d" ) \ X( json_long, long, "%ld" ) \ X( json_uint, unsigned int, "%u" ) \ X( json_ulong, unsigned long, "%lu" ) \ X( json_verylong, long long, "%lld" ) \ X( json_double, double, "%g" ) \ #define json_num( funcname, type, fmt ) \ char* funcname( char* dest, char const* name, type value ) { \ dest = primitivename( dest, name ); \ dest += sprintf( dest, fmt, value ); \ dest = chtoa( dest, ',' ); \ return dest; \ } #define X( name, type, fmt ) json_num( name, type, fmt ) ALL_TYPES #undef X ================================================ FILE: External/tiny-json/tiny-json.h ================================================ /* Licensed under the MIT License . SPDX-License-Identifier: MIT Copyright (c) 2016-2018 Rafa Garcia . 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef _TINY_JSON_H_ #define _TINY_JSON_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #include #define json_containerOf( ptr, type, member ) \ ((type*)( (char*)ptr - offsetof( type, member ) )) /** @defgroup tinyJson Tiny JSON parser. * @{ */ /** Enumeration of codes of supported JSON properties types. */ typedef enum { JSON_OBJ, JSON_ARRAY, JSON_TEXT, JSON_BOOLEAN, JSON_INTEGER, JSON_REAL, JSON_NULL } jsonType_t; /** Structure to handle JSON properties. */ typedef struct json_s { struct json_s* sibling; const char* name; union { const char* value; struct { struct json_s* child; struct json_s* last_child; } c; } u; jsonType_t type; } json_t; /** Parse a string to get a json. * @param str String pointer with a JSON object. It will be modified. * @param mem Array of json properties to allocate. * @param qty Number of elements of mem. * @retval Null pointer if any was wrong in the parse process. 
* @retval If the parser process was successfully a valid handler of a json. * This property is always unnamed and its type is JSON_OBJ. */ const json_t* json_create(char* str, json_t mem[], unsigned int qty); /** Get the name of a json property. * @param json A valid handler of a json property. * @retval Pointer to null-terminated if property has name. * @retval Null pointer if the property is unnamed. */ static inline const char* json_getName(const json_t* json) { return json->name; } /** Get the value of a json property. * The type of property cannot be JSON_OBJ or JSON_ARRAY. * @param json A valid handler of a json property. * @return Pointer to null-terminated string with the value. */ static inline const char* json_getValue(const json_t* property) { return property->u.value; } /** Get the type of a json property. * @param json A valid handler of a json property. * @return The code of type.*/ static inline jsonType_t json_getType(const json_t* json) { return json->type; } /** Get the next sibling of a JSON property that is within a JSON object or array. * @param json A valid handler of a json property. * @retval The handler of the next sibling if found. * @retval Null pointer if the json property is the last one. */ static inline const json_t* json_getSibling(const json_t* json) { return json->sibling; } /** Search a property by its name in a JSON object. * @param obj A valid handler of a json object. Its type must be JSON_OBJ. * @param property The name of property to get. * @retval The handler of the json property if found. * @retval Null pointer if not found. */ const json_t* json_getProperty(const json_t* obj, const char* property); /** Search a property by its name in a JSON object and return its value. * @param obj A valid handler of a json object. Its type must be JSON_OBJ. * @param property The name of property to get. * @retval If found a pointer to null-terminated string with the value. 
* @retval Null pointer if not found or it is an array or an object. */ const char* json_getPropertyValue(const json_t* obj, const char* property); /** Get the first property of a JSON object or array. * @param json A valid handler of a json property. * Its type must be JSON_OBJ or JSON_ARRAY. * @retval The handler of the first property if there is. * @retval Null pointer if the json object has not properties. */ static inline const json_t* json_getChild(const json_t* json) { return json->u.c.child; } /** Get the value of a json boolean property. * @param property A valid handler of a json object. Its type must be JSON_BOOLEAN. * @return The value stdbool. */ static inline bool json_getBoolean(const json_t* property) { return *property->u.value == 't'; } /** Get the value of a json integer property. * @param property A valid handler of a json object. Its type must be JSON_INTEGER. * @return The value stdint. */ static inline int64_t json_getInteger(const json_t* property) { return atoll( property->u.value ); } /** Get the value of a json real property. * @param property A valid handler of a json object. Its type must be JSON_REAL. * @return The value. */ static inline double json_getReal(const json_t* property) { return atof( property->u.value ); } /** Structure to handle a heap of JSON properties. */ typedef struct jsonPool_s jsonPool_t; struct jsonPool_s { json_t* (*init)( jsonPool_t* pool ); json_t* (*alloc)( jsonPool_t* pool ); }; /** Parse a string to get a json. * @param str String pointer with a JSON object. It will be modified. * @param pool Custom json pool pointer. * @retval Null pointer if any was wrong in the parse process. * @retval If the parser process was successfully a valid handler of a json. * This property is always unnamed and its type is JSON_OBJ. */ const json_t* json_createWithPool(char* str, jsonPool_t* pool); /** @ } */ /** @defgroup makejoson Make JSON. * @{ */ /** Open a JSON object in a JSON string. 
* @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @return Pointer to the new end of JSON under construction. */ char* json_objOpen(char* dest, const char* name); /** Close a JSON object in a JSON string. * @param dest Pointer to the end of JSON under construction. * @return Pointer to the new end of JSON under construction. */ char* json_objClose(char* dest); /** Used to finish the root JSON object. After call json_objClose(). * @param dest Pointer to the end of JSON under construction. * @return Pointer to the new end of JSON under construction. */ char* json_end(char* dest); /** Open an array in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @return Pointer to the new end of JSON under construction. */ char* json_arrOpen(char* dest, const char* name); /** Close an array in a JSON string. * @param dest Pointer to the end of JSON under construction. * @return Pointer to the new end of JSON under construction. */ char* json_arrClose(char* dest); /** Add a text property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value A valid null-terminated string with the value. * Backslash escapes will be added for special characters. * @return Pointer to the new end of JSON under construction. */ char* json_str(char* dest, const char* name, const char* value); /** Add a boolean property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Zero for false. Non zero for true. * @return Pointer to the new end of JSON under construction. */ char* json_bool(char* dest, const char* name, int value); /** Add a null property in a JSON string. 
* @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @return Pointer to the new end of JSON under construction. */ char* json_null(char* dest, const char* name); /** Add an integer property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. */ char* json_int(char* dest, const char* name, int value); /** Add an unsigned integer property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. */ char* json_uint(char* dest, const char* name, unsigned int value); /** Add a long integer property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. */ char* json_long(char* dest, const char* name, long int value); /** Add an unsigned long integer property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. */ char* json_ulong(char* dest, const char* name, unsigned long int value); /** Add a long long integer property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. 
*/ char* json_verylong(char* dest, const char* name, long long int value); /** Add a double precision number property in a JSON string. * @param dest Pointer to the end of JSON under construction. * @param name Pointer to null-terminated string or null for unnamed. * @param value Value of the property. * @return Pointer to the new end of JSON under construction. */ char* json_double(char* dest, const char* name, double value); /** @ } */ #ifdef __cplusplus } #endif #endif /* _TINY_JSON_H_ */ ================================================ FILE: FEXCore/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.14) set(PROJECT_NAME FEXCore) project(${PROJECT_NAME} VERSION 0.01 LANGUAGES CXX) if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") set(ARCHITECTURE_x86_64 1) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcx16") endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64|^arm64|^armv8\.*") set(ARCHITECTURE_arm64 1) endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) cmake_policy(SET CMP0083 NEW) # Follow new PIE policy include(CheckPIESupported) check_pie_supported() set(CMAKE_INCLUDE_CURRENT_DIR ON) include(CheckCXXCompilerFlag) include(CheckIncludeFileCXX) include(CheckCXXSourceCompiles) set(CMAKE_CXX_STANDARD 20) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/git_version.h.in ${CMAKE_BINARY_DIR}/generated/git_version.h) include_directories(${CMAKE_BINARY_DIR}/generated) # Disable strict aliasing for all build modes # See discussion in https://github.com/FEX-Emu/FEX/pull/4494#issuecomment-2800608944 # for background context. 
add_compile_options($<$:-fno-strict-aliasing> $<$:-fno-exceptions>) add_subdirectory(Source/) if (NOT BUILD_STEAM_SUPPORT) install (DIRECTORY include/FEXCore ${CMAKE_BINARY_DIR}/include/FEXCore DESTINATION include COMPONENT Development) endif() if (BUILD_TESTING) add_subdirectory(unittests/) endif() ================================================ FILE: FEXCore/LICENSE ================================================ MIT License Copyright (c) 2019 Ryan Houdek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: FEXCore/Readme.md ================================================ # FEXCore - Fast x86 Core emulation library This is the core emulation library that is used for the FEX emulator project. This project aims to provide a fast and functional x86-64 emulation library that can meet and surpass other x86-64 emulation libraries. 
### Goals
* Be as fast as possible, beating and exceeding current options for x86-64 emulation
  * Running at 25% - 50% lower performance than native code is the desired target
* Use an IR to efficiently translate x86-64 to our host architecture
* Support a tiered recompiler to allow for fast runtime performance
* Support offline compilation and offline tooling for inspection and performance analysis
* Support threaded emulation, including emulating x86-64's strong memory model on weak memory model architectures
* Support a significant portion of the x86-64 instruction space
  * Including MMX, SSE, SSE2, SSE3, SSSE3, and SSE4*
* Support fallback routines for uncommonly used x86-64 instructions
  * Including x87 and 3DNow!
* Only support userspace emulation
  * All x86-64 instructions run as if they are under the CPL 3 (userland) privilege level
* Minimal Linux syscall emulation for testing purposes
* Portable library implementation in order to support easy integration into applications

### Target Host Architecture
The target host architecture for this library is AArch64, specifically ARMv8.1 or newer.
The CPU IR is designed with AArch64 in mind but should allow for other architectures as well.
x86-64 host support is available for ease of development, but is not a priority.

### Not desired * Kernel space emulation * CPL0-2 emulation * Real Mode, Protected Mode, Virtual-8086 Mode, System Management Mode * IRQs * SVM * "Cycle Accurate" emulation ================================================ FILE: FEXCore/Scripts/config_generator.py ================================================ import datetime import json import sys def print_header(): header = '''#ifndef OPT_BASE #define OPT_BASE(type, group, enum, json, default) #endif #ifndef OPT_BOOL #define OPT_BOOL(group, enum, json, default) OPT_BASE(bool, group, enum, json, default) #endif #ifndef OPT_UINT8 #define OPT_UINT8(group, enum, json, default) OPT_BASE(uint8_t, group, enum, json, default) #endif #ifndef OPT_INT32 #define OPT_INT32(group, enum, json, default) OPT_BASE(int32_t, group, enum, json, default) #endif #ifndef OPT_UINT32 #define OPT_UINT32(group, enum, json, default) OPT_BASE(uint32_t, group, enum, json, default) #endif #ifndef OPT_UINT64 #define OPT_UINT64(group, enum, json, default) OPT_BASE(uint64_t, group, enum, json, default) #endif #ifndef OPT_STR #define OPT_STR(group, enum, json, default) OPT_BASE(fextl::string, group, enum, json, default) #endif #ifndef OPT_STRARRAY #define OPT_STRARRAY(group, enum, json, default) OPT_BASE(fextl::string, group, enum, json, default) #endif #ifndef OPT_STRENUM #define OPT_STRENUM(group, enum, json, default) OPT_BASE(uint64_t, group, enum, json, default) #endif ''' output_file.write(header) def print_tail(): tail = '''#undef OPT_BASE #undef OPT_BOOL #undef OPT_UINT8 #undef OPT_INT32 #undef OPT_UINT32 #undef OPT_UINT64 #undef OPT_STR #undef OPT_STRARRAY #undef OPT_STRENUM ''' output_file.write(tail) def print_config(type, group_name, json_name, default_value): output_file.write("OPT_{0} ({1}, {2}, {3}, {4})\n".format(type.upper(), group_name.upper(), json_name.upper(), json_name, default_value)) def print_options(options): for op_group, group_vals in options.items(): for op_key, op_vals in group_vals.items(): default = 
op_vals["Default"] if (op_vals["Type"] == "str" or op_vals["Type"] == "strarray"): # Wrap the string argument in quotes default = "\"" + default + "\"" print_config( op_vals["Type"], op_group, op_key, default) output_file.write("\n") def print_unnamed_options(options): output_file.write("// Unnamed configuration options\n") for op_group, group_vals in options.items(): for op_key, op_vals in group_vals.items(): default = op_vals["Default"] if (op_vals["Type"] == "str" or op_vals["Type"] == "strarray"): # Wrap the string argument in quotes default = "\"" + default + "\"" print_config( op_vals["Type"], op_group, op_key.upper(), # KEY is the enum here, there is no json configuration for these default) output_file.write("\n") def print_man_option(short, long, desc, default): if (short != None): output_man.write(".It Fl {0} , ".format(short)) else: output_man.write(".It ") output_man.write("Fl Fl {0}=".format(long)) output_man.write("\n"); # Print description for line in desc: output_man.write(".Pp\n") output_man.write("{0}\n".format(line)) output_man.write(".Pp\n") output_man.write("\\fBdefault:\\fR {0}\n".format(default)) output_man.write(".Pp\n\n") def print_man_env_option(name, desc, default, no_json_key): output_man.write("\\fBFEX_{0}\\fR\n".format(name.upper())) # Print description for line in desc: output_man.write(".Pp\n") output_man.write("{0}\n".format(line)) if (not no_json_key): output_man.write(".Pp\n") output_man.write("\\fBJSON key:\\fR '{0}'\n".format(name)) output_man.write(".Pp\n\n") output_man.write(".Pp\n") output_man.write("\\fBdefault:\\fR {0}\n".format(default)) output_man.write(".Pp\n\n") def print_man_environment(options): output_man.write(".Sh ENVIRONMENT\n") output_man.write(".Bl -tag -width -indent\n") for op_group, group_vals in options.items(): for op_key, op_vals in group_vals.items(): default = op_vals["Default"] value_type = op_vals["Type"] # Textual default rather than enum based if ("TextDefault" in op_vals): default = 
op_vals["TextDefault"] if (value_type == "str" or value_type == "strarray" or value_type == "strenum"): # Wrap the string argument in quotes default = "'" + default + "'" print_man_env_option( op_key, op_vals["Desc"], default, False ) if (value_type == "strenum"): Enums = op_vals["Enums"] output_man.write("\\fBAvailable Options:\\fR\n") output_man.write(", ".join(f"{enum_op_val}" for [_, enum_op_val] in Enums.items())) output_man.write("\n.sp\n") print_man_environment_tail() output_man.write(".El\n") def print_man_environment_tail(): # Additional environment variables that live outside of the normal loop print_man_env_option( "APP_CONFIG_LOCATION", [ "Allows the user to override where FEX looks for configuration files", "By default FEX will look in ${XDG_CONFIG_HOME, $HOME/.config}/fex-emu/", "This will override the full path", "If FEX_PORTABLE is declared then relative paths are also supported", "For FEX: Relative to the FEX binary", "For WINE: Relative to %LOCALAPPDATA%" ], "''", True) print_man_env_option( "APP_CONFIG", [ "Allows the user to override where FEX looks for only the application config file", "By default FEX will look in ${XDG_CONFIG_HOME, $HOME/.config}/fex-emu/Config.json", "This will override this file location", "One must be careful with this option as it will override any applications that load with execve as well" "If you need to support applications that execve then use FEX_APP_CONFIG_LOCATION instead" "If FEX_PORTABLE is declared then relative paths are also supported", "For FEX: Relative to the FEX binary", "For WINE: Relative to %LOCALAPPDATA%" ], "''", True) print_man_env_option( "APP_DATA_LOCATION", [ "Allows the user to override where FEX looks for data files", "By default FEX will look in {$XDG_DATA_HOME, $HOME/.local/share}/fex-emu/", "This will override the full path", "This is the folder where FEX stores generated files like IR cache" ], "''", True) print_man_env_option( "PORTABLE", [ "Allows FEX to run without installation. 
Global locations for configuration and binfmt_misc are ignored.", "For FEX on Linux:", "These files are instead read from /fex-emu/ by default.", "For Arm64ec/Wow64 WINE builds:", "These files are instead read from $LOCALAPPDATA/fex-emu/ by default.", "For further customization, see FEX_APP_CONFIG_LOCATION and FEX_APP_DATA_LOCATION." ], "''", True) print_man_env_option( "APP_CACHE_LOCATION", [ "Allows the user to override where FEX stores and loads cache files", "By default FEX will look in ${XDG_CACHE_HOME, $HOME/.cache}/fex-emu/", "This will override the full path, trailing forward-slash is expected to exist", ], "''", True) def print_man_header(): header ='''.Dd {0} .Dt FEX .Os Linux .Sh NAME .Nm FEX .Nm FEXBash .Nd Fast x86-64 and x86 emulation. .Sh SYNOPSIS .Nm .Ar ... .Pp .Nm FEXBash .Ar ... .Sh DESCRIPTION FEX allows you to run x86 and x86-64 binaries on an AArch64 host, similar to qemu-user and box86. It has native support for a rootfs overlay, so you don't need to chroot, as well as some thunklibs so it can forward things like GL to the host. FEX presents a Linux 5.0 interface to the guest, and supports both AArch64 and x86-64 as hosts. FEX is very much work in progress, so expect things to change. 
def print_config_option(type, group_name, json_name, default_value, short, choices, desc):
    """Write the argument-parser registration for one config option to output_argloader.

    type          -- option type string; "bool" additionally gets a "--no-<name>" twin
    group_name    -- prefix of the "<group_name>Group" object the option is added to
    json_name     -- option name; lower-cased for the long flag, verbatim for .dest()
    default_value -- text placed inside .set_default(...)
    short         -- optional short flag letter, or None
    choices       -- optional iterable of allowed values (non-bool only), or None
    desc          -- list of help text lines
    """
    emit = output_argloader.write
    is_bool = type == "bool"

    # Short flag (when present) comes first, followed by the long flag.
    flags = []
    if short is not None:
        flags.append("\"-{0}\"".format(short))
    flags.append("\"--{0}\"".format(json_name.lower()))

    emit("{0}Group".format(group_name))
    emit(".add_option({0})".format(", ".join(flags)))
    emit("\n")
    if is_bool:
        emit("\t.action(\"store_true\")\n")
    emit("\t.dest(\"{0}\")\n".format(json_name))

    if not is_bool and choices is not None:
        emit("\t.choices({\n")
        for choice in choices:
            emit("\t\t\"{0}\",\n".format(choice))
        emit("\t})\n")

    # help: multi-line descriptions get an escaped newline appended to each line
    emit("\t.help(\n")
    line_suffix = "\\n" if len(desc) > 1 else ""
    for text in desc:
        emit("\t\t\"{0}{1}\"\n".format(text, line_suffix))
    emit("\t)\n")

    if is_bool:
        emit("\t.set_default({0});\n\n".format(default_value))
        # Bool gets some special handling to add an inverted case,
        # which sets the bool to false
        emit("{0}Group".format(group_name))
        emit(".add_option(\"--no-{0}\")\n".format(json_name.lower()))
        emit("\t.action(\"store_false\")\n")
        emit("\t.dest(\"{0}\");\n".format(json_name))
    else:
        emit("\t.set_default({0});\n".format(default_value))

    emit("\n")
output_argloader.write("\tSet(KeyOption, FEXCore::Config::EnumParser(FEXCore::Config::{}_EnumPairs, Value_View));\n".format(op_key, op_key)) output_argloader.write("}\n") elif (value_type == "strarray"): output_argloader.write("else if (KeyName == \"{0}\") {{\n".format(op_key)) output_argloader.write("\tAppendStrArrayValue(KeyOption, ConfigString);\n") output_argloader.write("}\n") assert op_key is not None, "No options found in JSONLOADER" output_argloader.write("else {\n") output_argloader.write("\tSet(KeyOption, ConfigString);\n") output_argloader.write("}\n") output_argloader.write("#endif\n") def print_parse_enum_options(options): output_argloader.write("#ifdef ENUMDEFINES\n") output_argloader.write("#undef ENUMDEFINES\n") for op_group, group_vals in options.items(): for op_key, op_vals in group_vals.items(): if (op_vals["Type"] == "strenum"): output_argloader.write("enum class {} : uint64_t {{\n".format(op_key)) Enums = op_vals["Enums"] i = 0 # Always have an OFF. output_argloader.write("\tOFF = 0,\n") for enum_op_key, enum_op_vals in Enums.items(): output_argloader.write("\t{} = 1ULL << {},\n".format(enum_op_key.upper(), i)) i += 1 output_argloader.write("};\n") output_argloader.write("FEX_DEF_NUM_OPS({})\n".format(op_key)) for op_group, group_vals in options.items(): for op_key, op_vals in group_vals.items(): if (op_vals["Type"] == "strenum"): Enums = op_vals["Enums"] output_argloader.write("using {}ConfigPair = std::pair;\n".format(op_key, op_key)) output_argloader.write("constexpr static std::array<{}ConfigPair, {}> {}_EnumPairs = {{{{\n".format(op_key, len(Enums) + 1, op_key)) i = 0 # Always have an OFF. 
def get_ir_classes(ops, defines):
    """Collect ops into the module-level OpClasses table, then sort it by class name.

    ops     -- mapping of op class name -> mapping of op name -> op description
    defines -- accepted for call compatibility; not read by this function

    Each op is stored as a two-element list [op name, op description] under
    its class. OpClasses is rebound to a new OrderedDict sorted by key.
    """
    global OpClasses

    for class_name, class_ops in ops.items():
        bucket = OpClasses.setdefault(class_name, [])
        bucket.extend([op_name, op_body] for op_name, op_body in class_ops.items())

    # Sort the dictionary after we are done parsing it
    OpClasses = collections.OrderedDict(sorted(OpClasses.items()))
def print_ir_ops():
    """Write one Markdown section per op class and one subsection per op to output_file.

    Reads the module-level OpClasses table (class name -> list of
    [op name, op description] entries). An op's "Desc" may be a list of
    lines or a single value; ops without "Desc" get a placeholder line.
    """
    for class_name, class_ops in OpClasses.items():
        output_file.write("# %s\n\n" % (class_name))

        for op_name, op_body in class_ops:
            # Heading followed by the op name as a quoted line
            output_file.write("## %s\n" % (op_name))
            output_file.write(">" + op_name + "\n\n")

            if "Desc" not in op_body:
                output_file.write("XXX: Missing op desc!\n")
                continue

            desc = op_body["Desc"]
            if isinstance(desc, list):
                # Each list entry becomes its own paragraph
                output_file.writelines("%s\n\n" % line for line in desc)
            else:
                output_file.write("%s\n" % desc)
@dataclass
class OpDefinition:
    """Mutable record describing one IR op.

    Instances are created with no arguments; every attribute starts at the
    default assigned in __init__ and is filled in afterwards by the caller.
    The hand-written __init__ is kept by @dataclass, so the field
    annotations below only drive the generated __repr__ and __eq__.
    """
    Name: str
    HasDest: bool
    DestType: str
    DestSize: str
    ElementSize: str
    OpClass: str
    HasSideEffects: bool
    ImplicitFlagClobber: bool
    RAOverride: int
    SwitchGen: bool
    ArgPrinter: bool
    SSAArgNum: int
    NonSSAArgNum: int
    DynamicDispatch: bool
    LoweredX87: bool
    JITDispatch: bool
    JITDispatchOverride: str
    TiedSource: int
    Inline: list[str]
    Arguments: list[OpArgument]
    EmitValidation: list[str]
    Desc: list[str]

    def __init__(self):
        self.Name = None
        self.HasDest = False
        self.DestType = None
        self.DestSize = None
        self.ElementSize = None
        self.OpClass = None
        # Not a declared field, so it is absent from the generated repr/eq.
        self.OpSize = 0
        self.HasSideEffects = False
        self.ImplicitFlagClobber = False
        self.RAOverride = -1
        self.SwitchGen = True
        self.ArgPrinter = True
        self.SSAArgNum = 0
        self.NonSSAArgNum = 0
        self.DynamicDispatch = False
        self.LoweredX87 = False
        self.JITDispatch = True
        self.JITDispatchOverride = None
        self.TiedSource = -1
        # Inline is a declared field: it must exist on every instance or the
        # generated __repr__/__eq__ raise AttributeError on a fresh object.
        self.Inline = []
        self.Arguments = []
        self.EmitValidation = []
        self.Desc = []

    def print(self):
        """Print every instance attribute as "name: value", comma separated."""
        attrs = vars(self)
        print(", ".join("%s: %s" % item for item in attrs.items()))
This is a hack, but so is the entire # multi-destination support bolten onto the old IR... # # Named destinations require side effects because they break # SSA hard. Validate that. assert("HasSideEffects" in op_val and op_val["HasSideEffects"]) for Dest in LHS.split(","): Dest = Dest.strip() DType, Name = Dest.split(":$") # If the destination appears also as a source, it is # read-modify-write. if Dest in RHS: # Turn RMW into an in/out source RHS = RHS.replace(Dest.strip(), f"{DType}:$Inout{Name}") else: # Turn named destinations into an out source. RHS += f", {DType}:$Out{Name}" else: # Single anonymous destination if LHS not in ["SSA", "GPR", "GPRPair", "FPR"]: ExitError(f"Unknown destination class type {LHS}. Needs to be one of SSA, GPR, GPRPair, FPR") OpDef.HasDest = True OpDef.DestType = LHS # IR Op needs to start with a name RHS = RHS.split(" ", 1) if len(RHS) < 1: ExitError("Missing IR op name. Needs to be a string") # Set the op name OpDef.Name = RHS[0] # Parse the arguments if len(RHS) > 1: Arguments = RHS[1].strip().split(",") for Argument in Arguments: Argument = Argument.strip() OpArg = OpArgument() Split = Argument.split(":", 1) if len(Split) != 2: ExitError("Error parsing argument. Missing Type and name colon split") # Type is the first argument OpArg.Type = Split[0] # Validate typing is in our type map if not OpArg.Type in IRTypesToCXX: ExitError("IR type {} isn't in IR type map. From IR op {}, argument {}".format(OpArg.Type, OpDef.Name, Argument)) # Style is the first byte of the name if Split[1][0] == "#": OpArg.Temporary = True OpArg.IsSSA = False elif Split[1][0] == "$": OpArg.Temporary = False OpArg.IsSSA = is_ssa_type(OpArg.Type) if OpArg.IsSSA: OpDef.SSAArgNum = OpDef.SSAArgNum + 1 else: OpDef.NonSSAArgNum = OpDef.NonSSAArgNum + 1 else: ExitError("IR Op {} missing value argument style specifier. 
Needs to be one of {{#, $}}".format(OpDef.Name)) Prefix = Split[1][0] ArgName = Split[1][1:] NameWithPrefix = Prefix + ArgName if len(ArgName) == 0: ExitError("Argument is missing variable name") DefaultInit = ArgName.split("{", 1) if len(DefaultInit) > 1: # We have a default initializer, need to do some more work # First argument will still be the argument name ArgName = DefaultInit[0].strip() NameWithPrefix = Prefix + ArgName # Second argument will be the default initializer # Since we stripped the opening curly brace then it'll end with a closing brace if DefaultInit[1][-1] != "}": ExitError("IR op {} Argument {} is missing closing curly brace in default initializer?".format(OpDef.Name, ArgName)) OpArg.DefaultInitializer = DefaultInit[1][:-1] # If SSA type then we can generate validation for this op if OpArg.IsSSA and OpArg.Type in {"GPR", "GPRPair", "FPR"}: OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == RegClass::Invalid || WalkFindRegClass({ArgName}) == RegClass::{OpArg.Type}") OpArg.Name = ArgName OpArg.NameWithPrefix = NameWithPrefix OpDef.Arguments.append(OpArg) # Additional metadata if "DestSize" in op_val: OpDef.DestSize = op_val["DestSize"] if "ElementSize" in op_val: OpDef.ElementSize = op_val["ElementSize"] if len(op_class): OpDef.OpClass = op_class if "HasSideEffects" in op_val: OpDef.HasSideEffects = bool(op_val["HasSideEffects"]) if "ImplicitFlagClobber" in op_val: OpDef.ImplicitFlagClobber = bool(op_val["ImplicitFlagClobber"]) if "ArgPrinter" in op_val: OpDef.ArgPrinter = bool(op_val["ArgPrinter"]) if "RAOverride" in op_val: OpDef.RAOverride = int(op_val["RAOverride"]) if "SwitchGen" in op_val: OpDef.SwitchGen = op_val["SwitchGen"] if "EmitValidation" in op_val: OpDef.EmitValidation.extend(op_val["EmitValidation"]) if "Desc" in op_val: OpDef.Desc = op_val["Desc"] if "DynamicDispatch" in op_val: OpDef.DynamicDispatch = bool(op_val["DynamicDispatch"]) if "JITDispatch" in op_val: OpDef.JITDispatch = bool(op_val["JITDispatch"]) if 
"JITDispatchOverride" in op_val: OpDef.JITDispatchOverride = op_val["JITDispatchOverride"] if "X87" in op_val: OpDef.LoweredX87 = op_val["X87"] # X87 implies !JITDispatch assert("JITDispatch" not in op_val) OpDef.JITDispatch = False if "TiedSource" in op_val: OpDef.TiedSource = op_val["TiedSource"] # Pad Inline out to the argument count OpDef.Inline = [''] * len(OpDef.Arguments) if "Inline" in op_val: Value = op_val["Inline"] OpDef.Inline[0:len(Value)] = Value # Do some fixups of the data here if len(OpDef.EmitValidation) != 0: for i in range(len(OpDef.EmitValidation)): # Patch up all the argument names for Arg in OpDef.Arguments: # Temporary ops just replace all instances no prefix variant OpDef.EmitValidation[i] = OpDef.EmitValidation[i].replace(Arg.NameWithPrefix, Arg.Name) #OpDef.print() # Error on duplicate op if OpDef.Name in IROpNameSet: ExitError("Duplicate Op defined! {}".format(OpDef.Name)) IROps.append(OpDef) IROpNameSet.add(OpDef.Name) # Print out enum values def print_enums(enums): output_file.write("#ifdef IROP_ENUM\n") output_file.write("enum IROps : uint16_t {\n") for op in IROps: output_file.write("\tOP_{},\n" .format(op.Name.upper())) output_file.write("};\n") for name, members in enums.items(): output_file.write(f"enum {name} {{\n") for member in members: if member: output_file.write(f"\t{member}\n") else: output_file.write("\n") output_file.write("};\n\n") output_file.write("#undef IROP_ENUM\n") output_file.write("#endif\n\n") def print_ir_structs(defines): output_file.write("#ifdef IROP_STRUCTS\n") # Print out defines here for op_val in defines: if op_val: output_file.write("\t%s;\n" % op_val) else: output_file.write("\n") # Emit the default struct first output_file.write("// Default structs\n") output_file.write("struct __attribute__((packed)) IROp_Header {\n") output_file.write("\tvoid* Data[0];\n") output_file.write("\tIROps Op;\n\n") output_file.write("\tIR::OpSize Size;\n") output_file.write("\tIR::OpSize ElementSize;\n") 
output_file.write("\ttemplate\n") output_file.write("\tT const* C() const { return reinterpret_cast(Data); }\n") output_file.write("\ttemplate\n") output_file.write("\tT* CW() { return reinterpret_cast(Data); }\n") output_file.write("\tOrderedNodeWrapper Args[0];\n") output_file.write("};\n\n"); output_file.write("static_assert(sizeof(IROp_Header) == sizeof(uint32_t), \"IROp_Header should be 32-bits in size\");\n\n"); # Now the user defined types output_file.write("// User defined IR Op structs\n") for op in IROps: output_file.write("struct __attribute__((packed)) IROp_{} {{\n".format(op.Name)) output_file.write("\tIROp_Header Header;\n") # SSA arguments have a hard requirement to appear after the header if op.SSAArgNum > 0: output_file.write("\t// SSA arguments\n") # Walk the SSA arguments and place them in order of declaration for arg in op.Arguments: if arg.IsSSA: output_file.write("\tOrderedNodeWrapper {};\n".format(arg.Name)); # Non-SSA arguments are also placed in order of declaration, after SSA though if op.NonSSAArgNum > 0: output_file.write("\t// Non-SSA arguments\n") for arg in op.Arguments: if not arg.Temporary and not arg.IsSSA: CType = IRTypesToCXX[arg.Type].CXXName output_file.write("\t{} {};\n".format(CType, arg.Name)); output_file.write("\tstatic constexpr IROps OPCODE = OP_{};\n".format(op.Name.upper())) if op.SSAArgNum > 0: output_file.write("\t// Get index of argument by name\n") SSAArg = 0 for arg in op.Arguments: if arg.IsSSA: output_file.write("\tstatic constexpr size_t {}_Index = {};\n".format(arg.Name, SSAArg)) SSAArg = SSAArg + 1 output_file.write("};\n") # Add a static assert that the IR ops must be pod output_file.write("static_assert(std::is_trivially_copyable_v);\n".format(op.Name)) output_file.write("static_assert(std::is_standard_layout_v);\n\n".format(op.Name)) output_file.write("#undef IROP_STRUCTS\n") output_file.write("#endif\n\n") # Print out const expression to calculate IR Op sizes def print_ir_sizes(): 
output_file.write("#ifdef IROP_SIZES\n") output_file.write("constexpr std::array IRSizes = {\n") for op in IROps: if op.Name == "Last": output_file.write("\t-1ULL,\n") else: output_file.write(f"\tsizeof(IROp_{op.Name}),\n") output_file.write(textwrap.dedent(""" }; // Make sure our array maps directly to the IROps enum static_assert(IRSizes[IROps::OP_LAST] == -1ULL); [[nodiscard]] inline size_t GetSize(IROps Op) { return IRSizes[Op]; } [[nodiscard, gnu::const]] std::string_view const& GetName(IROps Op); [[nodiscard, gnu::const]] uint8_t GetArgs(IROps Op); [[nodiscard, gnu::const]] uint8_t GetRAArgs(IROps Op); [[nodiscard, gnu::const]] FEXCore::IR::RegClass GetRegClass(IROps Op); [[nodiscard, gnu::const]] bool HasSideEffects(IROps Op); [[nodiscard, gnu::const]] bool ImplicitFlagClobber(IROps Op); [[nodiscard, gnu::const]] bool GetHasDest(IROps Op); [[nodiscard, gnu::const]] bool LoweredX87(IROps Op); [[nodiscard, gnu::const]] int8_t TiedSource(IROps Op); #undef IROP_SIZES #endif """)) def print_ir_reg_classes(): output_file.write("#ifdef IROP_REG_CLASSES_IMPL\n") output_file.write("constexpr std::array IRRegClasses = {\n") for op in IROps: if op.Name == "Last": output_file.write("\tRegClass::Invalid,\n") else: if op.HasDest and op.DestType is None: ExitError("IR op {} has destination with no destination class".format(op.Name)) if op.HasDest and op.DestType == "SSA": # Special case SSA type output_file.write("\tRegClass::Complex,\n") elif op.HasDest: output_file.write("\tRegClass::{},\n".format(op.DestType)) else: # No destination so it has an invalid destination class output_file.write("\tRegClass::Invalid, // No destination\n") output_file.write("};\n\n") output_file.write("// Make sure our array maps directly to the IROps enum\n") output_file.write("static_assert(IRRegClasses[IROps::OP_LAST] == RegClass::Invalid);\n\n") output_file.write("FEXCore::IR::RegClass GetRegClass(IROps Op) { return IRRegClasses[Op]; }\n\n") output_file.write("#undef 
def print_ir_hassideeffects():
    """Write per-op property tables and their accessor functions to output_file.

    For each (property, C++ return type) pair, emits a constexpr array with
    one entry per op in IROps plus a function returning the entry for an op.
    Bool properties are stored as uint8_t and printed as true/false; other
    properties are printed with their attribute value as-is. Everything is
    wrapped in the IROP_HASSIDEEFFECTS_IMPL guard.
    """
    write = output_file.write
    property_tables = (
        ("HasSideEffects", "bool"),
        ("ImplicitFlagClobber", "bool"),
        ("LoweredX87", "bool"),
        ("TiedSource", "int8_t"),
    )

    write("#ifdef IROP_HASSIDEEFFECTS_IMPL\n")

    for prop, T in property_tables:
        is_bool = T == "bool"
        storage_type = "uint8_t" if is_bool else T

        # Table definition
        write(f"constexpr std::array<{storage_type}, OP_LAST + 1> {prop}_ = {{\n")
        for op in IROps:
            value = getattr(op, prop)
            if is_bool:
                value = "true" if value else "false"
            write(f"\t{value},\n")
        write("};\n\n")

        # Accessor
        write(f"{T} {prop}(IROps Op) {{\n")
        write(f" return {prop}_[Op];\n")
        write("}\n")

    write("#undef IROP_HASSIDEEFFECTS_IMPL\n")
    write("#endif\n\n")
op.EmitValidation: Sanitized = Validation.replace("\"", "\\\"") output_file.write("\t\tLOGMAN_THROW_A_FMT({}, \"{}\");\n".format(Validation, Sanitized)) output_file.write("#endif\n") # Print out IR allocator helpers def print_ir_allocator_helpers(): output_file.write("#ifdef IROP_ALLOCATE_HELPERS\n") output_file.write("\ttemplate \n") output_file.write("\tstruct Wrapper final {\n") output_file.write("\t\tT *first;\n") output_file.write("\t\tOrderedNode *Node; ///< Actual offset of this IR in ths list\n") output_file.write("\n") output_file.write("\t\toperator Wrapper() const { return Wrapper {reinterpret_cast(first), Node}; }\n") output_file.write("\t\toperator OrderedNode *() { return Node; }\n") output_file.write("\t\toperator const OrderedNode *() const { return Node; }\n") output_file.write("\t\toperator OpNodeWrapper () const { return Node->Header.Value; }\n") output_file.write("\t};\n") output_file.write("\ttemplate \n") output_file.write("\tusing IRPair = Wrapper;\n\n") output_file.write("\tIRPair AllocateRawOp(size_t HeaderSize) {\n") output_file.write("\t\tauto Op = reinterpret_cast(DualListData.DataAllocate(HeaderSize));\n") output_file.write("\t\tmemset(Op, 0, HeaderSize);\n") output_file.write("\t\tOp->Op = IROps::OP_DUMMY;\n") output_file.write("\t\treturn IRPair{Op, CreateNode(Op)};\n") output_file.write("\t}\n\n") output_file.write("\ttemplate\n") output_file.write("\tT *AllocateOrphanOp() {\n") output_file.write("\t\tsize_t Size = FEXCore::IR::GetSize(T2);\n") output_file.write("\t\tauto Op = reinterpret_cast(DualListData.DataAllocate(Size));\n") output_file.write("\t\tmemset(Op, 0, Size);\n") output_file.write("\t\tOp->Header.Op = T2;\n") output_file.write("\t\treturn Op;\n") output_file.write("\t}\n\n") output_file.write("\ttemplate\n") output_file.write("\tIRPair AllocateOp() {\n") output_file.write("\t\tsize_t Size = FEXCore::IR::GetSize(T2);\n") output_file.write("\t\tauto Op = reinterpret_cast(DualListData.DataAllocate(Size));\n") 
output_file.write("\t\tmemset(Op, 0, Size);\n") output_file.write("\t\tOp->Header.Op = T2;\n") output_file.write("\t\treturn IRPair{Op, CreateNode(&Op->Header)};\n") output_file.write("\t}\n\n") output_file.write("\tIR::OpSize GetOpSize(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") output_file.write("\t\treturn HeaderOp->Size;\n") output_file.write("\t}\n\n") output_file.write("\tIR::OpSize GetOpElementSize(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") output_file.write("\t\treturn HeaderOp->ElementSize;\n") output_file.write("\t}\n\n") output_file.write("\tuint8_t GetOpElements(const OrderedNode *Op) const {\n") output_file.write("\t\tLOGMAN_THROW_A_FMT(OpHasDest(Op), \"Op {} has no dest\\n\", GetOpName(Op));\n") output_file.write("\t\treturn IR::OpSizeToSize(GetOpSize(Op)) / IR::OpSizeToSize(GetOpElementSize(Op));\n") output_file.write("\t}\n\n") output_file.write("\tbool OpHasDest(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") output_file.write("\t\treturn GetHasDest(HeaderOp->Op);\n") output_file.write("\t}\n\n") output_file.write("\tIROps GetOpType(const OrderedNode *Op) const {\n") output_file.write("\t\tauto HeaderOp = Op->Header.Value.GetNode(DualListData.DataBegin());\n") output_file.write("\t\treturn HeaderOp->Op;\n") output_file.write("\t}\n\n") output_file.write("\tFEXCore::IR::RegClass GetOpRegClass(const OrderedNode *Op) const {\n") output_file.write("\t\treturn GetRegClass(GetOpType(Op));\n") output_file.write("\t}\n\n") output_file.write("\tstd::string_view const& GetOpName(const OrderedNode *Op) const {\n") output_file.write("\t\treturn IR::GetName(GetOpType(Op));\n") output_file.write("\t}\n\n") # Generate helpers with operands for op in IROps: if op.Name != "Last": 
output_file.write("\t///\n".join(["\t/// {}\n" .format(comment) for comment in op.Desc])) output_file.write("\tIRPair _{}(" .format(op.Name, op.Name)) # Output SSA args first for i, arg in enumerate(op.Arguments): LastArg = i == len(op.Arguments) - 1 if arg.Temporary: CType = IRTypesToCXX[arg.Type].CXXName output_file.write("{} {}".format(CType, arg.Name)) elif arg.IsSSA: # SSA value output_file.write("OrderedNodeWrapper {}".format(arg.Name)) else: # User defined op that is stored CType = IRTypesToCXX[arg.Type].CXXName output_file.write("{} {}".format(CType, arg.Name)) if arg.DefaultInitializer: output_file.write(" = {}".format(arg.DefaultInitializer)) if not LastArg: output_file.write(", ") output_file.write(") {\n") # Save NZCV if needed before clobbering NZCV if op.ImplicitFlagClobber: output_file.write("\t\tSaveNZCV(IROps::OP_{});".format(op.Name.upper())) # We gather the "has x87?" flag as we go. This saves the user from # having to keep track of whether they emitted any x87. # Also changes the mmx state to X87. 
if op.LoweredX87: output_file.write("\t\tRecordX87Use();\n") output_file.write( "\t\tif(MMXState == MMXState_MMX) ChgStateMMX_X87();\n" ) output_file.write("\t\tauto _Op = AllocateOp();\n".format(op.Name, op.Name.upper())) if op.SSAArgNum != 0: for arg in op.Arguments: if arg.IsSSA: output_file.write("\t\t_Op.first->{} = {};\n".format(arg.Name, arg.Name)) if len(op.Arguments) != 0: for arg in op.Arguments: if not arg.Temporary and not arg.IsSSA: output_file.write("\t\t_Op.first->{} = {};\n".format(arg.Name, arg.Name)) assert not (op.HasDest and op.DestSize is None) # Some ops without a destination still need an operating size # Effectively reusing the destination size value for operation size if op.DestSize != None: output_file.write("\t\t_Op.first->Header.Size = {};\n".format(op.DestSize)) if op.ElementSize == None: output_file.write("\t\t_Op.first->Header.ElementSize = _Op.first->Header.Size;\n") else: output_file.write("\t\t_Op.first->Header.ElementSize = {};\n".format(op.ElementSize)) # Only validate here if there's no OrderedNode * version. Else # validation is in that version, see the comment below. 
if op.SSAArgNum == 0: print_validation(op) output_file.write("\t\treturn _Op;\n") output_file.write("\t}\n\n") # Now do the OrderedNode * version if necessary if op.SSAArgNum: output_file.write("\t///\n".join(["\t/// {}\n" .format(comment) for comment in op.Desc])) output_file.write("\tIRPair _{}(" .format(op.Name, op.Name)) for i, arg in enumerate(op.Arguments): LastArg = i == len(op.Arguments) - 1 if arg.Temporary: CType = IRTypesToCXX[arg.Type].CXXName output_file.write("{} {}".format(CType, arg.Name)) elif arg.IsSSA: output_file.write("OrderedNode *{}".format(arg.Name)) else: CType = IRTypesToCXX[arg.Type].CXXName output_file.write("{} {}".format(CType, arg.Name)) if arg.DefaultInitializer: output_file.write(" = {}".format(arg.DefaultInitializer)) if not LastArg: output_file.write(", ") output_file.write(") {\n") output_file.write("\t\tauto ListDataBegin = DualListData.ListBegin();\n") idx = 0 for arg in op.Arguments: if arg.IsSSA: # Inline an immediate if we can inline = op.Inline[idx] idx += 1 if inline != '': Sized = "Size" in [x.Name for x in op.Arguments] P = ["Size" if Sized else "OpSize::i64Bit", arg.Name] # A few cases need extra info plumbed. if inline == "SubtractZero": P += ["Src2"] elif inline == "Mem": P += ["OffsetType", "OffsetScale"] elif inline == "Memtso": P += ["OffsetType", "OffsetScale", "true /* TSO */"] inline = "Mem" output_file.write(f"\t\t{arg.Name} = Inline{inline}({', '.join(P)});\n") output_file.write(f"\t\t{arg.Name}->AddUse();\n") # Insert validation here. This is skipped for the # OrderedNodeWrapper version because validation can depend on # the OrderedNode, but that's ok in practice. Everything pre-RA # uses the OrderedNode version, and anything RA-onwards is # dubious to validate. 
print_validation(op) output_file.write(f"\t\treturn _{op.Name}(") for i, arg in enumerate(op.Arguments): LastArg = i == len(op.Arguments) - 1 output_file.write(arg.Name) if arg.IsSSA: output_file.write("->Wrapped(ListDataBegin)") if not LastArg: output_file.write(", ") output_file.write(");\n") output_file.write("\t}\n\n") output_file.write("#undef IROP_ALLOCATE_HELPERS\n") output_file.write("#endif\n") def print_ir_dispatcher_defs(): output_dispatch_file.write("#ifdef IROP_DISPATCH_DEFS\n") for op in IROps: if op.Name != "Last" and op.SwitchGen and op.JITDispatch and op.JITDispatchOverride == None: output_dispatch_file.write("DEF_OP({});\n".format(op.Name)) output_dispatch_file.write("#undef IROP_DISPATCH_DEFS\n") output_dispatch_file.write("#endif\n") def print_ir_dispatcher_dispatch(): output_dispatch_file.write("#ifdef IROP_DISPATCH_DISPATCH\n") for op in IROps: if op.Name != "Last" and op.JITDispatch: DispatchName = op.Name if op.JITDispatchOverride != None: DispatchName = op.JITDispatchOverride if (op.DynamicDispatch): output_dispatch_file.write("REGISTER_OP_RT({}, {});\n".format(op.Name.upper(), DispatchName)) else: output_dispatch_file.write("REGISTER_OP({}, {});\n".format(op.Name.upper(), DispatchName)) output_dispatch_file.write("#undef IROP_DISPATCH_DISPATCH\n") output_dispatch_file.write("#endif\n") if len(sys.argv) < 4: ExitError("Insufficient parameters passed to script") output_filename = sys.argv[2] output_dispatcher_filename = sys.argv[3] json_file = open(sys.argv[1], "r") json_text = json_file.read() json_file.close() json_object = json.loads(json_text) json_object = {k.upper(): v for k, v in json_object.items()} enums = json_object["ENUMS"] ops = json_object["OPS"] irtypes = json_object["IRTYPES"] defines = json_object["DEFINES"] parse_irtypes(irtypes) parse_ops(ops) output_file = open(output_filename, "w") print_enums(enums) print_ir_structs(defines) print_ir_sizes() print_ir_reg_classes() print_ir_getname() print_ir_getraargs() 
print_ir_hassideeffects() print_ir_gethasdest() print_ir_arg_printer() print_ir_allocator_helpers() output_file.close() output_dispatch_file = open(output_dispatcher_filename, "w") print_ir_dispatcher_defs() print_ir_dispatcher_dispatch() output_dispatch_file.close() ================================================ FILE: FEXCore/Source/CMakeLists.txt ================================================ set(MAN_DIR share/man CACHE PATH "MAN_DIR") set(FEXCORE_BASE_SRCS Interface/Config/Config.cpp Utils/Allocator.cpp Utils/FileLoading.cpp Utils/ForcedAssert.cpp Utils/LogManager.cpp Utils/SpinWaitLock.cpp) if (NOT MINGW) list(APPEND FEXCORE_BASE_SRCS Utils/Allocator/64BitAllocator.cpp) endif() set(SRCS Common/JitSymbols.cpp Interface/Context/Context.cpp Interface/Core/LookupCache.cpp Interface/Core/CodeCache.cpp Interface/Core/Core.cpp Interface/Core/CPUBackend.cpp Interface/Core/Addressing.cpp Interface/Core/CPUID.cpp Interface/Core/Frontend.cpp Interface/Core/OpcodeDispatcher/AVX_128.cpp Interface/Core/OpcodeDispatcher/Crypto.cpp Interface/Core/OpcodeDispatcher/Flags.cpp Interface/Core/OpcodeDispatcher/Vector.cpp Interface/Core/OpcodeDispatcher/X87.cpp Interface/Core/OpcodeDispatcher/X87F64.cpp Interface/Core/OpcodeDispatcher.cpp Interface/Core/ArchHelpers/Arm64Emitter.cpp Interface/Core/Dispatcher/Dispatcher.cpp Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp Interface/Core/JIT/JIT.cpp Interface/Core/JIT/ALUOps.cpp Interface/Core/JIT/AtomicOps.cpp Interface/Core/JIT/BranchOps.cpp Interface/Core/JIT/ConversionOps.cpp Interface/Core/JIT/EncryptionOps.cpp Interface/Core/JIT/MemoryOps.cpp Interface/Core/JIT/MiscOps.cpp Interface/Core/JIT/MoveOps.cpp Interface/Core/JIT/VectorOps.cpp Interface/Core/JIT/Arm64Relocations.cpp Interface/Core/X86Tables/BaseTables.cpp Interface/Core/X86Tables/DDDTables.cpp Interface/Core/X86Tables/H0F38Tables.cpp Interface/Core/X86Tables/H0F3ATables.cpp 
Interface/Core/X86Tables/PrimaryGroupTables.cpp Interface/Core/X86Tables/SecondaryGroupTables.cpp Interface/Core/X86Tables/SecondaryModRMTables.cpp Interface/Core/X86Tables/SecondaryTables.cpp Interface/Core/X86Tables/VEXTables.cpp Interface/Core/X86Tables/X87Tables.cpp Interface/GDBJIT/GDBJIT.cpp Interface/IR/IRDumper.cpp Interface/IR/IREmitter.cpp Interface/IR/PassManager.cpp Interface/IR/Passes/IRDumperPass.cpp Interface/IR/Passes/IRValidation.cpp Interface/IR/Passes/RedundantFlagCalculationElimination.cpp Interface/IR/Passes/RegisterAllocationPass.cpp Interface/IR/Passes/x87StackOptimizationPass.cpp Utils/LongJump.cpp Utils/Telemetry.cpp Utils/Threads.cpp Utils/Profiler.cpp) if (ARCHITECTURE_arm64) list(APPEND SRCS Utils/ArchHelpers/Arm64.cpp) else() list(APPEND SRCS Utils/ArchHelpers/Arm64_stubs.cpp) endif() if (ENABLE_GLIBC_ALLOCATOR_HOOK_FAULT) list(APPEND FEXCORE_BASE_SRCS Utils/AllocatorOverride.cpp) endif() set(DEFINES -DJIT_ARM64) if (ARCHITECTURE_x86_64) list(APPEND DEFINES -DARCHITECTURE_x86_64=1) endif() if (ARCHITECTURE_arm64) list(APPEND DEFINES -DARCHITECTURE_arm64=1) endif() if (ENABLE_VIXL_DISASSEMBLER) list(APPEND DEFINES -DVIXL_DISASSEMBLER=1) endif() if (ENABLE_ZYDIS) list(APPEND DEFINES -DZYDIS_DISASSEMBLER=1) endif() if (ARCHITECTURE_arm64 AND HAS_CLANG_PRESERVE_ALL) list(APPEND DEFINES "-DFEXCORE_PRESERVE_ALL_ATTR=__attribute__((preserve_all));-DFEXCORE_HAS_PRESERVE_ALL_ATTR=1") else() list(APPEND DEFINES "-DFEXCORE_PRESERVE_ALL_ATTR=;-DFEXCORE_HAS_PRESERVE_ALL_ATTR=0") endif() set(LIBS fmt::fmt xxHash::xxhash FEXHeaderUtils CodeEmitter cephes_128bit) if (ENABLE_VIXL_DISASSEMBLER OR ENABLE_VIXL_SIMULATOR) list(APPEND LIBS vixl::vixl) endif() if (ENABLE_ZYDIS) list(APPEND LIBS Zydis::Zydis) endif() if (NOT MINGW) list(APPEND LIBS dl) else() list(APPEND LIBS synchronization) if (ARCHITECTURE_arm64ec) list(APPEND LIBS mincore) endif() endif() if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # GCC requires libatomic to use 128-bit atomics list(APPEND 
LIBS atomic)
endif()

# Generate config
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Interface/Config/Config.json.in
  ${CMAKE_BINARY_DIR}/generated/Config/Config.json)

# Generate IR include file
# json_ir_generator.py takes the IR description and writes both the main
# IR defines include and the JIT dispatcher include in one invocation.
set(OUTPUT_IR_FOLDER "${CMAKE_BINARY_DIR}/include/FEXCore/IR")
set(OUTPUT_NAME "${OUTPUT_IR_FOLDER}/IRDefines.inc")
set(OUTPUT_DISPATCHER_NAME "${OUTPUT_IR_FOLDER}/IRDefines_Dispatch.inc")
set(INPUT_NAME "${CMAKE_CURRENT_SOURCE_DIR}/Interface/IR/IR.json")

file(MAKE_DIRECTORY "${OUTPUT_IR_FOLDER}")

add_custom_command(
  OUTPUT "${OUTPUT_NAME}" "${OUTPUT_DISPATCHER_NAME}"
  DEPENDS "${INPUT_NAME}"
  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/json_ir_generator.py"
  COMMAND "python3" "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/json_ir_generator.py" "${INPUT_NAME}" "${OUTPUT_NAME}" "${OUTPUT_DISPATCHER_NAME}")

set_source_files_properties(${OUTPUT_NAME} PROPERTIES
  GENERATED TRUE)

# Generate IR documentation
set(OUTPUT_IR_DOC "${CMAKE_BINARY_DIR}/IR.md")

add_custom_command(
  OUTPUT "${OUTPUT_IR_DOC}"
  DEPENDS "${INPUT_NAME}"
  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/json_ir_doc_generator.py"
  COMMAND "python3" "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/json_ir_doc_generator.py" "${INPUT_NAME}" "${OUTPUT_IR_DOC}")

# The generated documentation file is ${OUTPUT_IR_DOC}. This previously named
# ${OUTPUT_IR_NAME}, which is never set in this file, so the property was
# applied to an empty file list.
set_source_files_properties(${OUTPUT_IR_DOC} PROPERTIES
  GENERATED TRUE)

# Create the target
add_custom_target(IR_INC
  DEPENDS "${OUTPUT_NAME}"
  DEPENDS "${OUTPUT_IR_DOC}")

# Generate the configuration include file
set(OUTPUT_CONFIG_FOLDER "${CMAKE_BINARY_DIR}/include/FEXCore/Config")
set(OUTPUT_CONFIG_NAME "${OUTPUT_CONFIG_FOLDER}/ConfigValues.inl")
set(OUTPUT_CONFIG_OPTION_NAME "${OUTPUT_CONFIG_FOLDER}/ConfigOptions.inl")
set(INPUT_CONFIG_NAME "${CMAKE_BINARY_DIR}/generated/Config/Config.json")
set(OUTPUT_MAN_NAME "${CMAKE_BINARY_DIR}/generated/FEX.1")
set(OUTPUT_MAN_NAME_COMPRESS "${CMAKE_BINARY_DIR}/generated/FEX.1.gz")

file(MAKE_DIRECTORY "${OUTPUT_CONFIG_FOLDER}")

add_custom_command(
  OUTPUT "${OUTPUT_CONFIG_NAME}"
  OUTPUT "${OUTPUT_CONFIG_OPTION_NAME}"
  OUTPUT
"${OUTPUT_MAN_NAME}" DEPENDS "${INPUT_CONFIG_NAME}" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/config_generator.py" COMMAND "python3" "${CMAKE_CURRENT_SOURCE_DIR}/../Scripts/config_generator.py" "${INPUT_CONFIG_NAME}" "${OUTPUT_CONFIG_NAME}" "${OUTPUT_MAN_NAME}" "${OUTPUT_CONFIG_OPTION_NAME}") add_custom_command( OUTPUT "${OUTPUT_MAN_NAME_COMPRESS}" DEPENDS "${OUTPUT_MAN_NAME}" COMMAND "gzip" "-kf9n" "${OUTPUT_MAN_NAME}") set_source_files_properties(${OUTPUT_CONFIG_NAME} PROPERTIES GENERATED TRUE) set_source_files_properties(${OUTPUT_CONFIG_OPTION_NAME} PROPERTIES GENERATED TRUE) set_source_files_properties(${OUTPUT_MAN_NAME} PROPERTIES GENERATED TRUE) set_source_files_properties(${OUTPUT_MAN_NAME_COMPRESS} PROPERTIES GENERATED TRUE) # Create the target add_custom_target(CONFIG_INC DEPENDS "${OUTPUT_CONFIG_NAME}" DEPENDS "${OUTPUT_CONFIG_OPTION_NAME}" DEPENDS "${OUTPUT_MAN_NAME}" DEPENDS "${OUTPUT_MAN_NAME_COMPRESS}") if (NOT BUILD_STEAM_SUPPORT) # Install the compressed man page install(FILES ${OUTPUT_MAN_NAME_COMPRESS} COMPONENT Runtime DESTINATION ${MAN_DIR}/man1) endif() # Add in diagnostic colours if the option is available. 
# Ninja code generator will kill colours if this isn't here
check_cxx_compiler_flag(-fdiagnostics-color=always GCC_COLOR)
check_cxx_compiler_flag(-fcolor-diagnostics CLANG_COLOR)

# Applies the common visibility presets, include directories, compile
# definitions, warning flags and generated-header dependencies to target `Name`.
function(AddDefaultOptionsToTarget Name)
  set_target_properties(${Name} PROPERTIES C_VISIBILITY_PRESET hidden)
  set_target_properties(${Name} PROPERTIES CXX_VISIBILITY_PRESET hidden)
  set_target_properties(${Name} PROPERTIES VISIBILITY_INLINES_HIDDEN TRUE)

  target_include_directories(${Name} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
  target_include_directories(${Name} PRIVATE IncludePrivate/)
  target_include_directories(${Name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)

  target_include_directories(${Name} PUBLIC "${PROJECT_SOURCE_DIR}/include/")
  target_include_directories(${Name} PUBLIC "${CMAKE_BINARY_DIR}/include/")

  target_compile_definitions(${Name} PRIVATE ${DEFINES})

  # The generated config and IR includes must exist before anything compiles.
  add_dependencies(${Name} CONFIG_INC IR_INC)

  target_compile_options(${Name}
    PRIVATE
    -Wall
    -Werror=cast-qual
    -Werror=ignored-qualifiers
    -Werror=implicit-fallthrough

    -Wno-trigraphs
    -ffunction-sections
    -fwrapv)

  if (GCC_COLOR)
    target_compile_options(${Name}
      PRIVATE
      "-fdiagnostics-color=always")
  endif()
  if (CLANG_COLOR)
    target_compile_options(${Name}
      PRIVATE
      "-fcolor-diagnostics")
  endif()

  LinkerGC(${Name})

  target_link_libraries(${Name} PUBLIC unordered_dense::unordered_dense)
endfunction()

# Build FEXCore_Base static library
add_library(FEXCore_Base STATIC ${FEXCORE_BASE_SRCS})
target_link_libraries(FEXCore_Base PUBLIC ${LIBS})
AddDefaultOptionsToTarget(FEXCore_Base)

if (ENABLE_FEXCORE_PROFILER AND FEXCORE_PROFILER_BACKEND STREQUAL "TRACY")
  target_link_libraries(FEXCore_Base PUBLIC TracyClient)
endif()

# Compiles ${SRCS} once into an OBJECT library named `Name`.
function(AddObject Name)
  add_library(${Name} OBJECT ${SRCS})

  target_link_libraries(${Name} PRIVATE FEXCore_Base)
  target_compile_options(${Name} PRIVATE ${FEX_TUNE_COMPILE_FLAGS})
  AddDefaultOptionsToTarget(${Name})
endfunction()

# Creates library `Name` of kind `Type` (STATIC or SHARED) from the objects of
# the OBJECT library created by AddObject(${PROJECT_NAME}_object) below.
function(AddLibrary Name Type)
  # A bare `$` here would be treated as a source file name; the library has to
  # consume the object files of ${PROJECT_NAME}_object.
  add_library(${Name} ${Type} $<TARGET_OBJECTS:${PROJECT_NAME}_object>)
  set_target_properties(${Name} PROPERTIES OUTPUT_NAME FEXCore)

  # During generation of the import library (dll.a), MinGW needs some extra symbols from libraries
  # such as fmt, which are propagated by FEXCore_Base. Wonderful.
  if (MINGW)
    target_link_libraries(${Name} PRIVATE FEXCore_Base)
  endif()

  AddDefaultOptionsToTarget(${Name})
endfunction()

AddObject(${PROJECT_NAME}_object)
AddLibrary(${PROJECT_NAME} STATIC)
AddLibrary(${PROJECT_NAME}_shared SHARED)

if (NOT MINGW AND NOT BUILD_STEAM_SUPPORT)
  install(TARGETS ${PROJECT_NAME}_shared
    LIBRARY
      DESTINATION ${CMAKE_INSTALL_LIBDIR}
      COMPONENT Libraries)
endif()

# Meta-library to link jemalloc libraries enabled in the build configuration.
# Only needed for targets that run emulation. For others, use JemallocDummy.
add_library(JemallocLibs STATIC Utils/AllocatorHooks.cpp)
if (ENABLE_FEX_ALLOCATOR)
  target_compile_definitions(JemallocLibs PRIVATE ENABLE_FEX_ALLOCATOR=1)
  target_link_libraries(JemallocLibs PUBLIC rpmalloc)
endif()
if (ENABLE_JEMALLOC_GLIBC_ALLOC)
  set_source_files_properties(Interface/HLE/Thunks/Thunks.cpp PROPERTIES COMPILE_DEFINITIONS ENABLE_JEMALLOC_GLIBC=1)
  target_link_libraries(JemallocLibs INTERFACE FEX_jemalloc_glibc)
endif()

if (NOT MINGW)
  # Dummy project to use for host tools.
  # This overrides use of jemalloc in FEXCore with the normal glibc allocator.
add_library(JemallocDummy STATIC Utils/AllocatorHooks.cpp) target_include_directories(JemallocDummy PRIVATE "${PROJECT_SOURCE_DIR}/include/") endif() # The shared library should always link enabled jemalloc libraries target_link_libraries(${PROJECT_NAME}_shared PRIVATE JemallocLibs) ================================================ FILE: FEXCore/Source/Common/BitSet.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include #include namespace FEXCore { template struct BitSet final { using ElementType = T; constexpr static size_t MinimumSize = sizeof(ElementType); constexpr static size_t MinimumSizeBits = sizeof(ElementType) * 8; ElementType* Memory; void Allocate(size_t Elements) { size_t AllocateSize = ToBytes(Elements); LOGMAN_THROW_A_FMT((AllocateSize * MinimumSize) >= Elements, "Fail"); Memory = static_cast(FEXCore::Allocator::malloc(AllocateSize)); } void Realloc(size_t Elements) { size_t AllocateSize = ToBytes(Elements); LOGMAN_THROW_A_FMT((AllocateSize * MinimumSize) >= Elements, "Fail"); Memory = static_cast(FEXCore::Allocator::realloc(Memory, AllocateSize)); } void Free() { FEXCore::Allocator::free(Memory); Memory = nullptr; } bool Get(T Element) { return (Memory[Element / MinimumSizeBits] & (1ULL << (Element % MinimumSizeBits))) != 0; } void Set(T Element) { Memory[Element / MinimumSizeBits] |= (1ULL << (Element % MinimumSizeBits)); } void Clear(T Element) { Memory[Element / MinimumSizeBits] &= (1ULL << (Element % MinimumSizeBits)); } void MemClear(size_t Elements) { memset(Memory, 0, ToBytes(Elements)); } void MemSet(size_t Elements) { memset(Memory, 0xFF, ToBytes(Elements)); } uint32_t ToBytes(size_t Elements) { return AlignUp(Elements, MinimumSizeBits) / MinimumSize; } // This very explicitly doesn't let you take an address // Is only a getter bool operator[](T Element) { return Get(Element); } }; template struct BitSetView final { using ElementType = T; constexpr 
static size_t MinimumSize = sizeof(ElementType); constexpr static size_t MinimumSizeBits = sizeof(ElementType) * 8; ElementType* Memory; void GetView(BitSet& Set, uint64_t ElementOffset) { LOGMAN_THROW_A_FMT((ElementOffset % MinimumSize) == 0, "Bitset view offset needs to be aligned to size of backing element"); Memory = &Set.Memory[ElementOffset / MinimumSizeBits]; } bool Get(T Element) { return (Memory[Element / MinimumSizeBits] & (1ULL << (Element % MinimumSizeBits))) != 0; } void Set(T Element) { Memory[Element / MinimumSizeBits] |= (1ULL << (Element % MinimumSizeBits)); } void Clear(T Element) { Memory[Element / MinimumSizeBits] &= (1ULL << (Element % MinimumSizeBits)); } void MemClear(size_t Elements) { memset(Memory, 0, AlignUp(Elements / MinimumSizeBits, MinimumSizeBits)); } void MemSet(size_t Elements) { memset(Memory, 0xFF, AlignUp(Elements / MinimumSizeBits, MinimumSizeBits)); } // This very explicitly doesn't let you take an address // Is only a getter bool operator[](T Element) { return Get(Element); } }; static_assert(sizeof(BitSet) == sizeof(uintptr_t), "Needs to just be a pointer"); static_assert(std::is_trivially_copyable_v>, "Needs to trivially copyable"); static_assert(sizeof(BitSetView) == sizeof(uintptr_t), "Needs to just be a pointer"); static_assert(std::is_trivially_copyable_v>, "Needs to trivially copyable"); } // namespace FEXCore ================================================ FILE: FEXCore/Source/Common/JitSymbols.cpp ================================================ // SPDX-License-Identifier: MIT #include #include "Common/JitSymbols.h" #include #include namespace FEXCore { JITSymbols::JITSymbols() {} JITSymbols::~JITSymbols() { if (fd != -1) { close(fd); } } void JITSymbols::InitFile() { // We can't use FILE here since we must be robust against forking processes closing our FD from under us. 
#ifdef __ANDROID__ // Android simpleperf looks in /data/local/tmp instead of /tmp const auto PerfMap = fextl::fmt::format("/data/local/tmp/perf-{}.map", getpid()); #else const auto PerfMap = fextl::fmt::format("/tmp/perf-{}.map", getpid()); #endif fd = open(PerfMap.c_str(), O_CREAT | O_TRUNC | O_WRONLY | O_APPEND, 0644); } void JITSymbols::RegisterNamedRegion(const void* HostAddr, uint32_t CodeSize, std::string_view Name) { if (fd == -1) { return; } // Linux perf format is very straightforward // ` \n` const auto Buffer = fextl::fmt::format("{} {:x} {}\n", HostAddr, CodeSize, Name); auto Result = write(fd, Buffer.c_str(), Buffer.size()); if (Result == -1 && errno == EBADF) { fd = -1; } } void JITSymbols::RegisterJITSpace(const void* HostAddr, uint32_t CodeSize) { if (fd == -1) { return; } // Linux perf format is very straightforward // ` \n` const auto Buffer = fextl::fmt::format("{} {:x} FEXJIT\n", HostAddr, CodeSize); auto Result = write(fd, Buffer.c_str(), Buffer.size()); if (Result == -1 && errno == EBADF) { fd = -1; } } // Buffered JIT symbols. void JITSymbols::Register(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint64_t GuestAddr, uint32_t CodeSize) { if (fd == -1) { return; } // Calculate remaining sizes. const auto RemainingSize = Buffer->BUFFER_SIZE - Buffer->Offset; const auto CurrentBufferOffset = &Buffer->Buffer[Buffer->Offset]; // Linux perf format is very straightforward // ` \n` const auto FMTResult = fmt::format_to_n(CurrentBufferOffset, RemainingSize, "{} {:x} JIT_0x{:x}_{}\n", HostAddr, CodeSize, GuestAddr, HostAddr); if (FMTResult.out >= &Buffer->Buffer[Buffer->BUFFER_SIZE]) { // Couldn't fit, need to force a write. 
WriteBuffer(Buffer, true); // Rerun Register(Buffer, HostAddr, GuestAddr, CodeSize); return; } Buffer->Offset += FMTResult.size; WriteBuffer(Buffer); } void JITSymbols::Register(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint32_t CodeSize, std::string_view Name, uintptr_t Offset) { if (fd == -1) { return; } // Calculate remaining sizes. const auto RemainingSize = Buffer->BUFFER_SIZE - Buffer->Offset; const auto CurrentBufferOffset = &Buffer->Buffer[Buffer->Offset]; // Linux perf format is very straightforward // ` \n` const auto FMTResult = fmt::format_to_n(CurrentBufferOffset, RemainingSize, "{} {:x} {}+0x{:x} ({})\n", HostAddr, CodeSize, Name, Offset, HostAddr); if (FMTResult.out >= &Buffer->Buffer[Buffer->BUFFER_SIZE]) { // Couldn't fit, need to force a write. WriteBuffer(Buffer, true); // Rerun Register(Buffer, HostAddr, CodeSize, Name, Offset); return; } Buffer->Offset += FMTResult.size; WriteBuffer(Buffer); } void JITSymbols::RegisterNamedRegion(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint32_t CodeSize, std::string_view Name) { if (fd == -1) { return; } // Calculate remaining sizes. const auto RemainingSize = Buffer->BUFFER_SIZE - Buffer->Offset; const auto CurrentBufferOffset = &Buffer->Buffer[Buffer->Offset]; // Linux perf format is very straightforward // ` \n` const auto FMTResult = fmt::format_to_n(CurrentBufferOffset, RemainingSize, "{} {:x} {}\n", HostAddr, CodeSize, Name); if (FMTResult.out >= &Buffer->Buffer[Buffer->BUFFER_SIZE]) { // Couldn't fit, need to force a write. WriteBuffer(Buffer, true); // Rerun RegisterNamedRegion(Buffer, HostAddr, CodeSize, Name); return; } Buffer->Offset += FMTResult.size; WriteBuffer(Buffer); } void JITSymbols::WriteBuffer(FEXCore::JITSymbolBuffer* Buffer, bool ForceWrite) { auto Now = std::chrono::steady_clock::now(); if (!ForceWrite) { if (((Buffer->LastWrite - Now) < Buffer->MAXIMUM_THRESHOLD) && Buffer->Offset < Buffer->NEEDS_WRITE_DISTANCE) { // Still buffering, no need to write. 
return; } } Buffer->LastWrite = Now; auto Result = write(fd, Buffer->Buffer, Buffer->Offset); if (Result == -1 && errno == EBADF) { fd = -1; } Buffer->Offset = 0; } } // namespace FEXCore ================================================ FILE: FEXCore/Source/Common/JitSymbols.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include namespace FEXCore { // Buffered JIT symbol tracking. struct JITSymbolBuffer { // Maximum buffer size to ensure we are a page in size. constexpr static size_t BUFFER_SIZE = FEXCore::Utils::FEX_PAGE_SIZE - (8 * 2); // Maximum distance until the end of the buffer to do a write. constexpr static size_t NEEDS_WRITE_DISTANCE = BUFFER_SIZE - 64; // Maximum time threshhold to wait before a buffer write occurs. constexpr static std::chrono::milliseconds MAXIMUM_THRESHOLD {100}; JITSymbolBuffer() : LastWrite {std::chrono::steady_clock::now()} {} // stead_clock to ensure a monotonic increasing clock. // In highly stressed situations this can still cause >2% CPU time in vdso_clock_gettime. // If we need lower CPU time when JIT symbols are enabled then FEX can read the cycle counter directly. std::chrono::steady_clock::time_point LastWrite {}; size_t Offset {}; char Buffer[BUFFER_SIZE] {}; }; static_assert(sizeof(JITSymbolBuffer) == FEXCore::Utils::FEX_PAGE_SIZE, "Ensure this is one page in size"); class JITSymbols final { public: JITSymbols(); ~JITSymbols(); void InitFile(); void RegisterNamedRegion(const void* HostAddr, uint32_t CodeSize, std::string_view Name); void RegisterJITSpace(const void* HostAddr, uint32_t CodeSize); // Allocate JIT buffer. 
static fextl::unique_ptr AllocateBuffer() { return fextl::make_unique(); } void Register(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint64_t GuestAddr, uint32_t CodeSize); void Register(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint32_t CodeSize, std::string_view Name, uintptr_t Offset); void RegisterNamedRegion(FEXCore::JITSymbolBuffer* Buffer, const void* HostAddr, uint32_t CodeSize, std::string_view Name); private: int fd {-1}; void WriteBuffer(FEXCore::JITSymbolBuffer* Buffer, bool ForceWrite = false); }; } // namespace FEXCore ================================================ FILE: FEXCore/Source/Common/SoftFloat.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include "cephes_128bit.h" #include #include #include #include #include "Common/VectorRegType.h" extern "C" { #include "SoftFloat-3e/platform.h" #include "SoftFloat-3e/softfloat.h" } struct FEX_PACKED X80SoftFloat { #ifdef ARCHITECTURE_x86_64 // Define this to push some operations to x87 // Only useful to see if precision loss is killing something // #define DEBUG_X86_FLOAT #ifdef DEBUG_X86_FLOAT #define BIGFLOAT long double #define BIGFLOATSIZE 10 #else #define BIGFLOAT float128_t #define BIGFLOATSIZE 16 #endif #elif defined(ARCHITECTURE_arm64) #define BIGFLOAT float128_t #define BIGFLOATSIZE 16 #else #error No 128bit float for this target! 
#endif uint64_t Significand; union { uint16_t Raw; struct { uint16_t Exponent : 15; uint16_t Sign : 1; }; } Top; X80SoftFloat() { memset(this, 0, sizeof(*this)); } X80SoftFloat(uint16_t _Sign, uint16_t _Exponent, uint64_t _Significand) : Significand {_Significand} , Top {.Raw = static_cast((_Exponent & 0x7FFF) | (_Sign << 15))} {} fextl::string str() const { fextl::ostringstream string; string << std::hex << Top.Sign; string << "_" << Top.Exponent; string << "_" << (Significand >> 63); string << "_" << (Significand & ((1ULL << 63) - 1)); return string.str(); } // Ops FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FADD(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 faddp; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else return extF80_add(state, lhs, rhs); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FSUB(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fsubp; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else return extF80_sub(state, lhs, rhs); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FMUL(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fmulp; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else return extF80_mul(state, lhs, rhs); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FDIV(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fdivp; fstpt %[result]; )" : [result] 
"=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else return extF80_div(state, lhs, rhs); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FREM(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #if defined(DEBUG_X86_FLOAT) BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fprem; fstpt %[result]; ffreep %%st(0); )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else /* * Check for invalid operation cases first - Intel FPREM sets Invalid Operation * for several cases including infinity dividend and zero divisor. */ X80SoftFloat result = 0; if (HandleInfinityOp(state, lhs, result)) { return result; } else if (lhs.Top.Exponent == 0x7FFF && (lhs.Significand & 0x7FFFFFFFFFFFFFFFULL)) { // NaN // propagate NaN state->exceptionFlags |= softfloat_flag_invalid; return lhs; } // Check for zero divisor - fprem(x, 0) is invalid operation if (rhs.Top.Exponent == 0 && rhs.Significand == 0) { state->exceptionFlags |= softfloat_flag_invalid; // Return QNaN result.Top.Sign = 0; result.Top.Exponent = 0x7FFF; result.Significand = 0xC000000000000000ULL; return result; } /* * FPREM is not an IEEE-754 remainder. From the Intel spec: * * Computes the remainder obtained from dividing the value in the ST(0) * register (the dividend) by the value in the ST(1) register (the divisor * or modulus), and stores the result in ST(0). The remainder represents the * following value: * * Remainder := ST(0) − (Q * ST(1)) * * Here, Q is an integer value that is obtained by truncating the * floating-point number quotient of [ST(0) / ST(1)] toward zero. * * We implement this sequence literally. softfloat_round_minMag means * "truncate towards zero". 
*/ extFloat80_t quotient = extF80_div(state, lhs, rhs); extFloat80_t Q = extF80_roundToInt(state, quotient, softfloat_round_minMag, true); bool Q_zero = Q.signif == 0 && (Q.signExp & ~(1 << 15)) == 0; if (Q_zero) { return lhs; } else { return extF80_sub(state, lhs, extF80_mul(state, Q, rhs)); } #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FREM1(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #if defined(DEBUG_X86_FLOAT) BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fprem1; fstpt %[result]; ffreep %%st(0); )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else return extF80_rem(state, lhs, rhs); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FRNDINT(softfloat_state* state, const X80SoftFloat& lhs) { return extF80_roundToInt(state, lhs, state->roundingMode, false); } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FRNDINT(softfloat_state* state, const X80SoftFloat& lhs, uint_fast8_t RoundMode) { return extF80_roundToInt(state, lhs, RoundMode, false); } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FXTRACT_SIG(const X80SoftFloat& lhs) { #if defined(DEBUG_X86_FLOAT) BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fxtract; fstpt %[result]; ffreep %%st(0); )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st", "st(1)"); return Result; #else // Zero is a special case, the significand for +/- 0 is +/- zero. 
if (lhs.Top.Exponent == 0x0 && lhs.Significand == 0x0) { return lhs; } X80SoftFloat Tmp = lhs; Tmp.Top.Exponent = 0x3FFF; Tmp.Top.Sign = lhs.Top.Sign; return Tmp; #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FXTRACT_EXP(const X80SoftFloat& lhs) { #if defined(DEBUG_X86_FLOAT) BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fxtract; ffreep %%st(0); fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st", "st(1)"); return Result; #else // Zero is a special case, the exponent is always -inf if (lhs.Top.Exponent == 0x0 && lhs.Significand == 0x0) { X80SoftFloat Result(1, 0x7FFFUL, 0x8000'0000'0000'0000UL); return Result; } int32_t TrueExp = lhs.Top.Exponent - ExponentBias; return i32_to_extF80(TrueExp); #endif } FEXCORE_PRESERVE_ALL_ATTR static void FCMP(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs, bool* eq, bool* lt, bool* nan) { *eq = extF80_eq(state, lhs, rhs); *lt = extF80_lt(state, lhs, rhs); // Use IEEE 754 semantics: unordered if neither <, =, nor > is true // This is more reliable than custom NaN detection bool gt = !(*eq) && !(*lt) && extF80_le(state, rhs, lhs); *nan = !(*eq) && !(*lt) && !gt; } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FSCALE(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st1 fldt %[lhs]; # st0 fscale; # st0 = st0 * 2^(rdint(st1)) fstpt %[result]; ffreep %%st(0); )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else extFloat80_t Zero {0, 0}; if (extF80_eq(state, lhs, Zero)) { return lhs; } X80SoftFloat Int = FRNDINT(state, rhs, softfloat_round_minMag); BIGFLOAT Src2_d = Int.ToFMax(state); Src2_d = FEXCore::cephes_128bit::exp2l(Src2_d); X80SoftFloat Src2_X80(state, Src2_d); X80SoftFloat Result = extF80_mul(state, lhs, Src2_X80); return Result; #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat F2XM1(softfloat_state* state, const 
X80SoftFloat& lhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 f2xm1; # st0 = 2^st(0) - 1 fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st"); return Result; #else auto Src1_d = lhs.ToFMax(state); auto Result = FEXCore::cephes_128bit::exp2l(Src1_d); static const float128_t one {0x0ULL, 0x3fff000000000000ULL}; return X80SoftFloat(state, f128_sub(state, Result, one)); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FYL2X(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[rhs]; # st(1) fldt %[lhs]; # st(0) fyl2x; # st(1) * log2l(st(0)) fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else auto Src1_d = lhs.ToFMax(state); auto Src2_d = rhs.ToFMax(state); auto Tmp = f128_mul(state, Src2_d, FEXCore::cephes_128bit::log2l(Src1_d)); return X80SoftFloat(state, Tmp); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FATAN(softfloat_state* state, const X80SoftFloat& lhs, const X80SoftFloat& rhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; fldt %[rhs]; fpatan; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs), [rhs] "m"(rhs) : "st", "st(1)"); return Result; #else BIGFLOAT Src1_d = lhs.ToFMax(state); BIGFLOAT Src2_d = rhs.ToFMax(state); BIGFLOAT Tmp = FEXCore::cephes_128bit::atan2l(Src1_d, Src2_d); return X80SoftFloat(state, Tmp); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FTAN(softfloat_state* state, const X80SoftFloat& lhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fptan; ffreep %%st(0); fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st"); return Result; #else X80SoftFloat result; if (HandleInfinityOp(state, lhs, result)) { return result; } BIGFLOAT Src_d = lhs.ToFMax(state); Src_d = FEXCore::cephes_128bit::tanl(Src_d); return X80SoftFloat(state, Src_d); #endif } 
FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FSIN(softfloat_state* state, const X80SoftFloat& lhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fsin; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st"); return Result; #else X80SoftFloat result; if (HandleInfinityOp(state, lhs, result)) { return result; } BIGFLOAT Src_d = lhs.ToFMax(state); Src_d = FEXCore::cephes_128bit::sinl(Src_d); return X80SoftFloat(state, Src_d); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FCOS(softfloat_state* state, const X80SoftFloat& lhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fcos; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st"); return Result; #else X80SoftFloat result; if (HandleInfinityOp(state, lhs, result)) { return result; } BIGFLOAT Src_d = lhs.ToFMax(state); Src_d = FEXCore::cephes_128bit::cosl(Src_d); return X80SoftFloat(state, Src_d); #endif } FEXCORE_PRESERVE_ALL_ATTR static X80SoftFloat FSQRT(softfloat_state* state, const X80SoftFloat& lhs) { #ifdef DEBUG_X86_FLOAT BIGFLOAT Result; asm(R"( fninit; fldt %[lhs]; # st0 fsqrt; fstpt %[result]; )" : [result] "=m"(Result) : [lhs] "m"(lhs) : "st"); return Result; #else return extF80_sqrt(state, lhs); #endif } float ToF32(softfloat_state* state) const { const float32_t Result = extF80_to_f32(state, *this); return std::bit_cast(Result); } double ToF64(softfloat_state* state) const { const float64_t Result = extF80_to_f64(state, *this); return std::bit_cast(Result); } FEXCore::VectorRegType ToVector() const { FEXCore::VectorRegType Ret {}; memcpy(&Ret, this, sizeof(*this)); return Ret; } BIGFLOAT ToFMax(softfloat_state* state) const { #if BIGFLOATSIZE == 16 const float128_t Result = extF80_to_f128(state, *this); return std::bit_cast(Result); #else BIGFLOAT result {}; memcpy(&result, this, sizeof(result)); return result; #endif } int16_t ToI16(softfloat_state* state) const { auto rv = extF80_to_i32(state, *this, 
state->roundingMode, false); if (rv > INT16_MAX || rv < INT16_MIN) { ///< Indefinite value for 16-bit conversions. return INT16_MIN; } else { return rv; } } int32_t ToI32(softfloat_state* state) const { return extF80_to_i32(state, *this, state->roundingMode, false); } int64_t ToI64(softfloat_state* state) const { return extF80_to_i64(state, *this, state->roundingMode, false); } uint64_t ToUI64(softfloat_state* state) const { return extF80_to_ui64(state, *this, state->roundingMode, false); } void operator=(const int16_t rhs) { *this = i32_to_extF80(rhs); } void operator=(const int32_t rhs) { *this = i32_to_extF80(rhs); } void operator=(const uint64_t rhs) { *this = ui64_to_extF80(rhs); } #if BIGFLOATSIZE == 10 void operator=(const long double rhs) { memcpy(this, &rhs, sizeof(rhs)); } #endif operator void*() { return reinterpret_cast(this); } X80SoftFloat(extFloat80_t rhs) { Significand = rhs.signif; Top.Raw = rhs.signExp; } X80SoftFloat(softfloat_state* state, const float rhs) { *this = f32_to_extF80(state, std::bit_cast(rhs)); } X80SoftFloat(softfloat_state* state, const double rhs) { *this = f64_to_extF80(state, std::bit_cast(rhs)); } X80SoftFloat(softfloat_state* state, BIGFLOAT rhs) { #if BIGFLOATSIZE == 16 *this = f128_to_extF80(state, std::bit_cast(rhs)); #else *this = std::bit_cast(rhs); #endif } X80SoftFloat(const int16_t rhs) { *this = i32_to_extF80(rhs); } X80SoftFloat(const int32_t rhs) { *this = i32_to_extF80(rhs); } X80SoftFloat(const FEXCore::VectorRegType rhs) { memcpy(this, &rhs, sizeof(*this)); } void operator=(extFloat80_t rhs) { Significand = rhs.signif; Top.Raw = rhs.signExp; } operator FEXCore::VectorRegType() const { return ToVector(); } operator extFloat80_t() const { extFloat80_t Result {}; Result.signif = Significand; Result.signExp = Top.Raw; return Result; } static bool IsNan(const X80SoftFloat& lhs) { return (lhs.Top.Exponent == 0x7FFF) && (lhs.Significand & IntegerBit) && (lhs.Significand & Bottom62Significand); } static bool 
SignBit(const X80SoftFloat& lhs) { return lhs.Top.Sign; } private: static constexpr uint64_t IntegerBit = (1ULL << 63); static constexpr uint64_t Bottom62Significand = ((1ULL << 62) - 1); static constexpr uint32_t ExponentBias = 16383; // Helper function to check for infinity and set invalid operation flag. // Returns true if infinity is dealt with, false otherwise. FEXCORE_PRESERVE_ALL_ATTR static bool HandleInfinityOp(softfloat_state* state, const X80SoftFloat& arg, X80SoftFloat& result) { if (arg.Top.Exponent == 0x7FFF && arg.Significand == 0x8000000000000000ULL) { state->exceptionFlags |= softfloat_flag_invalid; // Return QNaN. result.Top.Sign = 0; result.Top.Exponent = 0x7FFF; result.Significand = 0xC000000000000000ULL; return true; } return false; } }; static_assert(sizeof(X80SoftFloat) == 10, "tword must be 10bytes in size"); ================================================ FILE: FEXCore/Source/Common/StringConv.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include namespace FEXCore::StrConv { template bool Conv(std::string_view Value, T* Result) { if constexpr (std::is_signed_v) { *Result = static_cast(std::strtoll(Value.data(), nullptr, 0)); } else { *Result = static_cast(std::strtoull(Value.data(), nullptr, 0)); } return true; } template, T>> bool Conv(std::string_view Value, T* Result) { *Result = static_cast(std::strtoull(Value.data(), nullptr, 0)); return true; } inline bool Conv(std::string_view Value, fextl::string* Result) { *Result = Value; return true; } } // namespace FEXCore::StrConv ================================================ FILE: FEXCore/Source/Common/VectorRegType.h ================================================ // SPDX-License-Identifier: MIT #pragma once #ifdef ARCHITECTURE_x86_64 #include #include #else #include #endif namespace FEXCore { struct VectorScalarF64Pair { double val[2]; }; #ifdef ARCHITECTURE_arm64 // Can't use uint8x16_t directly from arm_neon.h here. 
// Overrides softfloat-3e's defines which causes problems.
#ifdef __clang__
using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t;
#else
using VectorRegType = __attribute__((vector_size(16))) uint8_t;
#endif

// Two 16-byte vector registers treated as one value.
struct VectorRegPairType {
  VectorRegType val[2];
};

// Builds a register pair with `low` in val[0] and `high` in val[1].
static inline VectorRegPairType MakeVectorRegPair(VectorRegType low, VectorRegType high) {
  return VectorRegPairType {low, high};
}
#elif defined(ARCHITECTURE_x86_64)
using VectorRegType = __m128i;
using VectorRegPairType = __m256i;

// Builds a 256-bit value from two 128-bit halves; note _mm256_set_m128i takes the high half first.
static inline VectorRegPairType MakeVectorRegPair(VectorRegType low, VectorRegType high) {
  return _mm256_set_m128i(high, low);
}
#endif
} // namespace FEXCore

================================================
FILE: FEXCore/Source/Interface/Config/Config.cpp
================================================
// SPDX-License-Identifier: MIT
#include "Common/StringConv.h"
#include "FEXCore/Utils/EnumUtils.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace FEXCore::Context {
class Context;
}

namespace FEXCore::Config {
namespace detail {
// Each OPT_* macro expands one option entry of the included file into a constant named after the
// option's enum and initialised with the option's default value.
#define P(x) x
#define OPT_BASE(type, group, enum, json, default) const P(type) P(enum) = P(default);
#define OPT_STR(group, enum, json, default) const std::string_view P(enum) = P(default);
#define OPT_STRARRAY(group, enum, json, default) OPT_STR(group, enum, json, default)
#define OPT_STRENUM(group, enum, json, default) const uint64_t P(enum) = FEXCore::ToUnderlying(P(default));
#include
} // namespace detail

// Indices into the `Paths` array below. Every *_LOCAL entry is immediately followed by its
// *_GLOBAL counterpart so that `<LOCAL index> + Global` (bool promoted to 0 or 1) selects the
// right slot.
enum Paths {
  PATH_DATA_DIR_LOCAL = 0,
  PATH_DATA_DIR_GLOBAL,
  PATH_CONFIG_DIR_LOCAL,
  PATH_CONFIG_DIR_GLOBAL,
  PATH_CONFIG_FILE_LOCAL,
  PATH_CONFIG_FILE_GLOBAL,
  PATH_CONFIG_TELEMETRY_FOLDER,
  PATH_LAST,
};
static std::array Paths;

// Stores the local (Global == false) or global (Global == true) data directory.
void SetDataDirectory(const std::string_view Path, bool Global) {
  Paths[PATH_DATA_DIR_LOCAL + Global] = Path;
}

// Stores the local or global config directory.
void SetConfigDirectory(const std::string_view Path, bool Global) {
  Paths[PATH_CONFIG_DIR_LOCAL + Global] = Path;
}

// Stores the local or global config file location.
void SetConfigFileLocation(const std::string_view Path, bool Global) {
  Paths[PATH_CONFIG_FILE_LOCAL + Global] = Path;
}

// Returns the telemetry directory, computing and caching it on first use.
// A non-empty TelemetryDirectory option wins (with "/" appended); otherwise the local data
// directory plus "Telemetry/" is used.
const fextl::string& GetTelemetryDirectory() {
  auto& Path = Paths[PATH_CONFIG_TELEMETRY_FOLDER];
  if (Path.empty()) {
    FEX_CONFIG_OPT(TelemetryDirectory, TELEMETRYDIRECTORY);
    if (!TelemetryDirectory().empty()) {
      Path = TelemetryDirectory;
      Path += "/";
    } else {
      Path = Config::GetDataDirectory(false) + "Telemetry/";
    }
  }
  return Path;
}

// Accessors for the directories stored by the setters above.
const fextl::string& GetDataDirectory(bool Global) {
  return Paths[PATH_DATA_DIR_LOCAL + Global];
}

const fextl::string& GetConfigDirectory(bool Global) {
  return Paths[PATH_CONFIG_DIR_LOCAL + Global];
}

const fextl::string& GetConfigFileLocation(bool Global) {
  return Paths[PATH_CONFIG_FILE_LOCAL + Global];
}

// Returns the path of the per-application config file for `Program`:
// "<config dir>AppConfig/<Program>.json".
// For the local variant the config directory and its AppConfig/ subfolder are created when
// missing; if either creation fails the function falls back to "./<Program>.json".
fextl::string GetApplicationConfig(const std::string_view Program, bool Global) {
  fextl::string ConfigFile = GetConfigDirectory(Global);

  if (!Global && !FHU::Filesystem::Exists(ConfigFile) && !FHU::Filesystem::CreateDirectories(ConfigFile)) {
    LogMan::Msg::DFmt("Couldn't create config directory: '{}'", ConfigFile);
    // Let's go local in this case
    return fextl::fmt::format("./{}.json", Program);
  }

  ConfigFile += "AppConfig/";

  // Attempt to create the local folder if it doesn't exist
  if (!Global && !FHU::Filesystem::Exists(ConfigFile) && !FHU::Filesystem::CreateDirectories(ConfigFile)) {
    // Let's go local in this case
    return fextl::fmt::format("./{}.json", Program);
  }

  return fextl::fmt::format("{}{}.json", ConfigFile, Program);
}

// All registered layers, keyed by layer type.
static fextl::map> ConfigLayers;

class MetaLayer;
// The merged view of all layers; set by Initialize() and cleared by Shutdown().
static FEXCore::Config::MetaLayer* Meta {};

// Order in which layers are loaded and merged. Merging assigns over existing keys, so an option
// set by a layer later in this list replaces the value from an earlier one.
constexpr std::array LoadOrder = {
  FEXCore::Config::LayerType::LAYER_GLOBAL_MAIN,
  FEXCore::Config::LayerType::LAYER_MAIN,
  FEXCore::Config::LayerType::LAYER_GLOBAL_STEAM_APP,
  FEXCore::Config::LayerType::LAYER_GLOBAL_APP,
  FEXCore::Config::LayerType::LAYER_LOCAL_STEAM_APP,
  FEXCore::Config::LayerType::LAYER_LOCAL_APP,
  FEXCore::Config::LayerType::LAYER_ARGUMENTS,
  FEXCore::Config::LayerType::LAYER_USER_OVERRIDE,
  FEXCore::Config::LayerType::LAYER_ENVIRONMENT,
  FEXCore::Config::LayerType::LAYER_TOP};

Layer::Layer(const LayerType _Type)
  : Type {_Type} {}

Layer::~Layer() {}

// Layer whose option map is the overlay of every other registered layer.
class MetaLayer final : public FEXCore::Config::Layer {
public:
  MetaLayer(const LayerType _Type)
    : FEXCore::Config::Layer(_Type) {}
  ~MetaLayer() {}

  // Rebuilds this layer's option map from all other layers (see definition below).
  void Load();

  // Returns the value stored for `Option` converted to T, or std::nullopt when the option is not
  // present. A value that is still held as a string is converted with StrConv::Conv and the
  // converted value is written back into the option map before being returned.
  // Reaching the end of the function (value neither already converted nor convertible) is treated
  // as unreachable.
  template requires (!std::is_same_v && !std::is_same_v)
  std::optional GetConv(ConfigOption Option) {
    const auto it = OptionMap.find(Option);
    if (it == OptionMap.end()) {
      return std::nullopt;
    }

    const auto& Value = it->second;
    LOGMAN_THROW_A_FMT(!std::holds_alternative(Value), "Tried to get config of invalid type!");

    if (std::holds_alternative(Value)) [[likely]] {
      return std::get(Value);
    }

    T ConvertedValue;
    if (std::holds_alternative(Value)) {
      const auto& StrVal = std::get(Value);
      if (FEXCore::StrConv::Conv(StrVal, &ConvertedValue)) {
        // Convert the value.
        OptionMap[Option].emplace(ConvertedValue);
        return ConvertedValue;
      } else {
        LOGMAN_MSG_A_FMT("Couldn't Convert {} to specified type!", StrVal);
      }
    }

    FEX_UNREACHABLE;
  }

private:
  void MergeConfigMap(const LayerOptions& Options);
  void MergeEnvironmentVariables(const ConfigOption& Option, const StringArrayType& Value);
};

// Clears the merged option map and re-merges every registered layer in LoadOrder, skipping any
// layer whose type equals this layer's own type.
void MetaLayer::Load() {
  OptionMap.clear();

  for (auto CurrentLayer = LoadOrder.begin(); CurrentLayer != LoadOrder.end(); ++CurrentLayer) {
    auto it = ConfigLayers.find(*CurrentLayer);
    if (it != ConfigLayers.end() && *CurrentLayer != Type) {
      // Merge this layer's options to this layer
      MergeConfigMap(it->second->GetOptionMap());
    }
  }
}

// Merges the "KEY=VALUE" entries in `Value` into the entries already stored for `Option`.
// Entries are keyed by the text before the first '='; an incoming entry replaces an existing one
// with the same key, and entries without '=' are dropped. The merged entries are re-added by
// iterating the lookup map, so their resulting order follows that map's iteration order.
void MetaLayer::MergeEnvironmentVariables(const ConfigOption& Option, const StringArrayType& Value) {
  // Environment variables need a bit of additional work
  // We want to merge the arrays rather than overwrite entirely
  auto MetaEnvironment = OptionMap.find(Option);
  if (MetaEnvironment == OptionMap.end()) {
    // Doesn't exist, just insert
    OptionMap.insert_or_assign(Option, Value);
    return;
  }

  // If an environment variable exists in both current meta and in the incoming layer then the meta layer value is overwritten
  fextl::unordered_map LookupMap;
  const auto AddToMap = [&LookupMap](const StringArrayType& Value) {
    for (const auto& EnvVar : Value) {
      const auto ItEq = EnvVar.find_first_of('=');
      if (ItEq == fextl::string::npos) {
        // Broken environment variable
        // Skip
        continue;
      }
      auto Key = fextl::string(EnvVar.begin(), EnvVar.begin() + ItEq);
      auto Value = fextl::string(EnvVar.begin() + ItEq + 1, EnvVar.end());

      // Add the key to the map, overwriting whatever previous value was there
      LookupMap.insert_or_assign(std::move(Key), std::move(Value));
    }
  };

  // Existing entries first, incoming entries second, so incoming values win on key collisions.
  AddToMap(std::get(MetaEnvironment->second));
  AddToMap(Value);

  // Now with the two layers merged in the map
  // Add all the values to the option
  Erase(Option);
  for (auto& Val : LookupMap) {
    // Set will emplace multiple options in to its list
    AppendStrArrayValue(Option, Val.first + "=" + Val.second);
  }
}

// Overlays `Options` onto the merged option map. CONFIG_ENV and CONFIG_HOSTENV are merged entry by
// entry through MergeEnvironmentVariables(); every other option is replaced wholesale.
void MetaLayer::MergeConfigMap(const LayerOptions& Options) {
  // Insert this layer's options, overlaying previous options that exist here
  for (auto& it : Options) {
    if (it.first == FEXCore::Config::ConfigOption::CONFIG_ENV || it.first == FEXCore::Config::ConfigOption::CONFIG_HOSTENV) {
      LOGMAN_THROW_A_FMT(std::holds_alternative(it.second), "Tried to get config of invalid type!");
      MergeEnvironmentVariables(it.first, std::get(it.second));
    } else {
      OptionMap.insert_or_assign(it.first, it.second);
    }
  }
}

// Registers the LAYER_TOP meta layer and caches a pointer to it in `Meta`.
// NOTE(review): `Meta` is taken from ConfigLayers.begin(), which is only the layer added here if
// it sorts first in the map (e.g. the map was empty beforehand) - confirm callers always call
// Initialize() before adding any other layer.
void Initialize() {
  AddLayer(fextl::make_unique(FEXCore::Config::LayerType::LAYER_TOP));
  Meta = dynamic_cast(ConfigLayers.begin()->second.get());
}

// Destroys all layers and drops the cached meta layer pointer.
void Shutdown() {
  ConfigLayers.clear();
  Meta = nullptr;
}

// Calls Load() on every registered layer, in LoadOrder.
void Load() {
  for (auto CurrentLayer = LoadOrder.begin(); CurrentLayer != LoadOrder.end(); ++CurrentLayer) {
    auto it = ConfigLayers.find(*CurrentLayer);
    if (it != ConfigLayers.end()) {
      it->second->Load();
    }
  }
}

// Returns a rewritten form of `PathName`, or an empty string when no rewrite applies.
//   - Relative path starting with "~/": $HOME (empty if unset) followed by the rest of the path.
//   - Other relative path: its absolute form if that exists, else the path itself if it exists.
//   - Absolute path: `ContainerPrefix + PathName` when the plain path does not exist but the
//     prefixed one does.
// An absolute path that already exists therefore yields an empty string.
fextl::string ExpandPath(const fextl::string& ContainerPrefix, const fextl::string& PathName) {
  if (PathName.empty()) {
    return {};
  }

  // Expand home if it exists
  if (FHU::Filesystem::IsRelative(PathName)) {
    fextl::string Home = getenv("HOME") ?: "";
    // Home expansion only works if it is the first character
    // This matches bash behaviour
    if (PathName.starts_with("~/")) {
      Home.append(PathName.begin() + 1, PathName.end());
      return Home;
    }

    // Expand relative path to absolute
    char ExistsTempPath[PATH_MAX];
    char* RealPath = FHU::Filesystem::Absolute(PathName.c_str(), ExistsTempPath);
    if (RealPath && FHU::Filesystem::Exists(RealPath)) {
      return RealPath;
    }

    // Only return if it exists
    if (FHU::Filesystem::Exists(PathName)) {
      return PathName;
    }
  } else {
    // If the containerprefix and pathname isn't empty
    // Then we check if the pathname exists in our current namespace
    // If the path DOESN'T exist but DOES exist with the prefix applied
    // then redirect to the prefix
    //
    // This might not be expected behaviour for some edge cases but since
    // all paths aren't mounted inside the container, then it'll be fine
    //
    // Main catch case for this is the default thunk install folders
    // HostThunks: $CMAKE_INSTALL_PREFIX/lib/fex-emu/HostThunks/
    // GuestThunks: $CMAKE_INSTALL_PREFIX/share/fex-emu/GuestThunks/
    if (!ContainerPrefix.empty() && !PathName.empty()) {
      if (!FHU::Filesystem::Exists(PathName)) {
        auto ContainerPath = ContainerPrefix + PathName;
        if (FHU::Filesystem::Exists(ContainerPath)) {
          return ContainerPath;
        }
      }
    }
  }

  return {};
}

// File whose contents name the container manager the process is running under.
constexpr char ContainerManager[] = "/run/host/container-manager";

// Returns the trimmed contents of the container-manager file, or an empty string when the file
// does not exist or cannot be loaded.
fextl::string FindContainer() {
  // We only support pressure-vessel at the moment
  if (FHU::Filesystem::Exists(ContainerManager)) {
    fextl::string Manager {};
    if (FEXCore::FileLoading::LoadFile(Manager, ContainerManager)) {
      // Trim the whitespace, may contain a newline
      return FEXCore::StringUtils::Trim(Manager);
    }
  }
  return {};
}

// Returns "/run/host/" when the container-manager file names "pressure-vessel", otherwise an
// empty string.
fextl::string FindContainerPrefix() {
  // We only support pressure-vessel at the moment
  if (FHU::Filesystem::Exists(ContainerManager)) {
    fextl::string Manager {};
    if (FEXCore::FileLoading::LoadFile(Manager, ContainerManager)) {
      // Trim the whitespace, may contain a newline
      if (FEXCore::StringUtils::Trim(Manager) == "pressure-vessel") {
        // We are running inside of pressure vessel
        // Our $CMAKE_INSTALL_PREFIX paths are now inside of /run/host/$CMAKE_INSTALL_PREFIX
        return "/run/host/";
      }
    }
  }
  return {};
}

// Rebuilds the meta layer and then post-processes path-like and dependent options:
//   - RootFS / ThunkConfig: expanded with ExpandPath(); if that yields nothing and the value is
//     non-empty, it is treated as a name and looked up under "RootFS/" / "ThunkConfigs/" in the
//     data and config directories (global then local; the last existing candidate wins).
//   - ThunkHostLibs / ThunkGuestLibs / OutputLog: replaced by the expanded path when there is one
//     (OutputLog is left alone for "stdout", "stderr" and "server").
//   - DumpIR set to something other than "no" without PassManagerDumpIR: PassManagerDumpIR
//     defaults to AFTEROPT.
//   - SingleStep enabled: MaxInst is forced to "1".
void ReloadMetaLayer() {
  Meta->Load();

  const fextl::string ContainerPrefix {FindContainerPrefix()};
  auto ExpandPathIfExists = [&ContainerPrefix](FEXCore::Config::ConfigOption Config, const fextl::string& PathName) {
    const auto NewPath = ExpandPath(ContainerPrefix, PathName);
    if (!NewPath.empty()) {
      FEXCore::Config::Set(Config, NewPath);
    }
  };

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_ROOTFS)) {
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_ROOTFS);
    const auto ExpandedString = ExpandPath(ContainerPrefix, *PathName);
    if (!ExpandedString.empty()) {
      // Adjust the path if it ended up being relative
      FEXCore::Config::Set(FEXCore::Config::CONFIG_ROOTFS, ExpandedString);
    } else if (!PathName->empty()) {
      // If the filesystem doesn't exist then let's see if it exists in the fex-emu folder
      // A copy is taken before the loop because Set() below modifies the option PathName points at.
      const auto PathNameCopy = *PathName;
      for (auto Global : {true, false}) {
        for (auto DirectoryFetchers : {GetDataDirectory, GetConfigDirectory}) {
          fextl::string NamedRootFS = DirectoryFetchers(Global) + "RootFS/" + PathNameCopy;
          if (FHU::Filesystem::Exists(NamedRootFS)) {
            FEXCore::Config::Set(FEXCore::Config::CONFIG_ROOTFS, NamedRootFS);
          }
        }
      }
    }
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_THUNKHOSTLIBS)) {
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_THUNKHOSTLIBS);
    ExpandPathIfExists(FEXCore::Config::CONFIG_THUNKHOSTLIBS, *PathName);
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_THUNKGUESTLIBS)) {
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_THUNKGUESTLIBS);
    ExpandPathIfExists(FEXCore::Config::CONFIG_THUNKGUESTLIBS, *PathName);
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_THUNKCONFIG)) {
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_THUNKCONFIG);
    const auto ExpandedString = ExpandPath(ContainerPrefix, *PathName);
    if (!ExpandedString.empty()) {
      // Adjust the path if it ended up being relative
      FEXCore::Config::Set(FEXCore::Config::CONFIG_THUNKCONFIG, ExpandedString);
    } else if (!PathName->empty()) {
      // If the filesystem doesn't exist then let's see if it exists in the fex-emu folder
      const auto PathNameCopy = *PathName;
      for (auto Global : {true, false}) {
        for (auto DirectoryFetchers : {GetDataDirectory, GetConfigDirectory}) {
          fextl::string NamedConfig = DirectoryFetchers(Global) + "ThunkConfigs/" + PathNameCopy;
          if (FHU::Filesystem::Exists(NamedConfig)) {
            FEXCore::Config::Set(FEXCore::Config::CONFIG_THUNKCONFIG, NamedConfig);
          }
        }
      }
    }
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_OUTPUTLOG)) {
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_OUTPUTLOG);
    if (*PathName != "stdout" && *PathName != "stderr" && *PathName != "server") {
      ExpandPathIfExists(FEXCore::Config::CONFIG_OUTPUTLOG, *PathName);
    }
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_DUMPIR) && !FEXCore::Config::Exists(FEXCore::Config::CONFIG_PASSMANAGERDUMPIR)) {
    // If DumpIR is set but no PassManagerDumpIR configuration is set, then default to `afteropt`
    const auto PathName = *Meta->Get(FEXCore::Config::CONFIG_DUMPIR);
    if (*PathName != "no") {
      Set(FEXCore::Config::ConfigOption::CONFIG_PASSMANAGERDUMPIR,
          fextl::fmt::format("{}", static_cast(FEXCore::Config::PassManagerDumpIR::AFTEROPT)));
    }
  }

  if (FEXCore::Config::Exists(FEXCore::Config::CONFIG_SINGLESTEP) && Meta->GetConv(FEXCore::Config::CONFIG_SINGLESTEP).value_or(false)) {
    // Single stepping also enforces single instruction size blocks
    Set(FEXCore::Config::ConfigOption::CONFIG_MAXINST, "1");
  }
}

// Registers `_Layer` under its own layer type. emplace() is used, so a layer of a type that is
// already registered is not replaced.
void AddLayer(fextl::unique_ptr _Layer) {
  ConfigLayers.emplace(_Layer->GetLayerType(), std::move(_Layer));
}

// The free functions below forward to the meta layer; `Meta` must have been set by Initialize().
bool Exists(ConfigOption Option) {
  return Meta->OptionExists(Option);
}

std::optional All(ConfigOption Option) {
  return Meta->All(Option);
}

std::optional Get(ConfigOption Option) {
  return Meta->Get(Option);
}

template std::optional GetConv(ConfigOption Option) {
  return Meta->GetConv(Option);
}

// Explicit instantiations of GetConv for the types used outside this file.
template std::optional GetConv(ConfigOption Option);
template std::optional GetConv(ConfigOption Option);
template std::optional GetConv(ConfigOption Option);
template std::optional GetConv(ConfigOption Option);
template std::optional GetConv(ConfigOption Option);

void Set(ConfigOption Option, std::string_view Data) {
  Meta->Set(Option, Data);
}

void Erase(ConfigOption Option) {
  Meta->Erase(Option);
}

// Returns the converted value of `Option`, or `Default` when the option is not set.
template T Value::GetIfExists(FEXCore::Config::ConfigOption Option, T Default) {
  auto Value = FEXCore::Config::GetConv(Option);
  if (Value) {
    return *Value;
  }
  return Default;
}

// String specialisations: return a copy of the stored string, or `Default` when the option is not set.
template<> fextl::string Value::GetIfExists(FEXCore::Config::ConfigOption Option, fextl::string Default) {
  auto Value = FEXCore::Config::Get(Option);
  if (Value) {
    return **Value;
  } else {
    return Default;
  }
}

template<> fextl::string Value::GetIfExists(FEXCore::Config::ConfigOption Option, std::string_view Default) {
  auto Value = FEXCore::Config::Get(Option);
  if (Value) {
    return **Value;
  } else {
    return fextl::string(Default);
  }
}

// Explicit instantiations of GetIfExists for the supported scalar types.
template bool Value::GetIfExists(FEXCore::Config::ConfigOption Option, bool Default);
template int8_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, int8_t Default);
template uint8_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, uint8_t Default);
template int16_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, int16_t Default);
template uint16_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, uint16_t Default);
template int32_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, int32_t Default);
template uint32_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, uint32_t Default);
template int64_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, int64_t Default);
template uint64_t Value::GetIfExists(FEXCore::Config::ConfigOption Option, uint64_t Default);

// Constructor
template Value::Value(FEXCore::Config::ConfigOption _Option, fextl::string Default);
template Value::Value(FEXCore::Config::ConfigOption _Option, bool Default);
template Value::Value(FEXCore::Config::ConfigOption _Option, uint8_t Default);
template Value::Value(FEXCore::Config::ConfigOption _Option, uint64_t Default);

// Replaces the contents of `*List` with all values stored for `Option`; the list is left empty
// when the option is not set.
template void Value::GetListIfExists(FEXCore::Config::ConfigOption Option, StringArrayType* List) {
  auto Value = FEXCore::Config::All(Option);
  List->clear();
  if (Value) {
    *List = **Value;
  }
}

template void Value::GetListIfExists(FEXCore::Config::ConfigOption Option, StringArrayType* List);
} // namespace FEXCore::Config

================================================
FILE: FEXCore/Source/Interface/Config/Config.json.in
================================================
{ "Options": { "CPU": { "Multiblock": { "Type": "bool", "Default": "true", "Desc": [ "Controls multiblock code compilation", "Can cause long JIT compilation times and stutter" ] }, "MaxInst": { "Type": "int32", "Default": "5000", "Desc": [ "Maximum number of instruction to store in a block" ] }, "EnableCodeCachingWIP": { "Type": "bool", "Default": "false", "Desc": [ "Enable the code caching subsystem" ] }, "EnableCodeCacheValidation": { "Type": "bool", "Default": "false", "Desc": [ "Enable expensive validation when loading code caches" ] }, "HostFeatures": { "Type": "strenum", "Default": "FEXCore::Config::HostFeatures::OFF", "Enums": { "ENABLESVE": "enablesve", "DISABLESVE": "disablesve", "ENABLEAVX": "enableavx", "DISABLEAVX": "disableavx", "ENABLEAFP": "enableafp", "DISABLEAFP": "disableafp", "ENABLELRCPC": "enablelrcpc", "DISABLELRCPC": "disablelrcpc", "ENABLELRCPC2": "enablelrcpc2", "DISABLELRCPC2": "disablelrcpc2", "ENABLECSSC": "enablecssc", "DISABLECSSC": "disablecssc", "ENABLEPMULL128": "enablepmull128", "DISABLEPMULL128": "disablepmull128", "ENABLERNG": "enablerng", "DISABLERNG":
"disablerng", "ENABLECLZERO": "enableclzero", "DISABLECLZERO": "disableclzero", "ENABLEATOMICS": "enableatomics", "DISABLEATOMICS": "disableatomics", "ENABLEFCMA": "enablefcma", "DISABLEFCMA": "disablefcma", "ENABLEFLAGM": "enableflagm", "DISABLEFLAGM": "disableflagm", "ENABLEFLAGM2": "enableflagm2", "DISABLEFLAGM2": "disableflagm2", "ENABLEFRINTTS": "enablefrintts", "DISABLEFRINTTS": "disablefrintts", "ENABLECRYPTO": "enablecrypto", "DISABLECRYPTO": "disablecrypto", "ENABLERPRES": "enablerpres", "DISABLERPRES": "disablerpres", "ENABLESVEBITPERM": "enablesvebitperm", "DISABLESVEBITPERM": "disablesvebitperm", "ENABLEPRESERVEALLABI": "enablepreserveallabi", "DISABLEPRESERVEALLABI": "disablepreserveallabi", "ENABLEWFXT": "enablewfxt", "DISABLEWFXT": "disablewfxt", "ENABLE3DNOW": "enable3dnow", "DISABLE3DNOW": "disable3dnow", "ENABLESSE4A": "enablesse4a", "DISABLESSE4A": "disablesse4a", "ENABLEMOPS": "enablemops", "DISABLEMOPS": "disablemops" }, "Desc": [ "Allows controlling of the CPU features in the JIT.", "\toff: Default CPU features queried from CPU features", "\t{enable,disable}sve: Will force enable or disable sve even if the host doesn't support it", "\t{enable,disable}avx: Will force enable or disable avx even if the host doesn't support it", "\t{enable,disable}afp: Will force enable or disable afp even if the host doesn't support it", "\t{enable,disable}lrcpc: Will force enable or disable lrcpc even if the host doesn't support it", "\t{enable,disable}lrcpc2: Will force enable or disable lrcpc2 even if the host doesn't support it", "\t{enable,disable}cssc: Will force enable or disable cssc even if the host doesn't support it", "\t{enable,disable}pmull128: Will force enable or disable pmull128 even if the host doesn't support it", "\t{enable,disable}rng: Will force enable or disable rng even if the host doesn't support it", "\t{enable,disable}clzero: Will force enable or disable clzero even if the host doesn't support it", "\t{enable,disable}atomics: Will force 
enable or disable ARMv8.1 LSE atomics even if the host doesn't support it", "\t{enable,disable}fcma: Will force enable or disable fcma even if the host doesn't support it", "\t{enable,disable}flagm: Will force enable or disable flagm even if the host doesn't support it", "\t{enable,disable}flagm2: Will force enable or disable flagm2 even if the host doesn't support it", "\t{enable,disable}crypto: Will force enable or disable crypto extensions even if the host doesn't support it", "\t{enable,disable}rpres: Will force enable or disable rpres even if the host doesn't support it", "\t{enable,disable}svebitperm: Will force enable or disable svebitperm even if the host doesn't support it", "\t{enable,disable}preserveallabi: Will force enable or disable preserve_all abi even if the host doesn't support it", "\t{enable,disable}wfxt: Will force enable or disable wfxt even if the host doesn't support it", "\t{enable,disable}3dnow: Will force enable or disable 3DNow! even if the host doesn't support it", "\t{enable,disable}sse4a: Will force enable or disable SSE4a even if the host doesn't support it", "\t{enable,disable}mops: Will force enable or disable FEAT_MOPS even if the host doesn't support it" ] }, "SmallTSCScale": { "Type": "bool", "Default": "true", "Desc": [ "Scales the cycle counter on systems that have low frequencies." ] }, "HideHybrid": { "Type": "bool", "Default": "true", "Desc": [ "Hides hybrid CPU core arrangement." 
] }, "CPUFeatureRegisters": { "Type": "str", "Default": "", "Desc": [ "Allows overriding cpu feature flags for manual testing" ] } }, "Emulation": { "RootFS": { "Type": "str", "Default": "", "Desc": [ "Which Root filesystem prefix to use", "This can be a filesystem path", "\teg: ~/RootFS/Debian_x86_64", "Or this can be a name of a rootfs", "If the named rootfs exists in the FEX data folder then it will use that one", "\teg: $XDG_DATA_HOME/fex-emu/RootFS//", "If XDG_DATA_HOME is unset, ~/.local/share will be used in its place.", "\teg: $HOME/.local/share/fex-emu/RootFS//" ] }, "ThunkHostLibs": { "Type": "str", "Default": "@CMAKE_INSTALL_FULL_LIBDIR@/fex-emu/HostThunks", "Desc": [ "Folder to find the host-side thunking libraries." ] }, "ThunkGuestLibs": { "Type": "str", "Default": "@CMAKE_INSTALL_PREFIX@/share/fex-emu/GuestThunks", "Desc": [ "Folder to find the guest-side thunking libraries." ] }, "ThunkConfig": { "Type": "str", "Default": "", "Desc": [ "A json file specifying where to overlay the thunks.", "This can be a filesystem path", "\teg: ~/MyThunkConfig.json", "Or this can be the name of a Thunk config file", "If the named config file exists in the FEX data folder then it will use that one", "\teg: $XDG_DATA_HOME/fex-emu/ThunkConfigs/", "If XDG_DATA_HOME is unset, ~/.local/share will be used in its place.", "\teg: $HOME/.local/share/fex-emu/ThunkConfigs/" ] }, "Env": { "Type": "strarray", "Default": "", "Desc": [ "Adds an environment variable to the emulated environment." ] }, "HostEnv": { "Type": "strarray", "Default": "", "Desc": [ "Adds an environment variable to the host environment.", "This can be useful for setting environment variables that thunks can pick up.", "Typically isn't necessary since the guest libc isn't thunked. But is possible."
] }, "AdditionalArguments": { "Type": "strarray", "Default": "", "Desc": [ "Allows the user to pass additional arguments to the application" ] }, "DisableL2Cache": { "Type": "bool", "Default": "true", "Desc": [ "Disables FEXCore's JIT L2 cache lookup. Saving memory.", "Can potentially introduce more stutters." ] }, "DynamicL1Cache": { "Type": "bool", "Default": "true", "Desc": [ "Switches FEXCore's JIT L1 cache to be dynamically sized. Saving memory.", "Can potentially introduce more stutters." ] }, "DynamicL1CacheIncreaseCountHeuristic": { "Type": "uint64", "Default": "250", "Desc": [ "Threshold of lookups per second that the L1 dynamic cache should increase its size.", "Lower numbers means more aggressive scaling upward to the maximum size.", "Higher numbers means more conservative scaling, using less memory.", "Can potentially introduce stutters, more likely the higher the number.", "Don't have this number smaller than the decrease count!" ] }, "DynamicL1CacheDecreaseCountHeuristic": { "Type": "uint64", "Default": "50", "Desc": [ "Threshold of lookups per second that the L1 dynamic cache should decrease its size.", "The higher the number, the more aggressively it reduces the L1 cache size.", "Lower numbers means more conservative memory savings.", "Can potentially introduce more stutters, more likely the higher the number.", "Don't have this number larger than the increase count!" ] } }, "Debug": { "SingleStep": { "Type": "bool", "Default": "false", "Desc": [ "Single stepping configuration." ] }, "GdbServer": { "Type": "bool", "Default": "false", "Desc": [ "Enables the GDB server." 
] }, "DumpIR": { "Type": "str", "Default": "no", "Desc": [ "Folder to dump the IR in to.", "[no, stdout, stderr, server, ]" ] }, "PassManagerDumpIR": { "Type": "strenum", "Default": "FEXCore::Config::PassManagerDumpIR::OFF", "Enums": { "BEFOREOPT": "beforeopt", "AFTEROPT": "afteropt", "BEFOREPASS": "beforepass", "AFTERPASS": "afterpass" }, "Desc": [ "Allows controlling when FEX dumps its IR.", "\toff: IR dumping will be disabled", "\tbeforeopt: Dump IR before any optimizations", "\tafteropt: Dump IR after all optimizations", "\tbeforepass: Dump IR before every optimization pass", "\tafterpass: Dump IR after every optimization pass" ] }, "DumpGPRs": { "Type": "bool", "Default": "false", "Desc": [ "When the test harness ends, print the GPR state." ] }, "O0": { "Type": "bool", "Default": "false", "Desc": [ "Disables optimization passes for debugging." ] }, "GlobalJITNaming": { "Type": "bool", "Default": "false", "Desc": [ "Uses JITSymbols to name all JIT state as one symbol", "Useful for querying how much time is spent inside of the JIT", "Profiling tools will show JIT time as FEXJIT" ] }, "LibraryJITNaming": { "Type": "bool", "Default": "false", "Desc": [ "Uses JITSymbols to name JIT symbols grouped by library", "Useful for querying how much time is spent in each guest library", "Can be used to help guide thunk generation" ] }, "BlockJITNaming": { "Type": "bool", "Default": "false", "Desc": [ "Uses JITSymbols to name JIT symbols", "Useful for determining hot blocks of code", "Has some file writing overhead per JIT block" ] }, "GDBSymbols": { "Type": "bool", "Default": "false", "Desc": [ "Integrates with GDB using the JIT interface.", "Needs the fex jit loader in GDB, which can be loaded via `jit-reader-load libFEXGDBReader.so.`", "Also needs x86_64-linux-gnu-objdump in PATH.", "Can be very slow."
] }, "InjectLibSegFault": { "Type": "bool", "Default": "false", "Desc": [ "Sets the environment variable LD_PRELOAD=libSegFault.so", "This allows the user to very easily enable libSegFault without dealing with environment variables", "Very useful for applications that have launch scripts that set the variable to nothing at launch", "Set this in an application configuration for injecting in to only specific applications.", "\tNote: If x86/x86_64 libSegFault.so isn't installed then this option won't work." ] }, "Disassemble": { "Type": "strenum", "Default": "FEXCore::Config::Disassemble::OFF", "Enums": { "DISPATCHER": "dispatcher", "BLOCKS": "blocks", "STATS": "stats" }, "Desc": [ "Allows controlling of the vixl disassembler for generated ARM code.", "\toff: No disassembly will be output", "\tdispatcher: Will enable disassembly of the JIT dispatcher loop", "\tblocks: Will enable disassembly of the translated instruction code blocks", "\tstats: Will print stats when disassembling the code" ] }, "X86Disassemble": { "Type": "bool", "Default": "false", "Desc": [ "Enables x86/x86-64 guest disassembly output for compiled blocks.", "Requires FEX to be built with -DENABLE_ZYDIS=TRUE" ] }, "ForceSVEWidth": { "Type": "uint32", "Default": "0", "Desc": [ "Allows overriding the SVE width in the vixl simulator.", "Useful as a debugging feature." 
] }, "DisableTelemetry": { "Type": "bool", "Default": "false", "Desc": [ "Disables telemetry at runtime.", "Useful for CI instcountCI mostly" ] } }, "Logging": { "SilentLog": { "Type": "bool", "Default": "true", "Desc": [ "Disables logging" ] }, "OutputLog": { "Type": "str", "Default": "server", "Desc": [ "File to write FEX output to.", "[stderr, server, ]" ] }, "TelemetryDirectory": { "Type": "str", "Default": "", "Desc": [ "Redirects the telemetry folder that FEX usually writes to.", "By default telemetry data is stored in {$FEX_APP_DATA_LOCATION,{$XDG_DATA_HOME,$HOME}/fex-emu/Telemetry/}" ] }, "ProfileStats": { "Type": "bool", "Default": "false", "Desc": [ "Enables FEX's low-overhead sampling profile statistics.", "Requires a supported version of Mangohud to see the results" ] }, "EnableGpuvisProfiling": { "Type": "bool", "Default": "false", "Desc": [ "Enables profiling when FEX was built with the gpuvis profiler backend." ] } }, "Hacks": { "SMCChecks": { "Type": "uint8", "Default": "FEXCore::Config::CONFIG_SMC_MTRACK", "TextDefault": "mtrack", "ArgumentHandler": "SMCCheckHandler", "Desc": [ "Checks code for modification before execution.", "\tnone: No checks", "\tmtrack: Page tracking based invalidation (default)", "\tfull: Validate code before every run (slow)" ] }, "TSOEnabled": { "Type": "bool", "Default": "true", "Desc": [ "Controls TSO IR ops.", "Highly likely to break any multithreaded application if disabled." ] }, "VectorTSOEnabled": { "Type": "bool", "Default": "false", "Desc": [ "When TSO emulation is enabled, controls if vector loadstores should also be atomic." 
] }, "MemcpySetTSOEnabled": { "Type": "bool", "Default": "false", "Desc": [ "When TSO emulation is enabled, controls if memcpy and memset should also be atomic.", "Only affects REP MOVS and REP STOS instructions" ] }, "HalfBarrierTSOEnabled": { "Type": "bool", "Default": "true", "Desc": [ "When TSO emulation is enabled, controls if unaligned loads and stores should be backpatched to half-barrier atomics.", "Can be dangerous due to aligned loadstores through the same code now become non-atomic." ] }, "StrictInProcessSplitLocks": { "Type": "bool", "Default": "false", "Desc": [ "Strict global lock when handling an unaligned atomic that crosses a 16-byte or cacheline granularity", "This is required to ensure a split-lock doesn't tear inside the process" ] }, "KernelUnalignedAtomicBackpatching": { "Type": "bool", "Default": "true", "Desc": [ "When the kernel unaligned atomic handler is enabled, use backpatching to reduce kernel context switches." ] }, "VolatileMetadata": { "Type": "bool", "Default": "true", "Desc": [ "Use volatile metadata in PE files to inform TSO instructions when available.", "When metadata is unavailable falls back to the currently enabled TSO options." ] }, "X87ReducedPrecision": { "Type": "bool", "Default": "false", "Desc": [ "Emulates X87 floating point using 64-bit precision. This reduces emulation accuracy and may result in rendering bugs." ] }, "StallProcess": { "Type": "bool", "Default": "false", "Desc": [ "Forces a process to stall out on initialization", "Useful for a process that keeps restarting and doesn't work" ] }, "HideHypervisorBit": { "Type": "bool", "Default": "false", "Desc": [ "Hides the hypervisor CPUID bit when set.", "Should only be used for applications that have issues with this set." ] }, "StartupSleep": { "Type": "uint32", "Default": "0", "Desc": [ "Sleeps the process at startup for a duration of seconds.", "Useful if an application crashes too quickly to attach a debugger." 
] }, "StartupSleepProcName": { "Type": "str", "Default": "", "Desc": [ "Constrains the startup sleep to only apply to processes that match this name." ] }, "MonoHacks": { "Type": "bool", "Default": "true", "Desc": [ "Permits a hook-based SMC approach and smaller JIT blocks when mono is detected." ] } }, "Misc": { "ServerSocketPath": { "Type": "str", "Default": "", "Desc": [ "Override for a FEXServer socket path. Only useful for chroots." ] }, "NeedsSeccomp": { "Type": "bool", "Default": "false", "Desc": [ "Disables inline syscalls in order to support seccomp handling" ] }, "ExtendedVolatileMetadata": { "Type": "str", "Default": "", "Desc": [ "Configuration provided volatile metadata. Only implemented for WoW64/arm64ec.", "Limited in its use but can be handy.", "Extends on top of what Microsoft has for volatile metadata, but also supported for WoW64.", "Colon delimited modules, then semi-colon delimited instructions, then comma delimited ranges", "Default disables TSO in the module, unless instructions overlap the range", ";-,...;,...:", "examples:", " * Disable TSO for a full module: Just provide the module name:", " `hl2_linux`", " * Disable TSO for a part of the module:", " `hl2_linux;-`", " * Disable TSO for a part of the module, but enable TSO for some instructions within the module", " `hl2_linux;-;,`", " * Disable TSO for multiple modules", " `hl2_linux:libsdl2.so`" ] } } }, "UnnamedOptions": { "Misc": { "INTERPRETER_INSTALLED": { "Type": "bool", "Default": "false" }, "APP_FILENAME": { "Type": "str", "Default": "" }, "APP_CONFIG_NAME": { "Type": "str", "Default": "", "Desc": [ "This is the application config name that has been loaded.", "This differs from APP_FILENAME in two ways", "Where APP_FILENAME always points to the executable path that FEX-Emu is executing.", "This matches what is used to load the AppLayer configuration name.", "When running through a compatibility layer like wine, this will only be the exe name, instead of wine full path."
] }, "IS64BIT_MODE": { "Type": "bool", "Default": "false" }, "DISABLE_VIXL_INDIRECT_RUNTIME_CALLS": { "Type": "bool", "Default": "true", "Desc": [ "This option is used for the InstructionCountCI so it can generate the same codegen between Arm64 hosts and vixl simulator hosts.", "Vixl simulator indirect runtime calls are a special hlt instruction with metadata after it. Effectively making a custom call instruction.", "With visual simulator calls disabled, the code generation would be the same as on a native Arm64 host, but running the code is broken." ] } } } }
================================================
FILE: FEXCore/Source/Interface/Context/Context.cpp
================================================
// SPDX-License-Identifier: MIT
#include "Interface/Context/Context.h"
#include "Interface/Core/OpcodeDispatcher.h"
#include "Interface/Core/Dispatcher/Dispatcher.h"
#include "Interface/Core/X86Tables/X86Tables.h"

// NOTE(review): the bare #include directives below have no target in this copy; restore the header names from upstream.
#include
#include
#include
#include
#include
#include
#include
#include "FEXCore/Debug/InternalThreadState.h"

namespace FEXCore::Context {
// Factory for the context object: constructs it from the supplied host features and hands ownership to the caller.
fextl::unique_ptr FEXCore::Context::Context::CreateNewContext(const FEXCore::HostFeatures& Features) {
  return fextl::make_unique(Features);
}

// Compiles the block at GuestRIP for Thread by forwarding to CompileBlock with the thread's current frame.
// The value returned by CompileBlock is discarded.
void FEXCore::Context::ContextImpl::CompileRIP(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP) {
  CompileBlock(Thread->CurrentFrame, GuestRIP);
}

// Same as CompileRIP, but additionally forwards MaxInst to CompileBlock.
void FEXCore::Context::ContextImpl::CompileRIPCount(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst) {
  CompileBlock(Thread->CurrentFrame, GuestRIP, MaxInst);
}

// Stores the signal delegator pointer; the context does not take ownership here.
void FEXCore::Context::ContextImpl::SetSignalDelegator(FEXCore::SignalDelegator* _SignalDelegation) {
  SignalDelegation = _SignalDelegation;
}

// Stores the syscall handler and caches the source code resolver it exposes.
// Handler is dereferenced unconditionally, so it must not be null.
void FEXCore::Context::ContextImpl::SetSyscallHandler(FEXCore::HLE::SyscallHandler* Handler) {
  SyscallHandler = Handler;
  SourcecodeResolver = Handler->GetSourcecodeResolver();
}

// Stores the thunk handler pointer.
void FEXCore::Context::ContextImpl::SetThunkHandler(FEXCore::ThunkHandler* Handler) {
  ThunkHandler = Handler;
}
// Answers an emulated CPUID query by delegating to the context's CPUID emulator.
FEXCore::CPUID::FunctionResults FEXCore::Context::ContextImpl::RunCPUIDFunction(uint32_t Function, uint32_t Leaf) {
  return CPUID.RunFunction(Function, Leaf);
}

// Answers an emulated XCR query by delegating to the context's CPUID emulator.
FEXCore::CPUID::XCRResults FEXCore::Context::ContextImpl::RunXCRFunction(uint32_t Function) {
  return CPUID.RunXCRFunction(Function);
}

// Variant of RunCPUIDFunction that also forwards a CPU index to the emulator's RunFunctionName.
FEXCore::CPUID::FunctionResults FEXCore::Context::ContextImpl::RunCPUIDFunctionName(uint32_t Function, uint32_t Leaf, uint32_t CPU) {
  return CPUID.RunFunctionName(Function, Leaf, CPU);
}

// Asks the thread's CPU backend whether Address lies inside its code buffer.
bool FEXCore::Context::ContextImpl::IsAddressInCodeBuffer(FEXCore::Core::InternalThreadState* Thread, uintptr_t Address) const {
  return Thread->CPUBackend->IsAddressInCodeBuffer(Address);
}
} // namespace FEXCore::Context
================================================
FILE: FEXCore/Source/Interface/Context/Context.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Common/JitSymbols.h"
#include "Interface/Core/CPUBackend.h"
#include "Interface/Core/CPUID.h"
// NOTE(review): the bare #include directives below have no target in this copy; restore the header names from upstream.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

// Forward declarations of types this header only refers to by pointer or reference.
namespace FEXCore {
class SignalDelegator;
class ThunkHandler;
struct LookupCacheWriteLockToken;

namespace Core {
  struct DebugData;
  struct InternalThreadState;
} // namespace Core

namespace CPU {
  class Dispatcher;
} // namespace CPU

namespace HLE {
  class SourcecodeResolver;
  class SyscallHandler;
} // namespace HLE
} // namespace FEXCore

namespace FEXCore::Context {
// Packed record passed to block delinker callbacks (see BlockDelinkerFunc).
struct FEX_PACKED ExitFunctionLinkData {
  uint64_t HostCode;
  uint64_t GuestRIP;
  int64_t CallerOffset;
};

// Pair of opaque pointers (Creator, Data) associated with a custom IR entrypoint.
struct CustomIRResult {
  void* Creator;
  void* Data;

  CustomIRResult(void* Creator, void* Data)
    : Creator(Creator)
    , Data(Data) {}
};

using BlockDelinkerFunc = void (*)(FEXCore::Context::ExitFunctionLinkData* Record);
constexpr uint32_t TSC_SCALE_MAXIMUM = 1'000'000'000; ///< 1GHz

class CodeCache : public AbstractCodeCache {
public:
  CodeCache(ContextImpl&);
~CodeCache();

  // Context this cache belongs to.
  ContextImpl& CTX;

  // State used by Validate().
  fextl::unique_ptr ValidationCTX;
  fextl::unique_ptr ValidationThread;
  FEXCore::Core::CPUState::gdt_segment ValidationGDT[32] {};

  // Set by InitiateCacheGeneration().
  bool IsGeneratingCache = false;

  FEX_CONFIG_OPT(EnableCodeCaching, ENABLECODECACHINGWIP);
  FEX_CONFIG_OPT(EnableCodeCacheValidation, ENABLECODECACHEVALIDATION);

  uint64_t ComputeCodeMapId(std::string_view Filename, int FD) override;

  bool SaveData(Core::InternalThreadState&, int TargetFD, const ExecutableFileSectionInfo&, uint64_t SerializedBaseAddress) override;
  bool LoadData(Core::InternalThreadState*, std::byte* MappedCacheFile, const ExecutableFileSectionInfo&) override;

  /**
   * Performs expensive extra validation on the loaded code cache data.
   *
   * This kicks off an in-process recompile of all cached blocks and compares
   * them with the cached data. Differences will be reported as fatal errors,
   * which can uncover bugs like for example:
   * - mismatches of the JIT configuration used during cache generation
   * - hidden position dependencies due to missing FEX relocations
   * - incorrect instruction padding
   */
  void Validate(const ExecutableFileSectionInfo&, fextl::set GuestBlocks, const fextl::set& HostBlocks, std::span CachedCode);

  // Marks this cache as being in generation mode.
  void InitiateCacheGeneration() override {
    IsGeneratingCache = true;
  }

  /**
   * Applies a set of FEX relocations to the given code section.
   *
   * FEX relocations describe runtime-dependencies of FEX-generated code.
   * When loading a code cache, they are used to move cached code to the
   * dynamically chosen base address of the guest binary.
   *
   * Conversely, relocations are applied in reverse when writing code caches
   * to ensure consistency across generation runs.
   *
   * Note that FEX relocations are unrelated to ELF/PE relocations.
   *
   * @param GuestDelta Guest address offset to apply to RIP-relative data
   * @param ForStorage True for serializing data (producing deterministic output); false for de-serializing it (resolving dynamic symbols)
   *
   * @return Returns true on success
   */
  [[nodiscard]]
  bool ApplyCodeRelocations(uint64_t GuestDelta, std::span Code, std::span Relocations, bool ForStorage);
};

// Concrete implementation of the public Context interface; also acts as the CPU::CodeBufferManager.
class ContextImpl final : public FEXCore::Context::Context, public CPU::CodeBufferManager {
public:
  // Context base class implementation.
  bool InitCore() override;

  void ExecuteThread(FEXCore::Core::InternalThreadState* Thread) override;

  bool CheckIfBlockIsCacheable(FEXCore::Core::InternalThreadState&, uint64_t GuestRIP, uint64_t MaxInst) override;
  void CompileRIP(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP) override;
  void CompileRIPCount(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst) override;

  void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) override;

  bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) override;
  bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) override;

  uint64_t GetGuestBlockEntry(FEXCore::Core::InternalThreadState* Thread) override;

  uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) override;
  uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) override;
  void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) override;

  void ReconstructXMMRegisters(const FEXCore::Core::InternalThreadState* Thread, __uint128_t* XMM_Low, __uint128_t* YMM_High) override;
  void SetXMMRegistersFromState(FEXCore::Core::InternalThreadState* Thread, const __uint128_t* XMM_Low, const __uint128_t* YMM_High) override;

  /**
   * @brief Used to create FEX thread objects in preparation for creating a true
OS thread. Does set a TID or PID.
   *
   * @param InitialRIP The starting RIP of this thread
   * @param StackPointer The starting RSP of this thread
   * @param NewThreadState The initial thread state to setup for our state, if inheriting.
   *
   * @return The InternalThreadState object that tracks all of the emulated thread's state
   *
   * Usecases:
   *  Parent thread Creation:
   *    - Thread = CreateThread(InitialRIP, InitialStack, nullptr);
   *    - CTX->ExecuteThread(Thread);
   *  OS thread Creation:
   *    - Thread = CreateThread(0, 0, NewState);
   *    - Thread->ExecutionThread = FEXCore::Threads::Thread::Create(ThreadHandler, Arg);
   *    - ThreadHandler calls `CTX->ExecuteThread(Thread)`
   *  OS fork (New thread created with a clone of thread state):
   *    - clone{2, 3}
   *    - Thread = CreateThread(0, 0, CopyOfThreadState);
   *    - ExecuteThread(Thread); // Starts executing without creating another host thread
   *  Thunk callback executing guest code from native host thread
   *    - Thread = CreateThread(0, 0, NewState);
   *    - HandleCallback(Thread, RIP);
   */
  FEXCore::Core::InternalThreadState*
  CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState) override;

  /**
   * @brief Destroys this FEX thread object and stops tracking it internally
   *
   * @param Thread The internal FEX thread state object
   */
  void DestroyThread(FEXCore::Core::InternalThreadState* Thread) override;

  // Fork bracketing hooks; only declared for non-Windows builds.
#ifndef _WIN32
  void LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) override;
  void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child) override;
#endif
  void SetSignalDelegator(FEXCore::SignalDelegator* SignalDelegation) override;
  void SetSyscallHandler(FEXCore::HLE::SyscallHandler* Handler) override;
  void SetThunkHandler(FEXCore::ThunkHandler* Handler) override;

  FEXCore::CPUID::FunctionResults RunCPUIDFunction(uint32_t Function, uint32_t Leaf) override;
  FEXCore::CPUID::XCRResults RunXCRFunction(uint32_t Function) override;
  FEXCore::CPUID::FunctionResults
  RunCPUIDFunctionName(uint32_t Function, uint32_t Leaf, uint32_t CPU) override;

  // Accessor for the context-owned code cache object.
  CodeCache& GetCodeCache() override {
    return CodeCache;
  }

  // Takes ownership of the supplied code map writer, replacing any previous one.
  void SetCodeMapWriter(fextl::unique_ptr Writer) override {
    CodeMapWriter = std::move(Writer);
  }

  // Releases the owned code map writer, if there is one.
  void FlushAndCloseCodeMap() override {
    if (CodeMapWriter) {
      CodeMapWriter.reset();
    }
  }

  void OnCodeBufferAllocated(const std::shared_ptr&) override;
  void ClearCodeCache(FEXCore::Core::InternalThreadState* Thread, bool NewCodeBuffer = true) override;
  void InvalidateCodeBuffersCodeRange(uint64_t Start, uint64_t Length) override;
  void InvalidateThreadCachedCodeRange(FEXCore::Core::InternalThreadState* Thread, uint64_t Start, uint64_t Length) override;

  FEXCore::ForkableSharedMutex& GetCodeInvalidationMutex() override {
    return CodeInvalidationMutex;
  }

  void ConfigureAOTGen(FEXCore::Core::InternalThreadState* Thread, fextl::set* ExternalBranches, uint64_t SectionMaxAddress) override;

  bool IsAddressInCodeBuffer(FEXCore::Core::InternalThreadState* Thread, uintptr_t Address) const override;

  // returns false if a handler was already registered
  // NOTE(review): the declared return type is a std::optional, not bool — confirm what "false" maps to (presumably an empty optional).
  std::optional AddCustomIREntrypoint(uintptr_t Entrypoint, CustomIREntrypointHandler Handler, void* Creator = nullptr, void* Data = nullptr);

  void AddThunkTrampolineIRHandler(uintptr_t Entrypoint, uintptr_t GuestThunkEntrypoint) override;

  void AddForceTSOInformation(const IntervalList& ValidRanges, fextl::set&& Instructions) override;
  void RemoveForceTSOInformation(uint64_t Address, uint64_t Size) override;

  // Records that mono was detected; consumed by AreMonoHacksActive().
  void MarkMonoDetected() override {
    MonoDetected = true;
  }

  void MarkMonoBackpatcherBlock(uint64_t BlockEntry) override;

public:
  struct {
    uint64_t VirtualMemSize {1ULL << 36};
    uint64_t TSCScale = 0;

    // Used if the JIT needs to have its interrupt fault code emitted.
bool NeedsPendingInterruptFaultCheck {false};

    // Configuration options read by the context; each pairs a member name with its config key.
    FEX_CONFIG_OPT(Multiblock, MULTIBLOCK);
    FEX_CONFIG_OPT(SingleStepConfig, SINGLESTEP);
    FEX_CONFIG_OPT(GdbServer, GDBSERVER);
    FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE);
    FEX_CONFIG_OPT(TSOEnabled, TSOENABLED);
    FEX_CONFIG_OPT(VectorTSOEnabled, VECTORTSOENABLED);
    FEX_CONFIG_OPT(MemcpySetTSOEnabled, MEMCPYSETTSOENABLED);
    FEX_CONFIG_OPT(SMCChecks, SMCCHECKS);
    FEX_CONFIG_OPT(MaxInstPerBlock, MAXINST);
    FEX_CONFIG_OPT(RootFSPath, ROOTFS);
    FEX_CONFIG_OPT(GlobalJITNaming, GLOBALJITNAMING);
    FEX_CONFIG_OPT(LibraryJITNaming, LIBRARYJITNAMING);
    FEX_CONFIG_OPT(BlockJITNaming, BLOCKJITNAMING);
    FEX_CONFIG_OPT(GDBSymbols, GDBSYMBOLS);
    FEX_CONFIG_OPT(x87ReducedPrecision, X87REDUCEDPRECISION);
    FEX_CONFIG_OPT(DisableTelemetry, DISABLETELEMETRY);
    FEX_CONFIG_OPT(DisableVixlIndirectCalls, DISABLE_VIXL_INDIRECT_RUNTIME_CALLS);
    FEX_CONFIG_OPT(SmallTSCScale, SMALLTSCSCALE);
    FEX_CONFIG_OPT(StrictInProcessSplitLocks, STRICTINPROCESSSPLITLOCKS);
    FEX_CONFIG_OPT(MonoHacks, MONOHACKS);
  } Config;

  FEXCore::ForkableSharedMutex CodeInvalidationMutex;

  uint32_t StrictSplitLockMutex {};

  FEXCore::HostFeatures HostFeatures;
  // CPUID depends on HostFeatures so needs to be initialized after that.
  FEXCore::CPUIDEmu CPUID;
  FEXCore::HLE::SyscallHandler* SyscallHandler {};
  FEXCore::HLE::SourcecodeResolver* SourcecodeResolver {};
  FEXCore::ThunkHandler* ThunkHandler {};
  fextl::unique_ptr Dispatcher;

  CodeCache CodeCache;
  fextl::unique_ptr CodeMapWriter;

  SignalDelegator* SignalDelegation {};

  ContextImpl(const FEXCore::HostFeatures& Features);

  static void ThreadRemoveCodeEntryFromJit(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP);

  // This is used as a replacement for the SMC writes in the mono callsite backpatcher that avoids atomic operations
  // (safe as the invalidation mutex is locked) and manually invalidates the modified range. Allowing SMC to be detected
  // even if faulting is disabled.
  static void MonoBackpatcherWrite(FEXCore::Core::CpuStateFrame* Frame, uint8_t Size, uint64_t Address, uint64_t Value);

  void RemoveCustomIREntrypoint(FEXCore::Core::InternalThreadState* Thread, uintptr_t Entrypoint);

  // Result bundle returned by GenerateIR().
  struct GenerateIRResult {
    std::optional IRView;
    uint64_t TotalInstructions;
    uint64_t TotalInstructionsLength;
    uint64_t StartAddr;
    uint64_t Length;
    bool NeedsAddGuestCodeRanges;
  };
  [[nodiscard]]
  GenerateIRResult GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, bool ExtendedDebugInfo, uint64_t MaxInst);

  // Result bundle returned by CompileCode().
  struct CompileCodeResult {
    CPU::CPUBackend::CompiledCode CompiledCode;
    fextl::unique_ptr DebugData;
    uint64_t StartAddr;
    uint64_t Length;
    bool NeedsAddGuestCodeRanges;
  };
  [[nodiscard]]
  CompileCodeResult CompileCode(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst = 0);
  uintptr_t CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst = 0);
  uintptr_t CompileSingleStep(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP);

  FEXCore::JITSymbols Symbols;

  // Named pooled allocators used by the context.
  FEXCore::Utils::PooledAllocatorVirtual OpDispatcherAllocator {"FEXMem_OpDispatcher"};
  FEXCore::Utils::PooledAllocatorVirtual FrontendAllocator {"FEXMem_Frontend"};
  FEXCore::Utils::PooledAllocatorVirtualWithGuard CPUBackendAllocator {"FEXMem_CPUBackend"};

  // If Atomic-based TSO emulation is enabled or not.
  bool IsAtomicTSOEnabled() const {
    return AtomicTSOEmulationEnabled;
  }

  // If atomic-based TSO emulation is enabled for vector operations.
  bool IsVectorAtomicTSOEnabled() const {
    return VectorAtomicTSOEmulationEnabled;
  }

  // If atomic-based TSO emulation is enabled for memcpy operations.
  bool IsMemcpyAtomicTSOEnabled() const {
    return MemcpyAtomicTSOEmulationEnabled;
  }

  // Records whether the host provides hardware TSO and recomputes the atomic-emulation flags accordingly.
  void SetHardwareTSOSupport(bool HardwareTSOSupported) override {
    SupportsHardwareTSO = HardwareTSOSupported;
    UpdateAtomicTSOEmulationConfig();
  }

  void EnableExitOnHLT() override {
    ExitOnHLT = true;
  }

  bool ExitOnHLTEnabled() const {
    return ExitOnHLT;
  }

  // Mono hacks apply only when the MonoHacks config option is on and mono has been detected.
  bool AreMonoHacksActive() const {
    return Config.MonoHacks && MonoDetected;
  }

protected:
  // Recomputes the three atomic-TSO emulation flags from hardware support and the TSO config options.
  void UpdateAtomicTSOEmulationConfig() {
    if (SupportsHardwareTSO) {
      // If the hardware supports TSO then we don't need to emulate it through atomics.
      AtomicTSOEmulationEnabled = false;
      VectorAtomicTSOEmulationEnabled = false;
      MemcpyAtomicTSOEmulationEnabled = false;
    } else {
      // Vector and memcpy emulation are sub-options: they only take effect while TSOEnabled is set.
      AtomicTSOEmulationEnabled = Config.TSOEnabled;
      VectorAtomicTSOEmulationEnabled = Config.TSOEnabled && Config.VectorTSOEnabled;
      MemcpyAtomicTSOEmulationEnabled = Config.TSOEnabled && Config.MemcpySetTSOEnabled;
    }
  }

private:
  /**
   * @brief Initializes the JIT compilers for the thread
   *
   * @param Thread The internal FEX thread state object
   *
   * InitializeCompiler is called inside of CreateThread, so you likely don't need this
   */
  void InitializeCompiler(FEXCore::Core::InternalThreadState* Thread);

  bool SupportsHardwareTSO = false;
  bool AtomicTSOEmulationEnabled = true;
  bool VectorAtomicTSOEmulationEnabled = false;
  bool MemcpyAtomicTSOEmulationEnabled = false;

  bool ExitOnHLT = false;
  FEX_CONFIG_OPT(AppFilename, APP_FILENAME);

  std::shared_mutex CustomIRMutex;
  std::atomic HasCustomIRHandlers {};
  // Per-entrypoint record: the handler plus the opaque Creator/Data pointers it was registered with.
  struct CustomIRHandlerEntry final {
    CustomIREntrypointHandler Handler;
    void* Creator;
    void* Data;
  };
  fextl::unordered_map CustomIRHandlers;

  IntervalList ForceTSOValidRanges; // The ranges for which ForceTSOInstructions has populated data
  fextl::set ForceTSOInstructions;

  bool MonoDetected = false;
  std::atomic MonoBackpatcherBlock;

  std::mutex CodeBufferListLock;
  fextl::vector> CodeBufferList;
};
} // namespace FEXCore::Context
================================================
FILE:
FEXCore/Source/Interface/Core/Addressing.cpp ================================================ // SPDX-License-Identifier: MIT #include "Interface/Core/Addressing.h" #include "Interface/IR/IREmitter.h" #include "FEXCore/Utils/MathUtils.h" #include "Interface/IR/IR.h" namespace FEXCore::IR { Ref LoadEffectiveAddress(IREmitter* IREmit, const AddressMode& A, IR::OpSize GPRSize, bool AddSegmentBase, bool AllowUpperGarbage) { Ref Tmp = A.Base; if (A.Offset) { Tmp = Tmp ? IREmit->Add(GPRSize, Tmp, A.Offset) : IREmit->Constant(A.Offset); } if (A.Index) { if (A.IndexScale != 1) { uint32_t Log2 = FEXCore::ilog2(A.IndexScale); if (Tmp) { Tmp = IREmit->_AddShift(GPRSize, Tmp, A.Index, ShiftType::LSL, Log2); } else { Tmp = IREmit->_Lshl(GPRSize, A.Index, IREmit->Constant(Log2)); } } else { Tmp = Tmp ? IREmit->Add(GPRSize, Tmp, A.Index) : A.Index; } } // For 64-bit AddrSize can be 32-bit or 64-bit // For 32-bit AddrSize can be 32-bit or 16-bit // // If the AddrSize is not the GPRSize then we need to clear the upper bits. if ((A.AddrSize < GPRSize) && !AllowUpperGarbage && Tmp) { uint32_t Bits = IR::OpSizeAsBits(A.AddrSize); if (A.Base || A.Index) { Tmp = IREmit->_Bfe(GPRSize, Bits, 0, Tmp); } else if (A.Offset) { uint64_t X = A.Offset; X &= (1ull << Bits) - 1; Tmp = IREmit->Constant(X); } } if (A.Segment && AddSegmentBase) { Tmp = Tmp ? IREmit->Add(GPRSize, Tmp, A.Segment) : A.Segment; } return Tmp ?: IREmit->Constant(0); } AddressMode SelectAddressMode(IREmitter* IREmit, const AddressMode& A, IR::OpSize GPRSize, bool HostSupportsTSOImm9, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) { const auto Is32Bit = GPRSize == OpSize::i32Bit; const auto GPRSizeMatchesAddrSize = A.AddrSize == GPRSize; const auto OffsetIndexToLargeFor32Bit = Is32Bit && (A.Offset <= -16384 || A.Offset >= 16384); if (!GPRSizeMatchesAddrSize || OffsetIndexToLargeFor32Bit) { // If address size doesn't match GPR size then no optimizations can occur. 
return { .Base = LoadEffectiveAddress(IREmit, A, GPRSize, true), .Index = IREmit->Invalid(), }; } // Loadstore rules: // Non-TSO GPR: // * LDR/STR: [Reg] // * LDR/STR: [Reg + Reg, {Shift }] // * Can't use with 32-bit // * LDR/STR: [Reg + [0,4095] * ] // * Imm must be smaller than 16k with 32-bit // * LDUR/STUR: [Reg + [-256, 255]] // // TSO GPR: // * ARMv8.0: // LDAR/STLR: [Reg] // * FEAT_LRCPC: // LDAPR: [Reg] // * FEAT_LRCPC2: // LDAPUR/STLUR: [Reg + [-256, 255]] // // Non-TSO Vector: // * LDR/STR: [Reg + [0,4095] * ] // * LDUR/STUR: [Reg + [-256,255]] // // TSO Vector: // * ARMv8.0: // Just DMB + previous // * FEAT_LRCPC3 (Unsupported by FEXCore currently): // LDAPUR/STLUR: [Reg + [-256,255]] const auto AccessSizeAsImm = OpSizeToSize(AccessSize); const bool OffsetIsSIMM9 = A.Offset && A.Offset >= -256 && A.Offset <= 255; const bool OffsetIsUnsignedScaled = A.Offset > 0 && (A.Offset & (AccessSizeAsImm - 1)) == 0 && (A.Offset / AccessSizeAsImm) <= 4095; if ((AtomicTSO && !Vector && HostSupportsTSOImm9 && OffsetIsSIMM9) || (!AtomicTSO && (OffsetIsSIMM9 || OffsetIsUnsignedScaled))) { // Peel off the offset AddressMode B = A; B.Offset = 0; return { .Base = LoadEffectiveAddress(IREmit, B, GPRSize, true /* AddSegmentBase */, false), .Index = IREmit->Constant(A.Offset), .IndexType = MemOffsetType::SXTX, .IndexScale = 1, }; } if (AtomicTSO) { // TODO: LRCPC3 support for vector Imm9. 
} else if (!Is32Bit && A.Base && (A.Index || A.Segment) && !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSizeAsImm)) { AddressMode B = A; // ScaledRegisterLoadstore if (B.Index && B.Segment) { B.Base = IREmit->Add(GPRSize, B.Base, B.Segment); } else if (B.Segment) { B.Index = B.Segment; B.IndexScale = 1; } return B; } if (Vector || !AtomicTSO) { if ((A.Base || A.Segment) && A.Offset) { const bool Const_16K = A.Offset > -16384 && A.Offset < 16384 && GPRSizeMatchesAddrSize && Is32Bit; if (!Is32Bit || Const_16K) { // Peel off the offset AddressMode B = A; B.Offset = 0; return { .Base = LoadEffectiveAddress(IREmit, B, GPRSize, true /* AddSegmentBase */, false), .Index = IREmit->Constant(A.Offset), .IndexType = MemOffsetType::SXTX, .IndexScale = 1, }; } } } // Fallback on software address calculation return { .Base = LoadEffectiveAddress(IREmit, A, GPRSize, true), .Index = IREmit->Invalid(), }; } }; // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/Addressing.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/IR/IR.h" #include namespace FEXCore::IR { class IREmitter; struct AddressMode { Ref Segment {nullptr}; Ref Base {nullptr}; Ref Index {nullptr}; int64_t Offset = 0; MemOffsetType IndexType = MemOffsetType::SXTX; uint8_t IndexScale = 1; // Size in bytes for the address calculation. 8 for an arm64 hardware mode. 
IR::OpSize AddrSize;

  bool NonTSO;
};

// Emits IR computing Base + Offset + (Index scaled by IndexScale), truncates the result to A.AddrSize when that is
// smaller than GPRSize (unless AllowUpperGarbage), then adds A.Segment when AddSegmentBase is set.
// Yields a constant zero when no component is present.
Ref LoadEffectiveAddress(IREmitter* IREmit, const AddressMode& A, IR::OpSize GPRSize, bool AddSegmentBase, bool AllowUpperGarbage = false);

// Chooses between a base + constant index, a base + register index, or a fully computed base with an invalid index
// for the given access, based on the offset range, access size, and whether the access is atomic and/or vector.
AddressMode SelectAddressMode(IREmitter* IREmit, const AddressMode& A, IR::OpSize GPRSize, bool HostSupportsTSOImm9, bool AtomicTSO,
                              bool Vector, IR::OpSize AccessSize);

} // namespace FEXCore::IR
================================================
FILE: FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
================================================
// SPDX-License-Identifier: MIT
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
#include "Interface/Core/Dispatcher/Dispatcher.h"
#include "Interface/Context/Context.h"

// NOTE(review): the bare #include directives below have no target in this copy; restore the header names from upstream.
#include
#include
#include
#include
#include
#include
#include

#ifdef VIXL_DISASSEMBLER
#include
#include
#include
#include
#endif

#include
#include
#include

namespace FEXCore::CPU {

// LLVM's preserve_all doc, this is used throughout this file and reproduced
// here for reference:
//
//    the callee preserve all general purpose registers,
//    except X0-X8 and X16-X18. Furthermore it also preserves lower 128 bits of
//    V8-V31 SIMD - floating point registers.
//
// Note that the call necessarily also clobbers x30, the link register (LR)
// which is not considered general purpose.
//
// Meanwhile, for non-preserve_all, the AAPCS64 ABI says:
//
//    A subroutine invocation must preserve the contents of the registers
//    r19-r29 and SP.

namespace x64 {
  // Register tables used when ARCHITECTURE_arm64ec is not defined; the #else branch further down
  // provides the ARM64EC variants of the same tables.
#ifndef ARCHITECTURE_arm64ec
  // All but x19 and x29 are caller saved
  // Note that rax/rdx are rearranged here so we can coalesce cmpxchg.
  constexpr std::array SRA = {
    ARMEmitter::Reg::r4,
    ARMEmitter::Reg::r7,
    ARMEmitter::Reg::r5,
    ARMEmitter::Reg::r6,
    ARMEmitter::Reg::r8,
    ARMEmitter::Reg::r9,
    ARMEmitter::Reg::r10,
    ARMEmitter::Reg::r11,
    ARMEmitter::Reg::r12,
    ARMEmitter::Reg::r13,
    ARMEmitter::Reg::r14,
    ARMEmitter::Reg::r15,
    ARMEmitter::Reg::r16,
    ARMEmitter::Reg::r17,
    ARMEmitter::Reg::r19,
    ARMEmitter::Reg::r29,
    // PF/AF must be last.
REG_PF, REG_AF,
  };

  // I wish this could get constexpr generated from SRA's definition but impossible until libstdc++12, libc++15.
  // SRA GPRs that need to be spilled when calling a function with `preserve_all` ABI.
  constexpr std::array PreserveAll_SRA = {
    ARMEmitter::Reg::r4,  ARMEmitter::Reg::r5,  ARMEmitter::Reg::r6, ARMEmitter::Reg::r7,
    ARMEmitter::Reg::r8,  ARMEmitter::Reg::r16, ARMEmitter::Reg::r17,
  };

  constexpr std::array RA = {
    // All these callee saved
    ARMEmitter::Reg::r20, ARMEmitter::Reg::r21, ARMEmitter::Reg::r22, ARMEmitter::Reg::r23,
    ARMEmitter::Reg::r24, ARMEmitter::Reg::r30, ARMEmitter::Reg::r18,
  };

  constexpr unsigned RAPairs = 4;

  // Dynamic GPRs
  constexpr std::array PreserveAll_Dynamic = {
    ARMEmitter::Reg::r18,
    ARMEmitter::Reg::r30,
  };

  // In this configuration the same dynamic registers are listed for both call kinds.
  constexpr std::array NotPreserved_Dynamic = PreserveAll_Dynamic;

  // All are caller saved
  constexpr std::array SRAFPR = {
    ARMEmitter::VReg::v16, ARMEmitter::VReg::v17, ARMEmitter::VReg::v18, ARMEmitter::VReg::v19,
    ARMEmitter::VReg::v20, ARMEmitter::VReg::v21, ARMEmitter::VReg::v22, ARMEmitter::VReg::v23,
    ARMEmitter::VReg::v24, ARMEmitter::VReg::v25, ARMEmitter::VReg::v26, ARMEmitter::VReg::v27,
    ARMEmitter::VReg::v28, ARMEmitter::VReg::v29, ARMEmitter::VReg::v30, ARMEmitter::VReg::v31};

  // SRA FPRs that need to be spilled when calling a function with `preserve_all` ABI.
  constexpr std::array PreserveAll_SRAFPR = {
    // None.
  };

  // v8..v15 = (lower 64bits) Callee saved
  constexpr std::array RAFPR = {
    // v0 ~ v1 are used as temps.
    // ARMEmitter::VReg::v0, ARMEmitter::VReg::v1,
    ARMEmitter::VReg::v2,  ARMEmitter::VReg::v3,  ARMEmitter::VReg::v4,  ARMEmitter::VReg::v5,
    ARMEmitter::VReg::v6,  ARMEmitter::VReg::v7,  ARMEmitter::VReg::v8,  ARMEmitter::VReg::v9,
    ARMEmitter::VReg::v10, ARMEmitter::VReg::v11, ARMEmitter::VReg::v12, ARMEmitter::VReg::v13,
    ARMEmitter::VReg::v14, ARMEmitter::VReg::v15,
  };

  constexpr std::array PreserveAll_DynamicFPR = {
    ARMEmitter::VReg::v2, ARMEmitter::VReg::v3, ARMEmitter::VReg::v4,
    ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7,
  };
#else
  // ARCHITECTURE_arm64ec variants of the tables above.
  constexpr std::array SRA = {
    ARMEmitter::Reg::r8,
    ARMEmitter::Reg::r0,
    ARMEmitter::Reg::r1,
    ARMEmitter::Reg::r27,
    // SP's register location isn't specified by the ARM64EC ABI, we choose to use r23
    ARMEmitter::Reg::r23,
    ARMEmitter::Reg::r29,
    ARMEmitter::Reg::r25,
    ARMEmitter::Reg::r26,
    ARMEmitter::Reg::r2,
    ARMEmitter::Reg::r3,
    ARMEmitter::Reg::r4,
    ARMEmitter::Reg::r5,
    ARMEmitter::Reg::r19,
    ARMEmitter::Reg::r20,
    ARMEmitter::Reg::r21,
    ARMEmitter::Reg::r22,
    // PF/AF must be last.
    REG_PF,
    REG_AF,
  };

  constexpr std::array PreserveAll_SRA = {
    ARMEmitter::Reg::r0, ARMEmitter::Reg::r1, ARMEmitter::Reg::r2, ARMEmitter::Reg::r3,
    ARMEmitter::Reg::r4, ARMEmitter::Reg::r5, ARMEmitter::Reg::r8,
  };

  constexpr std::array RA = {
    ARMEmitter::Reg::r6,  ARMEmitter::Reg::r7,  ARMEmitter::Reg::r14,
    ARMEmitter::Reg::r15, ARMEmitter::Reg::r16, ARMEmitter::Reg::r30,
  };

  // NOTE(review): r17 appears in both dynamic lists below but not in RA above — confirm this is intentional.
  constexpr std::array PreserveAll_Dynamic = {ARMEmitter::Reg::r6, ARMEmitter::Reg::r7, ARMEmitter::Reg::r16, ARMEmitter::Reg::r17,
                                              ARMEmitter::Reg::r30};

  constexpr std::array NotPreserved_Dynamic = {ARMEmitter::Reg::r6,  ARMEmitter::Reg::r7,  ARMEmitter::Reg::r14, ARMEmitter::Reg::r15,
                                               ARMEmitter::Reg::r16, ARMEmitter::Reg::r17, ARMEmitter::Reg::r30};

  constexpr unsigned RAPairs = 4;

  constexpr std::array SRAFPR = {
    ARMEmitter::VReg::v0,  ARMEmitter::VReg::v1,  ARMEmitter::VReg::v2,  ARMEmitter::VReg::v3,
    ARMEmitter::VReg::v4,  ARMEmitter::VReg::v5,  ARMEmitter::VReg::v6,  ARMEmitter::VReg::v7,
    ARMEmitter::VReg::v8,  ARMEmitter::VReg::v9,  ARMEmitter::VReg::v10, ARMEmitter::VReg::v11,
    ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15,
  };

  constexpr std::array PreserveAll_SRAFPR = {
    ARMEmitter::VReg::v0, ARMEmitter::VReg::v1, ARMEmitter::VReg::v2, ARMEmitter::VReg::v3,
    ARMEmitter::VReg::v4, ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7,
  };

  constexpr std::array RAFPR = {
    ARMEmitter::VReg::v18, ARMEmitter::VReg::v19, ARMEmitter::VReg::v20, ARMEmitter::VReg::v21,
    ARMEmitter::VReg::v22, ARMEmitter::VReg::v23, ARMEmitter::VReg::v24, ARMEmitter::VReg::v25,
    ARMEmitter::VReg::v26, ARMEmitter::VReg::v27, ARMEmitter::VReg::v28, ARMEmitter::VReg::v29,
    ARMEmitter::VReg::v30, ARMEmitter::VReg::v31};

  constexpr std::array PreserveAll_DynamicFPR = {
    // None
  };
#endif

  constexpr uint32_t PreserveAll_SRAMask = {[]() -> uint32_t {
    uint32_t Mask {};
    for (auto Reg : PreserveAll_SRA) {
      switch (Reg.Idx()) {
      case 0:
      case 1:
      case 2:
      case 3:
      case 4:
      case 5:
      case 6:
      case 7:
      case 8:
case 16: case 17: Mask |= (1U << Reg.Idx()); break; default: break; } } return Mask; }()}; constexpr uint32_t PreserveAll_SRAFPRMask = {[]() -> uint32_t { uint32_t Mask {}; for (auto Reg : PreserveAll_SRAFPR) { Mask |= (1U << Reg.Idx()); } return Mask; }()}; // SRA FPRs that need to be spilled when the host supports SVE-256bit with `preserve_all` ABI. // This is /all/ of the SRA registers constexpr std::array PreserveAll_SRAFPRSVE = SRAFPR; constexpr uint32_t PreserveAll_SRAFPRSVEMask = {[]() -> uint32_t { uint32_t Mask {}; for (auto Reg : PreserveAll_SRAFPRSVE) { Mask |= (1U << Reg.Idx()); } return Mask; }()}; // Dynamic FPRs when the host supports SVE-256bit. constexpr std::array PreserveAll_DynamicFPRSVE = { // v0 ~ v1 are used as temps. ARMEmitter::VReg::v2, ARMEmitter::VReg::v3, ARMEmitter::VReg::v4, ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7, ARMEmitter::VReg::v8, ARMEmitter::VReg::v9, ARMEmitter::VReg::v10, ARMEmitter::VReg::v11, ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15, }; } // namespace x64 namespace x32 { // All but x19 and x29 are caller saved. eax/edx rearranged for cmpxchg. constexpr std::array SRA = { ARMEmitter::Reg::r4, ARMEmitter::Reg::r7, ARMEmitter::Reg::r5, ARMEmitter::Reg::r6, ARMEmitter::Reg::r8, ARMEmitter::Reg::r9, ARMEmitter::Reg::r10, ARMEmitter::Reg::r11, // PF/AF must be last. REG_PF, REG_AF, }; constexpr std::array RA = { // All these callee saved ARMEmitter::Reg::r20, ARMEmitter::Reg::r21, ARMEmitter::Reg::r22, ARMEmitter::Reg::r23, // Registers only available on 32-bit // All these are caller saved (except for r19). 
ARMEmitter::Reg::r12, ARMEmitter::Reg::r13, ARMEmitter::Reg::r14, ARMEmitter::Reg::r15, ARMEmitter::Reg::r16, ARMEmitter::Reg::r17, ARMEmitter::Reg::r29, ARMEmitter::Reg::r30, ARMEmitter::Reg::r24, ARMEmitter::Reg::r19, }; constexpr std::array NotPreserved_Dynamic = { ARMEmitter::Reg::r12, ARMEmitter::Reg::r13, ARMEmitter::Reg::r14, ARMEmitter::Reg::r15, ARMEmitter::Reg::r16, ARMEmitter::Reg::r17, ARMEmitter::Reg::r30, }; constexpr unsigned RAPairs = 10; // All are caller saved constexpr std::array SRAFPR = { ARMEmitter::VReg::v16, ARMEmitter::VReg::v17, ARMEmitter::VReg::v18, ARMEmitter::VReg::v19, ARMEmitter::VReg::v20, ARMEmitter::VReg::v21, ARMEmitter::VReg::v22, ARMEmitter::VReg::v23, }; // v8..v15 = (lower 64bits) Callee saved constexpr std::array RAFPR = { // v0 ~ v1 are used as temps. // ARMEmitter::VReg::v0, ARMEmitter::VReg::v1, ARMEmitter::VReg::v2, ARMEmitter::VReg::v3, ARMEmitter::VReg::v4, ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7, ARMEmitter::VReg::v8, ARMEmitter::VReg::v9, ARMEmitter::VReg::v10, ARMEmitter::VReg::v11, ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15, ARMEmitter::VReg::v24, ARMEmitter::VReg::v25, ARMEmitter::VReg::v26, ARMEmitter::VReg::v27, ARMEmitter::VReg::v28, ARMEmitter::VReg::v29, ARMEmitter::VReg::v30, ARMEmitter::VReg::v31}; // I wish this could get constexpr generated from SRA's definition but impossible until libstdc++12, libc++15. // SRA GPRs that need to be spilled when calling a function with `preserve_all` ABI. 
constexpr std::array PreserveAll_SRA = { ARMEmitter::Reg::r4, ARMEmitter::Reg::r5, ARMEmitter::Reg::r6, ARMEmitter::Reg::r7, ARMEmitter::Reg::r8, }; constexpr uint32_t PreserveAll_SRAMask = {[]() -> uint32_t { uint32_t Mask {}; for (auto Reg : PreserveAll_SRA) { switch (Reg.Idx()) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 16: case 17: Mask |= (1U << Reg.Idx()); break; default: break; } } return Mask; }()}; // Dynamic GPRs constexpr std::array PreserveAll_Dynamic = {ARMEmitter::Reg::r16, ARMEmitter::Reg::r17, ARMEmitter::Reg::r30}; // SRA FPRs that need to be spilled when calling a function with `preserve_all` ABI. constexpr uint32_t PreserveAll_SRAFPRMask = 0; // Dynamic FPRs // - v0-v7 constexpr std::array PreserveAll_DynamicFPR = { // v0 ~ v1 are temps ARMEmitter::VReg::v2, ARMEmitter::VReg::v3, ARMEmitter::VReg::v4, ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7, }; // SRA FPRs that need to be spilled when the host supports SVE-256bit with `preserve_all` ABI. // This is /all/ of the SRA registers constexpr std::array PreserveAll_SRAFPRSVE = SRAFPR; constexpr uint32_t PreserveAll_SRAFPRSVEMask = {[]() -> uint32_t { uint32_t Mask {}; for (auto Reg : PreserveAll_SRAFPRSVE) { Mask |= (1U << Reg.Idx()); } return Mask; }()}; // Dynamic FPRs when the host supports SVE-256bit. constexpr std::array PreserveAll_DynamicFPRSVE = { // v0 ~ v1 are used as temps. ARMEmitter::VReg::v2, ARMEmitter::VReg::v3, ARMEmitter::VReg::v4, ARMEmitter::VReg::v5, ARMEmitter::VReg::v6, ARMEmitter::VReg::v7, ARMEmitter::VReg::v8, ARMEmitter::VReg::v9, ARMEmitter::VReg::v10, ARMEmitter::VReg::v11, ARMEmitter::VReg::v12, ARMEmitter::VReg::v13, ARMEmitter::VReg::v14, ARMEmitter::VReg::v15, ARMEmitter::VReg::v24, ARMEmitter::VReg::v25, ARMEmitter::VReg::v26, ARMEmitter::VReg::v27, ARMEmitter::VReg::v28, ARMEmitter::VReg::v29, ARMEmitter::VReg::v30, ARMEmitter::VReg::v31}; } // namespace x32 // We want vixl to not allocate a default buffer. 
Jit and dispatcher will manually create one. Arm64Emitter::Arm64Emitter(FEXCore::Context::ContextImpl* ctx, void* EmissionPtr, size_t size) : Emitter(static_cast(EmissionPtr), size) , EmitterCTX {ctx} #ifdef VIXL_SIMULATOR , Simulator {&SimDecoder, stdout, vixl::aarch64::SimStack(SimulatorStackSize).Allocate()} #endif { #ifdef VIXL_SIMULATOR FEX_CONFIG_OPT(ForceSVEWidth, FORCESVEWIDTH); // Hardcode a 256-bit vector width if we are running in the simulator. // Allow the user to override this. Simulator.SetVectorLengthInBits(ForceSVEWidth() ? ForceSVEWidth() : 256); // FEX doesn't support GCS. Simulator.DisableGCSCheck(); #endif #ifdef VIXL_DISASSEMBLER // Only setup the disassembler if enabled. // vixl's decoder is expensive to setup. if (Disassemble()) { DisasmBuffer.resize(DISASM_BUFFER_SIZE); Disasm = fextl::make_unique(DisasmBuffer.data(), DISASM_BUFFER_SIZE); DisasmDecoder = fextl::make_unique(); DisasmDecoder->AppendVisitor(Disasm.get()); } #endif // Number of register available is dependent on what operating mode the proccess is in. if (EmitterCTX->Config.Is64BitMode()) { StaticRegisters = x64::SRA; GeneralRegisters = x64::RA; GeneralRegistersNotPreserved = x64::NotPreserved_Dynamic; StaticFPRegisters = x64::SRAFPR; GeneralFPRegisters = x64::RAFPR; PairRegisters = x64::RAPairs; } else { PairRegisters = x32::RAPairs; StaticRegisters = x32::SRA; GeneralRegisters = x32::RA; GeneralRegistersNotPreserved = x32::NotPreserved_Dynamic; StaticFPRegisters = x32::SRAFPR; GeneralFPRegisters = x32::RAFPR; } } FEXCore::X86State::X86Reg Arm64Emitter::GetX86RegRelationToARMReg(ARMEmitter::Register Reg) { for (size_t i = 0; i < StaticRegisters.size(); ++i) { const auto& RegI = StaticRegisters[i]; if (RegI == Reg) { // X86 Registers are mapped linerally from the StaticRegisters span. // Directly correlating Enum index to span index. return static_cast(FEXCore::ToUnderlying(FEXCore::X86State::X86Reg::REG_RAX) + i); } } // Unmapped register. 
return FEXCore::X86State::X86Reg::REG_INVALID; } void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, PadType Pad, int MaxBytes) { bool NOPPad = false; if (Pad == PadType::DOPAD) { NOPPad = true; } else if (Pad == PadType::NOPAD) { NOPPad = false; } else if (Pad == PadType::AUTOPAD) { // Force NOP padding to ensure relocated constants always have enough encoding space available NOPPad = EnableCodeCaching; } bool Is64Bit = s == ARMEmitter::Size::i64Bit; const auto UpperBound = Is64Bit ? 4 : 2; int Segments = MaxBytes ? (MaxBytes / 2) : UpperBound; LOGMAN_THROW_A_FMT(MaxBytes >= 0 && MaxBytes <= (UpperBound * 2) && (MaxBytes & 1) == 0, "MaxBytes must be bounded in the range of [0, {}] and 16-bit aligned", UpperBound); // If MaxBytes specified then make sure to sanity check incoming data. LOGMAN_THROW_A_FMT(MaxBytes == 0 || (Constant >> (MaxBytes * 8)) == 0, "MaxBytes provided but data can't fit within provided range."); if (Is64Bit && ((~Constant) >> 16) == 0) { if (NOPPad) { nop(); nop(); nop(); } movn(s, Reg, (~Constant) & 0xFFFF); return; } if ((Constant >> 32) == 0 && !NOPPad) { // If the upper 32-bits is all zero, we can now switch to a 32-bit move. // NOTE: The NOP padding code does not appropriately adjust to this yet, // so we skip this optimization in that case s = ARMEmitter::Size::i32Bit; Is64Bit = false; Segments = std::min(Segments, 2); } if (!Is64Bit && ((~Constant) & 0xFFFF0000) == 0) { if (NOPPad) { nop(); nop(); nop(); } movn(s, Reg.W(), (~Constant) & 0xFFFF); return; } int RequiredMoveSegments {}; // Count the number of move segments // We only want to use ADRP+ADD if we have more than 1 segment for (size_t i = 0; i < Segments; ++i) { uint16_t Part = (Constant >> (i * 16)) & 0xFFFF; if (Part != 0) { ++RequiredMoveSegments; } } // If this can be loaded with a mov bitmask. if (RequiredMoveSegments > 1) { // Only try to use this path if the number of segments is > 1. 
// `movz` is better than `orr` since hardware will rename or merge if possible when `movz` is used. const auto IsImm = ARMEmitter::Emitter::IsImmLogical(Constant, RegSizeInBits(s)); if (IsImm) { if (NOPPad) { nop(); nop(); nop(); } orr(s, Reg, ARMEmitter::Reg::zr, Constant); return; } } // If we can't handle negatives with the orr, try with movn+movk if (Is64Bit && ((~Constant) >> 32) == 0) { if (NOPPad) { nop(); nop(); } movn(s, Reg, (~Constant) & 0xFFFF); movk(s, Reg, (Constant >> 16) & 0xFFFF, 16); return; } // ADRP+ADD is specifically optimized in hardware // Check if we can use this auto PC = GetCursorAddress(); // PC aligned to page uint64_t AlignedPC = PC & ~0xFFFULL; // Offset from aligned PC auto AlignedOffset = std::bit_cast(Constant - AlignedPC); int NumMoves = 0; // If the aligned offset is within the 4GB window then we can use ADRP+ADD // and the number of move segments more than 1 // NOTE: JIT output is moved to a different buffer after compilation, so the // current cursor address doesn't match the runtime instruction address. // Hence this optimization is disabled until we enable code relocation patches. 
if (RequiredMoveSegments > 1 && ARMEmitter::Emitter::IsInt32(AlignedOffset) && false) { // If this is 4k page aligned then we only need ADRP if ((AlignedOffset & 0xFFF) == 0) { adrp(Reg, AlignedOffset >> 12); } else { // If the constant is within 1MB of PC then we can still use ADR to load in a single instruction // 21-bit signed integer here auto SmallOffset = std::bit_cast(Constant - PC); if (ARMEmitter::Emitter::IsInt21(SmallOffset)) { adr(Reg, SmallOffset); } else { // Need to use ADRP + ADD adrp(Reg, AlignedOffset >> 12); add(s, Reg, Reg, Constant & 0xFFF); NumMoves = 2; } } } else { int CurrentSegment = 0; for (; CurrentSegment < Segments; ++CurrentSegment) { uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF; if (Part) { movz(s, Reg, Part, CurrentSegment * 16); ++CurrentSegment; ++NumMoves; break; } } for (; CurrentSegment < Segments; ++CurrentSegment) { uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF; if (Part) { movk(s, Reg, Part, CurrentSegment * 16); ++NumMoves; } } if (NumMoves == 0) { // If we didn't move anything that means this is a zero move. Special case this. 
movz(s, Reg, 0); ++NumMoves; } } if (NOPPad) { for (int i = NumMoves; i < Segments; ++i) { nop(); } } } void Arm64Emitter::PushCalleeSavedRegisters() { // We need to save pairs of registers // We save r19-r30 constexpr static std::array, 6> CalleeSaved = {{ {ARMEmitter::XReg::x19, ARMEmitter::XReg::x20}, {ARMEmitter::XReg::x21, ARMEmitter::XReg::x22}, {ARMEmitter::XReg::x23, ARMEmitter::XReg::x24}, {ARMEmitter::XReg::x25, ARMEmitter::XReg::x26}, {ARMEmitter::XReg::x27, ARMEmitter::XReg::x28}, {ARMEmitter::XReg::x29, ARMEmitter::XReg::x30}, }}; for (auto& RegPair : CalleeSaved) { stp(RegPair.first, RegPair.second, ARMEmitter::Reg::rsp, -16); } // Additionally we need to store the lower 64bits of v8-v15 // Here's a fun thing, we can use two ST4 instructions to store everything // We just need a single sub to sp before that constexpr static std::array< std::tuple, 2> FPRs = {{ {ARMEmitter::DReg::d8, ARMEmitter::DReg::d9, ARMEmitter::DReg::d10, ARMEmitter::DReg::d11}, {ARMEmitter::DReg::d12, ARMEmitter::DReg::d13, ARMEmitter::DReg::d14, ARMEmitter::DReg::d15}, }}; uint32_t VectorSaveSize = sizeof(uint64_t) * 8; sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, VectorSaveSize); // SP supporting move // We just saved x19 so it is safe add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r19, ARMEmitter::Reg::rsp, 0); for (auto& RegQuad : FPRs) { st4(ARMEmitter::SubRegSize::i64Bit, std::get<0>(RegQuad), std::get<1>(RegQuad), std::get<2>(RegQuad), std::get<3>(RegQuad), 0, ARMEmitter::Reg::r19, 32); } } void Arm64Emitter::PopCalleeSavedRegisters() { constexpr static std::array< std::tuple, 2> FPRs = {{ {ARMEmitter::DReg::d8, ARMEmitter::DReg::d9, ARMEmitter::DReg::d10, ARMEmitter::DReg::d11}, {ARMEmitter::DReg::d12, ARMEmitter::DReg::d13, ARMEmitter::DReg::d14, ARMEmitter::DReg::d15}, }}; for (auto& RegQuad : FPRs) { ld4(ARMEmitter::SubRegSize::i64Bit, std::get<0>(RegQuad), std::get<1>(RegQuad), std::get<2>(RegQuad), std::get<3>(RegQuad), 0, 
ARMEmitter::Reg::rsp, 32); } constexpr static std::array, 6> CalleeSaved = {{ {ARMEmitter::XReg::x29, ARMEmitter::XReg::x30}, {ARMEmitter::XReg::x27, ARMEmitter::XReg::x28}, {ARMEmitter::XReg::x25, ARMEmitter::XReg::x26}, {ARMEmitter::XReg::x23, ARMEmitter::XReg::x24}, {ARMEmitter::XReg::x21, ARMEmitter::XReg::x22}, {ARMEmitter::XReg::x19, ARMEmitter::XReg::x20}, }}; for (auto& RegPair : CalleeSaved) { ldp(RegPair.first, RegPair.second, ARMEmitter::Reg::rsp, 16); } } void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs) { #ifndef VIXL_SIMULATOR if (EmitterCTX->HostFeatures.SupportsAFP) { // Enable AFP features when filling JIT state. mrs(TmpReg, ARMEmitter::SystemRegister::FPCR); // Enable FPCR.NEP and FPCR.AH // NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination. // AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax. // // Additional interesting AFP bits: // FIZ(0): Flush Inputs to Zero orr(ARMEmitter::Size::i64Bit, TmpReg, TmpReg, (1U << 2) | // NEP (1U << 1)); // AH if (SetFIZ) { // Insert MXCSR.DAZ in to FIZ ldr(TmpReg2.W(), STATE.R(), offsetof(FEXCore::Core::CPUState, mxcsr)); bfxil(ARMEmitter::Size::i64Bit, TmpReg, TmpReg2, 6, 1); } msr(ARMEmitter::SystemRegister::FPCR, TmpReg); } #endif if (SetPredRegs && (EmitterCTX->HostFeatures.SupportsSVE256 || EmitterCTX->HostFeatures.SupportsSVE128)) { // Set up predicate registers. // We don't bother spilling these in SpillStaticRegs, // since all that matters is we restore them on a fill. // It's not a concern if they get trounced by something else. 
if (EmitterCTX->HostFeatures.SupportsSVE256) { ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_32B, ARMEmitter::PredicatePattern::SVE_VL32); } if (EmitterCTX->HostFeatures.SupportsSVE128) { ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16); } // Fill in the predicate register for the x87 ldst SVE optimization. ptrue(ARMEmitter::SubRegSize::i16Bit, PRED_X87_SVEOPT, ARMEmitter::PredicatePattern::SVE_VL5); } } void Arm64Emitter::SpillStaticRegs(ARMEmitter::Register TmpReg, SpillStaticRegOptions Options) { #ifndef VIXL_SIMULATOR if (EmitterCTX->HostFeatures.SupportsAFP) { // Disable AFP features when spilling registers. // // Disable FPCR.NEP and FPCR.AH and FPCR.FIZ // NEP(2): Changes ASIMD scalar instructions to insert in to the lower bits of the destination. // AH(1): Changes NaN behaviour in some instructions. Specifically fmin, fmax. // Also interacts with RPRES to change reciprocal/rsqrt precision from 8-bit mantissa to 12-bit. // // Additional interesting AFP bits: // FIZ(0): Flush Inputs to Zero mrs(TmpReg, ARMEmitter::SystemRegister::FPCR); bic(ARMEmitter::Size::i64Bit, TmpReg, TmpReg, (1U << 2) | // NEP (1U << 1) | // AH (1U << 0)); // FIZ msr(ARMEmitter::SystemRegister::FPCR, TmpReg); } #endif if (Options.NZCV) { // Regardless of what GPRs/FPRs we're spilling, we need to spill NZCV since it // is always static and almost certainly clobbered by the subsequent code. // // TODO: Can we prove that NZCV is not used across a call in some cases and // omit this? Might help x87 perf? Future idea. 
mrs(TmpReg, ARMEmitter::SystemRegister::NZCV); str(TmpReg.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); } // PF/AF are special, remove them from the mask uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx()))); unsigned PFAFSpillMask = Options.GPRSpillMask & PFAFMask; Options.GPRSpillMask &= ~PFAFSpillMask; str(REG_CALLRET_SP, STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.callret_sp)); for (size_t i = 0; i < StaticRegisters.size(); i += 2) { auto Reg1 = StaticRegisters[i]; auto Reg2 = StaticRegisters[i + 1]; if (((1U << Reg1.Idx()) & Options.GPRSpillMask) && ((1U << Reg2.Idx()) & Options.GPRSpillMask)) { stp(Reg1.X(), Reg2.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i)); } else if (((1U << Reg1.Idx()) & Options.GPRSpillMask)) { str(Reg1.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i)); } else if (((1U << Reg2.Idx()) & Options.GPRSpillMask)) { str(Reg2.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i + 1)); } } // Now handle PF/AF if (Options.NZCV && PFAFSpillMask) { auto PFOffset = offsetof(FEXCore::Core::CpuStateFrame, State.pf_raw); auto AFOffset = offsetof(FEXCore::Core::CpuStateFrame, State.af_raw); LOGMAN_THROW_A_FMT(PFAFSpillMask == PFAFMask, "PF/AF not spilled together"); LOGMAN_THROW_A_FMT(AFOffset == PFOffset + 4, "PF/AF are together"); stp(REG_PF.W(), REG_AF.W(), STATE.R(), PFOffset); } if (Options.FPRs) { if (EmitterCTX->HostFeatures.SupportsAVX && EmitterCTX->HostFeatures.SupportsSVE256) { for (size_t i = 0; i < StaticFPRegisters.size(); i++) { const auto Reg = StaticFPRegisters[i]; if (((1U << Reg.Idx()) & Options.FPRSpillMask) != 0) { mov(ARMEmitter::Size::i64Bit, TmpReg, ARRAY_OFFSETOF(Core::CpuStateFrame, State.xmm.avx.data, i)); st1b(Reg.Z(), PRED_TMP_32B, STATE.R(), TmpReg); } } } else { if (Options.GPRSpillMask && Options.FPRSpillMask == ~0U) { // Optimize the common case where we can spill four registers per 
instruction // Load the sse offset in to the temporary register add(ARMEmitter::Size::i64Bit, TmpReg, STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data)); for (size_t i = 0; i < StaticFPRegisters.size(); i += 4) { const auto Reg1 = StaticFPRegisters[i]; const auto Reg2 = StaticFPRegisters[i + 1]; const auto Reg3 = StaticFPRegisters[i + 2]; const auto Reg4 = StaticFPRegisters[i + 3]; st1(Reg1.Q(), Reg2.Q(), Reg3.Q(), Reg4.Q(), TmpReg, 64); } } else { for (size_t i = 0; i < StaticFPRegisters.size(); i += 2) { const auto Reg1 = StaticFPRegisters[i]; const auto Reg2 = StaticFPRegisters[i + 1]; if (((1U << Reg1.Idx()) & Options.FPRSpillMask) && ((1U << Reg2.Idx()) & Options.FPRSpillMask)) { stp(Reg1.Q(), Reg2.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i)); } else if (((1U << Reg1.Idx()) & Options.FPRSpillMask)) { str(Reg1.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i)); } else if (((1U << Reg2.Idx()) & Options.FPRSpillMask)) { str(Reg2.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i + 1)); } } } } } } void Arm64Emitter::FillStaticRegs(FillStaticRegOptions Options) { auto FindTempReg = [this](uint32_t* GPRFillMask) -> std::optional { for (auto Reg : StaticRegisters) { if (((1U << Reg.Idx()) & *GPRFillMask)) { *GPRFillMask &= ~(1U << Reg.Idx()); return std::make_optional(Reg); } } return std::nullopt; }; LOGMAN_THROW_A_FMT(Options.GPRFillMask != 0, "Must fill at least 2 GPRs for a temp"); uint32_t TempGPRFillMask = Options.GPRFillMask; if (!Options.OptionalReg.has_value()) { Options.OptionalReg = FindTempReg(&TempGPRFillMask); } if (!Options.OptionalReg2.has_value()) { Options.OptionalReg2 = FindTempReg(&TempGPRFillMask); } LOGMAN_THROW_A_FMT(Options.OptionalReg.has_value() && Options.OptionalReg2.has_value(), "Didn't have an SRA register to use as a " "temporary while " "spilling!"); auto TmpReg = *Options.OptionalReg; auto TmpReg2 = 
*Options.OptionalReg2; #ifdef ARCHITECTURE_arm64ec // Load STATE in from the CPU area as x28 is not callee saved in the ARM64EC ABI. ldr(TmpReg.X(), ARMEmitter::Reg::r18, TEB_CPU_AREA_OFFSET); ldr(STATE, TmpReg, CPU_AREA_EMULATOR_DATA_OFFSET); #endif ldr(REG_CALLRET_SP, STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.callret_sp)); if (Options.NZCV) { // Regardless of what GPRs/FPRs we're filling, we need to fill NZCV since it // is always static and was almost certainly clobbered. // // TODO: Can we prove that NZCV is not used across a call in some cases and // omit this? Might help x87 perf? Future idea. ldr(TmpReg.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TmpReg); } FillSpecialRegs(TmpReg, TmpReg2, true, Options.FPRs); if (Options.FPRs) { if (EmitterCTX->HostFeatures.SupportsAVX && EmitterCTX->HostFeatures.SupportsSVE256) { for (size_t i = 0; i < StaticFPRegisters.size(); i++) { const auto Reg = StaticFPRegisters[i]; if (((1U << Reg.Idx()) & Options.FPRFillMask) != 0) { mov(ARMEmitter::Size::i64Bit, TmpReg, ARRAY_OFFSETOF(Core::CpuStateFrame, State.xmm.avx.data, i)); ld1b(Reg.Z(), PRED_TMP_32B.Zeroing(), STATE.R(), TmpReg); } } } else { if (Options.GPRFillMask && Options.FPRFillMask == ~0U) { // Optimize the common case where we can fill four registers per instruction. // Use one of the filling static registers before we fill it. 
// Load the sse offset in to the temporary register add(ARMEmitter::Size::i64Bit, TmpReg, STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data)); for (size_t i = 0; i < StaticFPRegisters.size(); i += 4) { const auto Reg1 = StaticFPRegisters[i]; const auto Reg2 = StaticFPRegisters[i + 1]; const auto Reg3 = StaticFPRegisters[i + 2]; const auto Reg4 = StaticFPRegisters[i + 3]; ld1(Reg1.Q(), Reg2.Q(), Reg3.Q(), Reg4.Q(), TmpReg, 64); } } else { for (size_t i = 0; i < StaticFPRegisters.size(); i += 2) { const auto Reg1 = StaticFPRegisters[i]; const auto Reg2 = StaticFPRegisters[i + 1]; if (((1U << Reg1.Idx()) & Options.FPRFillMask) && ((1U << Reg2.Idx()) & Options.FPRFillMask)) { ldp(Reg1.Q(), Reg2.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i)); } else if (((1U << Reg1.Idx()) & Options.FPRFillMask)) { ldr(Reg1.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i)); } else if (((1U << Reg2.Idx()) & Options.FPRFillMask)) { ldr(Reg2.Q(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.xmm.sse.data, i + 1)); } } } } } // PF/AF are special, remove them from the mask uint32_t PFAFMask = ((1u << REG_PF.Idx()) | ((1u << REG_AF.Idx()))); uint32_t PFAFFillMask = Options.GPRFillMask & PFAFMask; Options.GPRFillMask &= ~PFAFMask; for (size_t i = 0; i < StaticRegisters.size(); i += 2) { auto Reg1 = StaticRegisters[i]; auto Reg2 = StaticRegisters[i + 1]; if (((1U << Reg1.Idx()) & Options.GPRFillMask) && ((1U << Reg2.Idx()) & Options.GPRFillMask)) { ldp(Reg1.X(), Reg2.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i)); } else if ((1U << Reg1.Idx()) & Options.GPRFillMask) { ldr(Reg1.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i)); } else if ((1U << Reg2.Idx()) & Options.GPRFillMask) { ldr(Reg2.X(), STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, State.gregs, i + 1)); } } // Now handle PF/AF if (Options.NZCV && PFAFFillMask) { 
LOGMAN_THROW_A_FMT(PFAFFillMask == PFAFMask, "PF/AF not filled together"); ldp(REG_PF.W(), REG_AF.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.pf_raw)); } } void Arm64Emitter::PushVectorRegisters(ARMEmitter::Register TmpReg, bool SVE256Regs, std::span VRegs) { if (SVE256Regs) { size_t i = 0; for (; i < (VRegs.size() % 4); i += 2) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; st2b(Reg1.Z(), Reg2.Z(), PRED_TMP_32B, TmpReg, 0); add(ARMEmitter::Size::i64Bit, TmpReg, TmpReg, 32 * 2); } for (; i < VRegs.size(); i += 4) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; const auto Reg3 = VRegs[i + 2]; const auto Reg4 = VRegs[i + 3]; st4b(Reg1.Z(), Reg2.Z(), Reg3.Z(), Reg4.Z(), PRED_TMP_32B, TmpReg, 0); add(ARMEmitter::Size::i64Bit, TmpReg, TmpReg, 32 * 4); } } else { size_t i = 0; for (; i < (VRegs.size() % 4); i += 2) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; st1(Reg1.Q(), Reg2.Q(), TmpReg, 32); } for (; i < VRegs.size(); i += 4) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; const auto Reg3 = VRegs[i + 2]; const auto Reg4 = VRegs[i + 3]; st1(Reg1.Q(), Reg2.Q(), Reg3.Q(), Reg4.Q(), TmpReg, 64); } } } void Arm64Emitter::PushGeneralRegisters(ARMEmitter::Register TmpReg, std::span Regs) { size_t i = 0; for (; i < (Regs.size() % 2); ++i) { const auto Reg1 = Regs[i]; str(Reg1.X(), TmpReg, 16); } for (; i < Regs.size(); i += 2) { const auto Reg1 = Regs[i]; const auto Reg2 = Regs[i + 1]; stp(Reg1.X(), Reg2.X(), TmpReg, 16); } } void Arm64Emitter::PopVectorRegisters(bool SVE256Regs, std::span VRegs) { if (SVE256Regs) { size_t i = 0; for (; i < (VRegs.size() % 4); i += 2) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; ld2b(Reg1.Z(), Reg2.Z(), PRED_TMP_32B.Zeroing(), ARMEmitter::Reg::rsp); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, 32 * 2); } for (; i < VRegs.size(); i += 4) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; const auto Reg3 = 
VRegs[i + 2]; const auto Reg4 = VRegs[i + 3]; ld4b(Reg1.Z(), Reg2.Z(), Reg3.Z(), Reg4.Z(), PRED_TMP_32B.Zeroing(), ARMEmitter::Reg::rsp); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, 32 * 4); } } else { size_t i = 0; for (; i < (VRegs.size() % 4); i += 2) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; ld1(Reg1.Q(), Reg2.Q(), ARMEmitter::Reg::rsp, 32); } for (; i < VRegs.size(); i += 4) { const auto Reg1 = VRegs[i]; const auto Reg2 = VRegs[i + 1]; const auto Reg3 = VRegs[i + 2]; const auto Reg4 = VRegs[i + 3]; ld1(Reg1.Q(), Reg2.Q(), Reg3.Q(), Reg4.Q(), ARMEmitter::Reg::rsp, 64); } } } void Arm64Emitter::PopGeneralRegisters(std::span Regs) { size_t i = 0; for (; i < (Regs.size() % 2); ++i) { const auto Reg1 = Regs[i]; ldr(Reg1.X(), ARMEmitter::Reg::rsp, 16); } for (; i < Regs.size(); i += 2) { const auto Reg1 = Regs[i]; const auto Reg2 = Regs[i + 1]; ldp(Reg1.X(), Reg2.X(), ARMEmitter::Reg::rsp, 16); } } size_t Arm64Emitter::PushDynamicRegs(ARMEmitter::Register TmpReg) { const auto CanUseSVE256 = EmitterCTX->HostFeatures.SupportsSVE256; const auto GPRSize = GeneralRegistersNotPreserved.size() * Core::CPUState::GPR_REG_SIZE; const auto FPRRegSize = CanUseSVE256 ? 32 : 16; const auto FPRSize = GeneralFPRegisters.size() * FPRRegSize; const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16); sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, SPOffset); // rsp capable move add(ARMEmitter::Size::i64Bit, TmpReg, ARMEmitter::Reg::rsp, 0); LOGMAN_THROW_A_FMT(GeneralFPRegisters.size() % 2 == 0, "Needs to have multiple of 2 FPRs for RA"); // Push the vector registers PushVectorRegisters(TmpReg, CanUseSVE256, GeneralFPRegisters); // Push the general registers. 
PushGeneralRegisters(TmpReg, GeneralRegistersNotPreserved); return SPOffset; } void Arm64Emitter::PopDynamicRegs() { const auto CanUseSVE256 = EmitterCTX->HostFeatures.SupportsSVE256; // Pop vectors first PopVectorRegisters(CanUseSVE256, GeneralFPRegisters); // Pop GPRs second PopGeneralRegisters(GeneralRegistersNotPreserved); } size_t Arm64Emitter::SpillForPreserveAllABICall(ARMEmitter::Register TmpReg, bool FPRs) { const auto CanUseSVE256 = EmitterCTX->HostFeatures.SupportsSVE256; const auto FPRRegSize = CanUseSVE256 ? 32 : 16; std::span DynamicGPRs {}; std::span DynamicFPRs {}; uint32_t PreserveSRAMask {}; uint32_t PreserveSRAFPRMask {}; if (EmitterCTX->Config.Is64BitMode()) { DynamicGPRs = x64::PreserveAll_Dynamic; DynamicFPRs = x64::PreserveAll_DynamicFPR; PreserveSRAMask = x64::PreserveAll_SRAMask; PreserveSRAFPRMask = x64::PreserveAll_SRAFPRMask; if (CanUseSVE256) { DynamicFPRs = x64::PreserveAll_DynamicFPRSVE; PreserveSRAFPRMask = x64::PreserveAll_SRAFPRSVEMask; } } else { DynamicGPRs = x32::PreserveAll_Dynamic; DynamicFPRs = x32::PreserveAll_DynamicFPR; PreserveSRAMask = x32::PreserveAll_SRAMask; PreserveSRAFPRMask = x32::PreserveAll_SRAFPRMask; if (CanUseSVE256) { DynamicFPRs = x32::PreserveAll_DynamicFPRSVE; PreserveSRAFPRMask = x32::PreserveAll_SRAFPRSVEMask; } } const auto GPRSize = AlignUp(DynamicGPRs.size(), 2) * Core::CPUState::GPR_REG_SIZE; const auto FPRSize = DynamicFPRs.size() * FPRRegSize; const uint64_t SPOffset = AlignUp(GPRSize + FPRSize, 16); // Spill the static registers. SpillStaticRegs(TmpReg, { .GPRSpillMask = PreserveSRAMask, .FPRSpillMask = PreserveSRAFPRMask, }); sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, SPOffset); // rsp capable move add(ARMEmitter::Size::i64Bit, TmpReg, ARMEmitter::Reg::rsp, 0); // Push the vector registers. PushVectorRegisters(TmpReg, CanUseSVE256, DynamicFPRs); // Push the general registers. 
PushGeneralRegisters(TmpReg, DynamicGPRs); return SPOffset; } void Arm64Emitter::FillForPreserveAllABICall(bool FPRs) { const auto CanUseSVE256 = EmitterCTX->HostFeatures.SupportsSVE256; std::span DynamicGPRs {}; std::span DynamicFPRs {}; uint32_t PreserveSRAMask {}; uint32_t PreserveSRAFPRMask {}; if (EmitterCTX->Config.Is64BitMode()) { DynamicGPRs = x64::PreserveAll_Dynamic; DynamicFPRs = x64::PreserveAll_DynamicFPR; PreserveSRAMask = x64::PreserveAll_SRAMask; PreserveSRAFPRMask = x64::PreserveAll_SRAFPRMask; if (CanUseSVE256) { DynamicFPRs = x64::PreserveAll_DynamicFPRSVE; PreserveSRAFPRMask = x64::PreserveAll_SRAFPRSVEMask; } } else { DynamicGPRs = x32::PreserveAll_Dynamic; DynamicFPRs = x32::PreserveAll_DynamicFPR; PreserveSRAMask = x32::PreserveAll_SRAMask; PreserveSRAFPRMask = x32::PreserveAll_SRAFPRMask; if (CanUseSVE256) { DynamicFPRs = x32::PreserveAll_DynamicFPRSVE; PreserveSRAFPRMask = x32::PreserveAll_SRAFPRSVEMask; } } // Fill the static registers. FillStaticRegs({ .GPRFillMask = PreserveSRAMask, .FPRFillMask = PreserveSRAFPRMask, .FPRs = FPRs, }); // Pop the vector registers. PopVectorRegisters(CanUseSVE256, DynamicFPRs); // Pop the general registers. 
PopGeneralRegisters(DynamicGPRs); } void Arm64Emitter::Align16B() { uint64_t CurrentOffset = GetCursorAddress(); for (uint64_t i = (-CurrentOffset & 0xF); i != 0; i -= 4) { nop(); } } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #ifdef VIXL_DISASSEMBLER #include #include #include #endif #ifdef VIXL_SIMULATOR #include #include #endif #include #include #include #include #include #include namespace FEXCore::Context { class ContextImpl; } namespace FEXCore::X86State { enum X86Reg : uint32_t; } namespace FEXCore::CPU { // Contains the address to the currently available CPU state constexpr auto STATE = ARMEmitter::XReg::x28; #ifndef ARCHITECTURE_arm64ec // GPR temporaries. Only x3 can be used across spill boundaries // so if these ever need to change, be very careful about that. constexpr auto TMP1 = ARMEmitter::XReg::x0; constexpr auto TMP2 = ARMEmitter::XReg::x1; constexpr auto TMP3 = ARMEmitter::XReg::x2; constexpr auto TMP4 = ARMEmitter::XReg::x3; constexpr bool TMP_ABIARGS = true; // We pin r26/r27 as PF/AF respectively, this is internal FEX ABI. constexpr auto REG_PF = ARMEmitter::Reg::r26; constexpr auto REG_AF = ARMEmitter::Reg::r27; constexpr auto REG_CALLRET_SP = ARMEmitter::XReg::x25; // Vector temporaries constexpr auto VTMP1 = ARMEmitter::VReg::v0; constexpr auto VTMP2 = ARMEmitter::VReg::v1; // Predicate register for X87 SVE Optimization constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; #else constexpr auto TMP1 = ARMEmitter::XReg::x10; constexpr auto TMP2 = ARMEmitter::XReg::x11; constexpr auto TMP3 = ARMEmitter::XReg::x12; constexpr auto TMP4 = ARMEmitter::XReg::x13; constexpr bool TMP_ABIARGS = false; // We pin r11/r12 as PF/AF respectively for arm64ec, as r26/r27 are used for SRA. 
constexpr auto REG_PF = ARMEmitter::Reg::r9; constexpr auto REG_AF = ARMEmitter::Reg::r24; constexpr auto REG_CALLRET_SP = ARMEmitter::XReg::x17; // Vector temporaries constexpr auto VTMP1 = ARMEmitter::VReg::v16; constexpr auto VTMP2 = ARMEmitter::VReg::v17; // Entry/Exit ABI constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9; constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17; // Predicate register for X87 SVE Optimization constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2; // These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here. constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788; constexpr size_t TEB_PEB_OFFSET = 0x60; constexpr size_t PEB_EC_CODE_BITMAP_OFFSET = 0x368; constexpr size_t CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET = 0x1; constexpr size_t CPU_AREA_EMULATOR_STACK_BASE_OFFSET = 0x8; constexpr size_t CPU_AREA_EMULATOR_DATA_OFFSET = 0x30; constexpr uint64_t EC_CODE_BITMAP_MAX_ADDRESS = 1ULL << 47; #endif // Will force one single instruction block to be generated first if set when entering the JIT filling SRA. // FillStaticRegs must preserve this constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP2; // Predicate to use in the X87 SVE optimization constexpr ARMEmitter::PRegister PRED_X87_SVEOPT = ARMEmitter::PReg::p2; // Predicate register temporaries (used when AVX support is enabled) // PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1. // PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1. 
constexpr ARMEmitter::PRegister PRED_TMP_16B = ARMEmitter::PReg::p6; constexpr ARMEmitter::PRegister PRED_TMP_32B = ARMEmitter::PReg::p7; // This class contains common emitter utility functions that can // be used by both Arm64 JIT and ARM64 Dispatcher class Arm64Emitter : public ARMEmitter::Emitter { public: Arm64Emitter(FEXCore::Context::ContextImpl* ctx, void* EmissionPtr = nullptr, size_t size = 0); enum class PadType { // Explicitly does not need padding, even if code-caching is enabled. NOPAD, // Explicitly needs padding, even if code-caching is disabled. DOPAD, // Choose to pad or not depending on if code-caching is enabled. AUTOPAD, }; void LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, uint64_t Constant, PadType Pad = PadType::NOPAD, int MaxBytes = 0); protected: FEXCore::Context::ContextImpl* EmitterCTX; std::span StaticRegisters {}; std::span GeneralRegisters {}; std::span GeneralRegistersNotPreserved {}; std::span StaticFPRegisters {}; std::span GeneralFPRegisters {}; uint32_t PairRegisters = 0; void FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Register TmpReg2, bool SetFIZ, bool SetPredRegs); // Correlate an ARM register back to an x86 register index. // Returning REG_INVALID if there was no mapping. 
FEXCore::X86State::X86Reg GetX86RegRelationToARMReg(ARMEmitter::Register Reg); struct SpillStaticRegOptions final { uint32_t GPRSpillMask {~0U}; uint32_t FPRSpillMask {~0U}; bool FPRs {true}; bool NZCV {true}; }; struct FillStaticRegOptions final { std::optional OptionalReg {std::nullopt}; std::optional OptionalReg2 {std::nullopt}; uint32_t GPRFillMask {~0U}; uint32_t FPRFillMask {~0U}; bool FPRs {true}; bool NZCV {true}; }; void SpillStaticRegs(ARMEmitter::Register TmpReg, SpillStaticRegOptions Options); void FillStaticRegs(FillStaticRegOptions Options); void SpillStaticRegs(ARMEmitter::Register TmpReg) { // Work around a clang bug: https://bugs.llvm.org/show_bug.cgi?id=36684 SpillStaticRegs(TmpReg, {}); } void FillStaticRegs() { // Work around a clang bug: https://bugs.llvm.org/show_bug.cgi?id=36684 FillStaticRegs({}); } // Register 0-18 + 29 + 30 are caller saved static constexpr uint32_t CALLER_GPR_MASK = 0b0110'0000'0000'0111'1111'1111'1111'1111U; // This isn't technically true because the lower 64-bits of v8..v15 are callee saved // We can't guarantee only the lower 64bits are used so flush everything static constexpr uint32_t CALLER_FPR_MASK = ~0U; // Generic push and pop vector registers. void PushVectorRegisters(ARMEmitter::Register TmpReg, bool SVERegs, std::span VRegs); void PushGeneralRegisters(ARMEmitter::Register TmpReg, std::span Regs); void PopVectorRegisters(bool SVERegs, std::span VRegs); void PopGeneralRegisters(std::span Regs); // Returns stack size consumed for pushing dynamic registers. size_t PushDynamicRegs(ARMEmitter::Register TmpReg); void PopDynamicRegs(); void PushCalleeSavedRegisters(); void PopCalleeSavedRegisters(); // Spills and fills SRA/Dynamic registers that are required for Arm64 `preserve_all` ABI. // This ABI changes most registers to be callee saved. // Caller Saved: // - X0-X8, X16-X18, X30. 
// - v0-v7 // - For 256-bit SVE hosts: top 128-bits of v8-v31 // // Callee Saved: // - X9-X15, X19-X29, X31 // - Low 128-bits of v8-v31 size_t SpillForPreserveAllABICall(ARMEmitter::Register TmpReg, bool FPRs = true); void FillForPreserveAllABICall(bool FPRs = true); size_t SpillForABICall(bool SupportsPreserveAllABI, ARMEmitter::Register TmpReg, bool FPRs = true) { if (SupportsPreserveAllABI) { return SpillForPreserveAllABICall(TmpReg, FPRs); } else { SpillStaticRegs(TmpReg, { .FPRs = FPRs, }); return PushDynamicRegs(TmpReg); } } void FillForABICall(bool SupportsPreserveAllABI, bool FPRs = true) { if (SupportsPreserveAllABI) { FillForPreserveAllABICall(FPRs); } else { PopDynamicRegs(); FillStaticRegs({.FPRs = FPRs}); } } void Align16B(); #ifdef VIXL_SIMULATOR // Generates a vixl simulator runtime call. // // This matches behaviour of vixl's macro assembler, but we need to reimplement it since we aren't using the macro assembler. // This isn't too complex with how vixl emits this. // // Emit: // 1) hlt(kRuntimeCallOpcode) // 2) Simulator wrapper handler // 3) Function to call // 4) Style of the function call (Call versus tail-call) template void GenerateRuntimeCall(R (*Function)(P...)) { uintptr_t SimulatorWrapperAddress = reinterpret_cast(&(vixl::aarch64::Simulator::RuntimeCallStructHelper::Wrapper)); uintptr_t FunctionAddress = reinterpret_cast(Function); hlt(vixl::aarch64::kRuntimeCallOpcode); // Simulator wrapper address pointer. dc64(SimulatorWrapperAddress); // Runtime function address to call dc64(FunctionAddress); // Call type dc32(vixl::aarch64::kCallRuntime); } template void GenerateIndirectRuntimeCall(ARMEmitter::Register Reg) { uintptr_t SimulatorWrapperAddress = reinterpret_cast(&(vixl::aarch64::Simulator::RuntimeCallStructHelper::Wrapper)); hlt(vixl::aarch64::kIndirectRuntimeCallOpcode); // Simulator wrapper address pointer. 
dc64(SimulatorWrapperAddress); // Register that contains the function to call dc32(Reg.Idx()); // Call type dc32(vixl::aarch64::kCallRuntime); } template<> void GenerateIndirectRuntimeCall(ARMEmitter::Register Reg) { uintptr_t SimulatorWrapperAddress = reinterpret_cast(&(vixl::aarch64::Simulator::RuntimeCallStructHelper::Wrapper)); hlt(vixl::aarch64::kIndirectRuntimeCallOpcode); // Simulator wrapper address pointer. dc64(SimulatorWrapperAddress); // Register that contains the function to call dc32(Reg.Idx()); // Call type dc32(vixl::aarch64::kCallRuntime); } #else template void GenerateRuntimeCall(R (*Function)(P...)) { // Explicitly doing nothing. } template void GenerateIndirectRuntimeCall(ARMEmitter::Register Reg) { // Explicitly doing nothing. } #endif #ifdef VIXL_SIMULATOR vixl::aarch64::Decoder SimDecoder; vixl::aarch64::Simulator Simulator; constexpr static size_t SimulatorStackSize = 8 * 1024 * 1024; #endif #ifdef VIXL_DISASSEMBLER fextl::vector DisasmBuffer; constexpr static int DISASM_BUFFER_SIZE {256}; fextl::unique_ptr Disasm; fextl::unique_ptr DisasmDecoder; FEX_CONFIG_OPT(Disassemble, DISASSEMBLE); #endif FEX_CONFIG_OPT(EnableCodeCaching, ENABLECODECACHINGWIP); }; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/CPUBackend.cpp ================================================ // SPDX-License-Identifier: MIT #include "FEXCore/Config/Config.h" #include "Interface/Context/Context.h" #include "Interface/Core/CPUBackend.h" #include "Interface/Core/LookupCache.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include #include #include #include #ifndef _WIN32 #include #include #endif namespace FEXCore { namespace CPU { static constexpr size_t INITIAL_CODE_SIZE = 1024 * 1024 * 16; // We don't want to move above 128MB atm because that means we will have to encode longer jumps static constexpr size_t MAX_CODE_SIZE = 1024 * 1024 * 128; constexpr static uint64_t 
NamedVectorConstants[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_CONST_POOL_MAX][2] = { {0x0003'0002'0001'0000ULL, 0x0007'0006'0005'0004ULL}, // NAMED_VECTOR_INCREMENTAL_U16_INDEX {0x000B'000A'0009'0008ULL, 0x000F'000E'000D'000CULL}, // NAMED_VECTOR_INCREMENTAL_U16_INDEX_UPPER {0x0000'0000'8000'0000ULL, 0x0000'0000'8000'0000ULL}, // NAMED_VECTOR_PADDSUBPS_INVERT {0x0000'0000'8000'0000ULL, 0x0000'0000'8000'0000ULL}, // NAMED_VECTOR_PADDSUBPS_INVERT_UPPER {0x8000'0000'0000'0000ULL, 0x0000'0000'0000'0000ULL}, // NAMED_VECTOR_PADDSUBPD_INVERT {0x8000'0000'0000'0000ULL, 0x0000'0000'0000'0000ULL}, // NAMED_VECTOR_PADDSUBPD_INVERT_UPPER {0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_PSUBADDPS_INVERT {0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_PSUBADDPS_INVERT_UPPER {0x0000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_PSUBADDPD_INVERT {0x0000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_PSUBADDPD_INVERT_UPPER {0x0000'0001'0000'0000ULL, 0x0000'0003'0000'0002ULL}, // NAMED_VECTOR_MOVMSKPS_SHIFT {0x040B'0E01'0B0E'0104ULL, 0x0C03'0609'0306'090CULL}, // NAMED_VECTOR_AESKEYGENASSIST_SWIZZLE {0x0706'0504'FFFF'FFFFULL, 0xFFFF'FFFF'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_0110B {0x0706'0504'0302'0100ULL, 0xFFFF'FFFF'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_0111B {0xFFFF'FFFF'0302'0100ULL, 0x0F0E'0D0C'FFFF'FFFFULL}, // NAMED_VECTOR_BLENDPS_1001B {0x0706'0504'0302'0100ULL, 0x0F0E'0D0C'FFFF'FFFFULL}, // NAMED_VECTOR_BLENDPS_1011B {0xFFFF'FFFF'0302'0100ULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1101B {0x0706'0504'FFFF'FFFFULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1110B {0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB {0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB_UPPER {0x8000'0000'0000'0000ULL, 0x0000'0000'0000'3FFFULL}, // NAMED_VECTOR_X87_ONE {0xD49A'784B'CD1B'8AFEULL, 0x0000'0000'0000'4000ULL}, // 
NAMED_VECTOR_X87_LOG2_10 {0xB8AA'3B29'5C17'F0BCULL, 0x0000'0000'0000'3FFFULL}, // NAMED_VECTOR_X87_LOG2_E {0xC90F'DAA2'2168'C235ULL, 0x0000'0000'0000'4000ULL}, // NAMED_VECTOR_X87_PI {0x9A20'9A84'FBCF'F799ULL, 0x0000'0000'0000'3FFDULL}, // NAMED_VECTOR_X87_LOG10_2 {0xB172'17F7'D1CF'79ACULL, 0x0000'0000'0000'3FFEULL}, // NAMED_VECTOR_X87_LOG_2 {0x4F00'0000'4F00'0000ULL, 0x4F00'0000'4F00'0000ULL}, // NAMED_VECTOR_CVTMAX_F32_I32 {0x4F00'0000'4F00'0000ULL, 0x4F00'0000'4F00'0000ULL}, // NAMED_VECTOR_CVTMAX_F32_I32_UPPER {0x5F00'0000'5F00'0000ULL, 0x5F00'0000'5F00'0000ULL}, // NAMED_VECTOR_CVTMAX_F32_I64 {0x41E0'0000'0000'0000ULL, 0x41E0'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_F64_I32 {0x41E0'0000'0000'0000ULL, 0x41E0'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_F64_I32_UPPER {0x43E0'0000'0000'0000ULL, 0x43E0'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_F64_I64 {0x8000'0000'8000'0000ULL, 0x8000'0000'8000'0000ULL}, // NAMED_VECTOR_CVTMAX_I32 {0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_I64 {0x0000'0000'0000'0000ULL, 0x0000'0000'0000'8000ULL}, // NAMED_VECTOR_F80_SIGN_MASK {0x5A82'7999'5A82'7999ULL, 0x5A82'7999'5A82'7999ULL}, // NAMED_VECTOR_SHA1RNDS_K0 {0x6ED9'EBA1'6ED9'EBA1ULL, 0x6ED9'EBA1'6ED9'EBA1ULL}, // NAMED_VECTOR_SHA1RNDS_K1 {0x8F1B'BCDC'8F1B'BCDCULL, 0x8F1B'BCDC'8F1B'BCDCULL}, // NAMED_VECTOR_SHA1RNDS_K2 {0xCA62'C1D6'CA62'C1D6ULL, 0xCA62'C1D6'CA62'C1D6ULL}, // NAMED_VECTOR_SHA1RNDS_K3 }; constexpr static auto PSHUFLW_LUT {[]() consteval { struct LUTType { uint64_t Val[2]; }; // Expectation for this LUT is to simulate PSHUFLW with ARM's TBL (single register) instruction // PSHUFLW behaviour: // 16-bit words in [63:48], [47:32], [31:16], [15:0] are selected using the 8-bit Index. // For 128-bit PSHUFLW, bits [127:64] are identity copied. 
constexpr uint64_t IdentityCopyUpper = 0x0f'0e'0d'0c'0b'0a'09'08; std::array TotalLUT {}; uint64_t WordSelection[4] = { 0x01'00, 0x03'02, 0x05'04, 0x07'06, }; for (size_t i = 0; i < 256; ++i) { auto& LUT = TotalLUT[i]; const auto Word0 = (i >> 0) & 0b11; const auto Word1 = (i >> 2) & 0b11; const auto Word2 = (i >> 4) & 0b11; const auto Word3 = (i >> 6) & 0b11; LUT.Val[0] = (WordSelection[Word0] << 0) | (WordSelection[Word1] << 16) | (WordSelection[Word2] << 32) | (WordSelection[Word3] << 48); LUT.Val[1] = IdentityCopyUpper; } return TotalLUT; }()}; constexpr static auto PSHUFHW_LUT {[]() consteval { struct LUTType { uint64_t Val[2]; }; // Expectation for this LUT is to simulate PSHUFHW with ARM's TBL (single register) instruction // PSHUFHW behaviour: // 16-bit words in [127:112], [111:96], [95:80], [79:64] are selected using the 8-bit Index. // Incoming words come from bits [127:64] of the source. // Bits [63:0] are identity copied. constexpr uint64_t IdentityCopyLower = 0x07'06'05'04'03'02'01'00; std::array TotalLUT {}; uint64_t WordSelection[4] = { 0x09'08, 0x0b'0a, 0x0d'0c, 0x0f'0e, }; for (size_t i = 0; i < 256; ++i) { auto& LUT = TotalLUT[i]; const auto Word0 = (i >> 0) & 0b11; const auto Word1 = (i >> 2) & 0b11; const auto Word2 = (i >> 4) & 0b11; const auto Word3 = (i >> 6) & 0b11; LUT.Val[0] = IdentityCopyLower; LUT.Val[1] = (WordSelection[Word0] << 0) | (WordSelection[Word1] << 16) | (WordSelection[Word2] << 32) | (WordSelection[Word3] << 48); } return TotalLUT; }()}; constexpr static auto PSHUFD_LUT {[]() consteval { struct LUTType { uint64_t Val[2]; }; // Expectation for this LUT is to simulate PSHUFD with ARM's TBL (single register) instruction // PSHUFD behaviour: // 32-bit words in [127:96], [95:64], [63:32], [31:0] are selected using the 8-bit Index. 
std::array TotalLUT {}; uint64_t WordSelection[4] = { 0x03'02'01'00, 0x07'06'05'04, 0x0b'0a'09'08, 0x0f'0e'0d'0c, }; for (size_t i = 0; i < 256; ++i) { auto& LUT = TotalLUT[i]; const auto Word0 = (i >> 0) & 0b11; const auto Word1 = (i >> 2) & 0b11; const auto Word2 = (i >> 4) & 0b11; const auto Word3 = (i >> 6) & 0b11; LUT.Val[0] = (WordSelection[Word0] << 0) | (WordSelection[Word1] << 32); LUT.Val[1] = (WordSelection[Word2] << 0) | (WordSelection[Word3] << 32); } return TotalLUT; }()}; constexpr static auto SHUFPS_LUT {[]() consteval { struct LUTType { uint64_t Val[2]; }; // 32-bit words in [127:96], [95:64], [63:32], [31:0] are selected using the 8-bit Index. // Expectation for this LUT is to simulate SHUFPS with ARM's TBL (two register) instruction. // SHUFPS behaviour: // Two 32-bits words from each source are selected from each source in the lower and upper halves of the 128-bit destination. // Dest[31:0] = Src1[] // Dest[63:32] = Src1[] // Dest[95:64] = Src2[] // Dest[127:96] = Src2[] std::array TotalLUT {}; const uint64_t WordSelectionSrc1[4] = { 0x03'02'01'00, 0x07'06'05'04, 0x0b'0a'09'08, 0x0f'0e'0d'0c, }; // Src2 needs to offset each byte index by 16-bytes to pull from the second source. 
const uint64_t WordSelectionSrc2[4] = { 0x03'02'01'00 + (0x10101010), 0x07'06'05'04 + (0x10101010), 0x0b'0a'09'08 + (0x10101010), 0x0f'0e'0d'0c + (0x10101010), }; for (size_t i = 0; i < 256; ++i) { auto& LUT = TotalLUT[i]; const auto Word0 = (i >> 0) & 0b11; const auto Word1 = (i >> 2) & 0b11; const auto Word2 = (i >> 4) & 0b11; const auto Word3 = (i >> 6) & 0b11; LUT.Val[0] = (WordSelectionSrc1[Word0] << 0) | (WordSelectionSrc1[Word1] << 32); LUT.Val[1] = (WordSelectionSrc2[Word2] << 0) | (WordSelectionSrc2[Word3] << 32); } return TotalLUT; }()}; constexpr static auto DPPS_MASK {[]() consteval { struct LUTType { uint32_t Val[4]; }; std::array TotalLUT {}; for (size_t i = 0; i < TotalLUT.size(); ++i) { auto& LUT = TotalLUT[i]; constexpr auto GetLUT = [](size_t i, size_t Index) { if (i & (1U << Index)) { return -1U; } return 0U; }; LUT.Val[0] = GetLUT(i, 0); LUT.Val[1] = GetLUT(i, 1); LUT.Val[2] = GetLUT(i, 2); LUT.Val[3] = GetLUT(i, 3); } return TotalLUT; }()}; constexpr static auto DPPD_MASK {[]() consteval { struct LUTType { uint64_t Val[2]; }; std::array TotalLUT {}; for (size_t i = 0; i < TotalLUT.size(); ++i) { auto& LUT = TotalLUT[i]; constexpr auto GetLUT = [](size_t i, size_t Index) { if (i & (1U << Index)) { return -1ULL; } return 0ULL; }; LUT.Val[0] = GetLUT(i, 0); LUT.Val[1] = GetLUT(i, 1); } return TotalLUT; }()}; constexpr static auto PBLENDW_LUT {[]() consteval { struct LUTType { uint16_t Val[8]; }; // 16-bit words in [127:112], [111:96], [95:80], [79:64], [63:48], [47:32], [31:16], [15:0] are selected using 8-bit swizzle. // Expectation for this LUT is to simulate PBLENDW with ARM's TBX (one register) instruction. // PBLENDW behaviour: // 16-bit words from the source is moved in to the destination based on the bit in the swizzle. // Dest[15:0] = Swizzle[0] ? Src[15:0] : Dest[15:0] // Dest[31:16] = Swizzle[1] ? Src[31:16] : Dest[31:16] // Dest[47:32] = Swizzle[2] ? Src[47:32] : Dest[47:32] // Dest[63:48] = Swizzle[3] ? 
Src[63:48] : Dest[63:48] // Dest[79:64] = Swizzle[4] ? Src[79:64] : Dest[79:64] // Dest[95:80] = Swizzle[5] ? Src[95:80] : Dest[95:80] // Dest[111:96] = Swizzle[6] ? Src[111:96] : Dest[111:96] // Dest[127:112] = Swizzle[7] ? Src[127:112] : Dest[127:112] std::array TotalLUT {}; const uint16_t WordSelectionSrc[8] = { 0x01'00, 0x03'02, 0x05'04, 0x07'06, 0x09'08, 0x0B'0A, 0x0D'0C, 0x0F'0E, }; constexpr uint16_t OriginalDest = 0xFF'FF; for (size_t i = 0; i < 256; ++i) { auto& LUT = TotalLUT[i]; for (size_t j = 0; j < 8; ++j) { LUT.Val[j] = ((i >> j) & 1) ? WordSelectionSrc[j] : OriginalDest; } } return TotalLUT; }()}; CPUBackend::CPUBackend(CodeBufferManager& CodeBuffers, FEXCore::Core::InternalThreadState* ThreadState) : ThreadState(ThreadState) , CodeBuffers(CodeBuffers) { auto& Ptrs = ThreadState->CurrentFrame->Pointers; // Initialize named vector constants. for (size_t i = 0; i < FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_CONST_POOL_MAX; ++i) { Ptrs.NamedVectorConstantPointers[i] = reinterpret_cast(NamedVectorConstants[i]); } // Copy named vector constants. memcpy(Ptrs.NamedVectorConstants, NamedVectorConstants, sizeof(NamedVectorConstants)); // Initialize Indexed named vector constants. 
Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW] = reinterpret_cast(PSHUFLW_LUT.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW] = reinterpret_cast(PSHUFHW_LUT.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD] = reinterpret_cast(PSHUFD_LUT.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS] = reinterpret_cast(SHUFPS_LUT.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPS_MASK] = reinterpret_cast(DPPS_MASK.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPD_MASK] = reinterpret_cast(DPPD_MASK.data()); Ptrs.IndexedNamedVectorConstantPointers[FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PBLENDW] = reinterpret_cast(PBLENDW_LUT.data()); #ifndef FEX_DISABLE_TELEMETRY // Fill in telemetry values for (size_t i = 0; i < FEXCore::Telemetry::TYPE_LAST; ++i) { auto& Telem = FEXCore::Telemetry::GetTelemetryValue(static_cast(i)); Ptrs.TelemetryValueAddresses[i] = reinterpret_cast(&Telem); } #endif } CPUBackend::~CPUBackend() = default; auto CPUBackend::GetEmptyCodeBuffer() -> CodeBuffer* { auto PrevCodeBuffer = CurrentCodeBuffer; // Resize the code buffer and reallocate our code size CurrentCodeBuffer = CodeBuffers.StartLargerCodeBuffer(); RegisterForSignalHandler(std::move(PrevCodeBuffer)); return CurrentCodeBuffer.get(); } void CPUBackend::RegisterForSignalHandler(fextl::shared_ptr CodeBuffer) { if (ThreadState->CurrentFrame->SignalHandlerRefCounter != 0) { // We have signal handlers that have generated code // This means that we can not safely clear the code at this point in time // Keep a reference to the old code buffer to delay deallocation 
SignalHandlerCodeBuffers.push_back(std::move(CodeBuffer)); } else { SignalHandlerCodeBuffers.clear(); } } fextl::shared_ptr CPUBackend::CheckCodeBufferUpdate() { auto NewCodeBuffer = CodeBuffers.GetLatest(); if (CurrentCodeBuffer != NewCodeBuffer) { RegisterForSignalHandler(CurrentCodeBuffer); return std::exchange(CurrentCodeBuffer, NewCodeBuffer); } return nullptr; } GuestToHostMap& GetLookupCache(const CodeBuffer& Buffer) { return *Buffer.LookupCache; } CodeBuffer::CodeBuffer(size_t Size) : AllocatedSize(Size) { Ptr = static_cast(FEXCore::Allocator::VirtualAlloc(Size, true)); LOGMAN_THROW_A_FMT(!!Ptr, "Couldn't allocate code buffer"); // Protect the last page of the allocated buffer to trigger SIGSEGV on write access uintptr_t LastPageAddr = AlignDown(reinterpret_cast(Ptr) + Size - 1, FEXCore::Utils::FEX_PAGE_SIZE); if (!FEXCore::Allocator::VirtualProtect(reinterpret_cast(LastPageAddr), FEXCore::Utils::FEX_PAGE_SIZE, FEXCore::Allocator::ProtectOptions::None)) { LogMan::Msg::EFmt("Failed to mprotect last page of code buffer."); } FEXCore::Allocator::VirtualName("FEXMemJIT", reinterpret_cast(Ptr), Size); // Huge-pages reduce the amount of iTLB misses dramatically when it works. FEXCore::Allocator::VirtualTHPControl(reinterpret_cast(Ptr), Size, FEXCore::Allocator::THPControl::Enable); LookupCache = fextl::make_unique(); } CodeBuffer::~CodeBuffer() { FEXCore::Allocator::VirtualFree(Ptr, AllocatedSize); } auto CodeBufferManager::AllocateNew(size_t Size) -> fextl::shared_ptr { #ifndef _WIN32 // MDWE (Memory-Deny-Write-Execute) is a new Linux 6.3 feature. // It's equivalent to systemd's `MemoryDenyWriteExecute` but implemented entirely in the kernel. // // MDWE prevents applications from creating RWX memory mappings. // This prevents FEX from doing anything JIT related, as FEX uses RWX for JIT memory mappings. // // A potential workaround to make FEX work with MDWE is to call mprotect every time we need to write or modify code. 
// Alternatively, FEX could use a memory mirror where one half is mapped as RW and the other is RX. // // Once MDWE is enabled with the prctl, the feature is sealed and it can /NOT/ be turned off. // // Status of MDWE is queried through prctl using `PR_GET_MDWE`: // -1: The kernel doesn't support MDWE // 0: MDWE is supported but disabled // >0: MDWE is enabled, hence prohibiting RWX mappings #ifndef PR_GET_MDWE #define PR_GET_MDWE 66 #endif int MDWE = ::prctl(PR_GET_MDWE, 0, 0, 0, 0); if (MDWE != -1 && MDWE != 0) { LogMan::Msg::EFmt("MDWE was set to 0x{:x} which means FEX can't allocate executable memory", MDWE); } #endif auto Buffer = fextl::make_shared(Size); Latest = Buffer; LatestOffset = 0; OnCodeBufferAllocated(Buffer); return Buffer; } fextl::shared_ptr CodeBufferManager::GetLatest() { if (!Latest) { if (FEXCore::Config::Get_ENABLECODECACHINGWIP()) { // Start with a larger code buffer to avoid resizes that would discard // code loaded from caches AllocateNew(MAX_CODE_SIZE); } else { AllocateNew(INITIAL_CODE_SIZE); } } return Latest; } fextl::shared_ptr CodeBufferManager::StartLargerCodeBuffer() { if (!Latest) { // Allocate initial CodeBuffer and return it return GetLatest(); } auto NewCodeBufferSize = GetLatest()->AllocatedSize; NewCodeBufferSize = std::min(NewCodeBufferSize * 2, MAX_CODE_SIZE); return AllocateNew(NewCodeBufferSize); } bool CPUBackend::IsAddressInCodeBuffer(uintptr_t Address) const { auto CheckCodeBuffer = [](CodeBuffer& Buffer, uintptr_t Address) { // The last page of the code buffer is protected, so we need to exclude it from the valid range // when checking if the address is in the code buffer. 
uintptr_t LastPageAddr = AlignDown(reinterpret_cast(Buffer.Ptr) + Buffer.AllocatedSize - 1, FEXCore::Utils::FEX_PAGE_SIZE); return (Address >= reinterpret_cast(Buffer.Ptr) && Address < LastPageAddr); }; if (CheckCodeBuffer(*CurrentCodeBuffer, Address)) { return true; } for (auto& Buffer : SignalHandlerCodeBuffers) { if (CheckCodeBuffer(*Buffer, Address)) { return true; } } return false; } } // namespace CPU } // namespace FEXCore ================================================ FILE: FEXCore/Source/Interface/Core/CPUBackend.h ================================================ // SPDX-License-Identifier: MIT /* $info$ category: backend ~ IR to host code generation tags: backend|shared $end_info$ */ #pragma once #include #include #include #include #include #include #include namespace FEXCore::CPU { union Relocation; } namespace FEXCore { namespace IR { class IRListView; } // namespace IR namespace Core { struct DebugData; struct ThreadState; struct CpuStateFrame; struct InternalThreadState; } // namespace Core namespace CodeSerialize { struct CodeObjectFileSection; } struct GuestToHostMap; namespace CPU { struct CodeBuffer { uint8_t* Ptr; size_t AllocatedSize; // including guard page; see UsableSize() fextl::unique_ptr LookupCache; CodeBuffer(size_t Size); CodeBuffer(const CodeBuffer&) = delete; CodeBuffer& operator=(const CodeBuffer&) = delete; CodeBuffer(CodeBuffer&& oth) = delete; CodeBuffer& operator=(CodeBuffer&&) = delete; ~CodeBuffer(); /// Returns the number of bytes available for storing code size_t UsableSize() const { return AllocatedSize - FEXCore::Utils::FEX_PAGE_SIZE; } }; /** * A manager that coordinates access to the CodeBuffer used for compiling new code across threads. 
* * The CodeBuffer is managed as a partially persistent data structure: * - Exactly one CodeBuffer is now designated as "active", which means data can be appended to it * - Lossy modifications to the active CodeBuffer will not invalidate any data in use by other threads (which is what enables save CodeBuffer sharing across threads) * - Instead, such lossy modifications trigger a new "version" of the data in the modifying thread. Old versions of the CodeBuffer persist as read-only data for use by the other threads. * - The other threads can update their version of the CodeBuffer. This will decrease the reference count and eventually trigger deallocation of the old version */ class CodeBufferManager { public: // Get the CodeBuffer that was most recently allocated. // This is the only CodeBuffer that data may be written to. fextl::shared_ptr GetLatest(); // Allocate a new CodeBuffer with geometric growth up to an internal maximum. // Subsequent calls to GetLatest will point to the returned buffer. fextl::shared_ptr StartLargerCodeBuffer(); // Write offset into the latest CodeBuffer std::size_t LatestOffset {}; // Protects writes to the latest CodeBuffer and changes to LatestOffset FEXCore::ForkableUniqueMutex CodeBufferWriteMutex; virtual void OnCodeBufferAllocated(const std::shared_ptr&) {}; private: fextl::shared_ptr Latest; fextl::shared_ptr AllocateNew(size_t Size); }; class CPUBackend { public: CPUBackend(CodeBufferManager&, FEXCore::Core::InternalThreadState*); virtual ~CPUBackend(); struct CompiledCode { // Where this code block begins. uint8_t* BlockBegin; fextl::map EntryPoints; // The total size of the codeblock from [BlockBegin, BlockBegin+Size). size_t Size; }; // Header that can live at the start of a JIT block. // We want the header to be quite small, with most data living in the tail object. struct JITCodeHeader { // Offset from the start of this header to where the tail lives. // Only 32-bit since the tail block won't ever be more than 4GB away. 
uint32_t OffsetToBlockTail; }; // Header that can live at the end of the JIT block. // For any state reconstruction or other data, this is where it should live. // Any data that is explicitly tied to the JIT code and needs to be cached with it // should end up in this data structure. struct JITCodeTail { // The total size of the codeblock from [BlockBegin, BlockBegin+Size). size_t Size; // RIP that the block's entry comes from. uint64_t RIP; // The length of the guest code for this block. size_t GuestSize; // Number of RIP entries for this JIT Code section. uint32_t NumberOfRIPEntries; // Offset after this block to the start of the RIP entries. uint32_t OffsetToRIPEntries; // Shared-code modification spin-loop futex. uint32_t SpinLockFutex; // If this block represents a single guest instruction. bool SingleInst; uint8_t _Pad[3]; }; /** * @brief Tells this CPUBackend to compile code for the provided IR and DebugData * * The returned pointer needs to be long lived and be executable in the host environment * FEXCore's frontend will store this pointer in to a cache for the current RIP when this was executed * * This is a thread specific compilation unit since there is one CPUBackend per guest thread * * @param Size - The byte size of the guest code for this block * @param SingleInst - If this block represents a single guest instruction * @param IR - IR that maps to the IR for this RIP * @param DebugData - Debug data that is available for this IR indirectly * @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block * * @return Information about the compiled code block. 
*/ [[nodiscard]] virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, bool CheckTF) = 0; virtual fextl::vector TakeRelocations(uint64_t GuestBaseAddress) = 0; virtual void ClearCache() {} /** * @brief Clear any relocations after JIT compiling */ virtual void ClearRelocations() {} bool IsAddressInCodeBuffer(uintptr_t Address) const; // Updates the CodeBuffer if needed and returns a reference to the old one. // The returned reference should be kept alive carefully to avoid early deletion of resources. [[nodiscard]] fextl::shared_ptr CheckCodeBufferUpdate(); protected: // Max spill slot size in bytes. We need at most 32 bytes // to be able to handle a 256-bit vector store to a slot. constexpr static uint32_t MaxSpillSlotSize = 32; FEXCore::Core::InternalThreadState* ThreadState; [[nodiscard]] CodeBuffer* GetEmptyCodeBuffer(); // This is the code buffer containing the main code under execution by this thread. // CheckCodeBufferUpdate must be used before compiling new code. 
fextl::shared_ptr CurrentCodeBuffer; // Old CodeBuffer generations required to be valid until returning from signal handlers fextl::vector> SignalHandlerCodeBuffers; CodeBufferManager& CodeBuffers; private: void RegisterForSignalHandler(fextl::shared_ptr); }; } // namespace CPU } // namespace FEXCore ================================================ FILE: FEXCore/Source/Interface/Core/CPUID.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: opcodes|cpuid desc: Handles presented capability bits for guest cpu $end_info$ */ #include "Common/StringConv.h" #include "Interface/Context/Context.h" #include "Interface/Core/CPUID.h" #include #include #include #include #include #include #include #include "git_version.h" #include namespace FEXCore { namespace ProductNames { #ifdef ARCHITECTURE_arm64 static const char ARM_UNKNOWN[] = "Unknown ARM CPU"; static const char ARM_A57[] = "Cortex-A57"; static const char ARM_A72[] = "Cortex-A72"; static const char ARM_A73[] = "Cortex-A73"; static const char ARM_A75[] = "Cortex-A75"; static const char ARM_A76[] = "Cortex-A76"; static const char ARM_A76AE[] = "Cortex-A76AE"; static const char ARM_V1[] = "Neoverse V1"; static const char ARM_V2[] = "Neoverse V2"; static const char ARM_V3[] = "Neoverse V3"; static const char ARM_V3AE[] = "Neoverse V3AE"; static const char ARM_A77[] = "Cortex-A77"; static const char ARM_A78[] = "Cortex-A78"; static const char ARM_A78AE[] = "Cortex-A78AE"; static const char ARM_A78C[] = "Cortex-A78C"; static const char ARM_A710[] = "Cortex-A710"; static const char ARM_A715[] = "Cortex-A715"; static const char ARM_A720[] = "Cortex-A720"; static const char ARM_A725[] = "Cortex-A725"; static const char ARM_C1Pro[] = "C1-Pro"; static const char ARM_C1Premium[] = "C1-Premium"; static const char ARM_X1[] = "Cortex-X1"; static const char ARM_X1C[] = "Cortex-X1C"; static const char ARM_X2[] = "Cortex-X2"; static const char ARM_X3[] = "Cortex-X3"; static const char 
ARM_X4[] = "Cortex-X4"; static const char ARM_X925[] = "Cortex-X925"; static const char ARM_C1Ultra[] = "C1-Ultra"; static const char ARM_N1[] = "Neoverse N1"; static const char ARM_N2[] = "Neoverse N2"; static const char ARM_N3[] = "Neoverse N3"; static const char ARM_E1[] = "Neoverse E1"; static const char ARM_A35[] = "Cortex-A35"; static const char ARM_A53[] = "Cortex-A53"; static const char ARM_A55[] = "Cortex-A55"; static const char ARM_A65[] = "Cortex-A65"; static const char ARM_A510[] = "Cortex-A510"; static const char ARM_A520[] = "Cortex-A520"; static const char ARM_C1Nano[] = "C1-Nano"; static const char ARM_Kryo200[] = "Kryo 2xx"; static const char ARM_Kryo300[] = "Kryo 3xx"; static const char ARM_Kryo400[] = "Kryo 4xx/5xx"; static const char ARM_Kryo200S[] = "Kryo 2xx S"; static const char ARM_Kryo300S[] = "Kryo 3xx S"; static const char ARM_Kryo400S[] = "Kryo 4xx/5xx S"; static const char ARM_Denver[] = "Nvidia Denver"; static const char ARM_Carmel[] = "Nvidia Carmel"; static const char ARM_Olympus[] = "Nvidia Olympus"; static const char ARM_Firestorm_M1[] = "Apple Firestorm (M1)"; static const char ARM_Icestorm_M1[] = "Apple Icestorm (M1)"; static const char ARM_Firestorm_M1Pro[] = "Apple Firestorm (M1 Pro)"; static const char ARM_Icestorm_M1Pro[] = "Apple Icestorm (M1 Pro)"; static const char ARM_Firestorm_M1Max[] = "Apple Firestorm (M1 Max)"; static const char ARM_Icestorm_M1Max[] = "Apple Icestorm (M1 Max)"; static const char ARM_Avalanche_M2[] = "Apple Avalanche (M2)"; static const char ARM_Blizzard_M2[] = "Apple Blizzard (M2)"; static const char ARM_Avalanche_M2Pro[] = "Apple Avalanche (M2 Pro)"; static const char ARM_Blizzard_M2Pro[] = "Apple Blizzard (M2 Pro)"; static const char ARM_Avalanche_M2Max[] = "Apple Avalanche (M2 Max)"; static const char ARM_Blizzard_M2Max[] = "Apple Blizzard (M2 Max)"; static const char ARM_AppleSilicon[] = "Apple Silicon"; static const char ARM_ORYON_1[] = "Oryon-1"; static const char ARM_Ampere_1[] = "AmpereOne"; 
static const char ARM_Ampere_1A[] = "AmpereOneA"; static const char ARM_Ampere_1B[] = "AmpereOneB"; static const char ARM_Ampere_1C[] = "AmpereOneC"; #endif } // namespace ProductNames uint32_t GetCPUID_Syscall() { uint32_t CPU {}; FHU::Syscalls::getcpu(&CPU, nullptr); return CPU; } struct CPUFamily { uint32_t Stepping : 4; uint32_t Model : 4; uint32_t ExtendedModel : 4; uint32_t FamilyID : 4; uint32_t ExtendedFamilyID : 8; uint32_t ProcessorType : 4; }; constexpr static uint32_t GenerateFamily(const CPUFamily Family) { return Family.Stepping | (Family.Model << 4) | (Family.FamilyID << 8) | (Family.ProcessorType << 12) | (Family.ExtendedModel << 16) | (Family.ExtendedFamilyID << 20); } #ifdef CPUID_AMD constexpr uint32_t FAMILY_IDENTIFIER = GenerateFamily(CPUFamily { .Stepping = 0, .Model = 0xA, .ExtendedModel = 0, .FamilyID = 0xF, .ExtendedFamilyID = 1, .ProcessorType = 0, }); #else constexpr uint32_t FAMILY_IDENTIFIER = GenerateFamily(CPUFamily { .Stepping = 1, .Model = 6, .ExtendedModel = 0xA, .FamilyID = 6, .ExtendedFamilyID = 0, .ProcessorType = 0, }); #endif #ifdef ARCHITECTURE_arm64 uint64_t GetCycleCounterFrequency() { uint64_t Result {}; __asm("mrs %[Res], CNTFRQ_EL0" : [Res] "=r"(Result)); return Result; } uint32_t GetCPUID_TPIDRRO() { uint64_t Result {}; __asm("mrs %[Res], TPIDRRO_EL0" : [Res] "=r"(Result)); return Result; } void CPUIDEmu::SetupHostHybridFlag() { FEX_CONFIG_OPT(HideHybrid, HIDEHYBRID); PerCPUData.resize(Cores); uint64_t MIDR {}; for (size_t i = 0; i < Cores; ++i) { auto NewMIDR = CTX->HostFeatures.CPUMIDRs[i]; if (MIDR != 0 && MIDR != NewMIDR) { // CPU mismatch, claim hybrid Hybrid = true; } // Truncate to 32-bits, top 32-bits are all reserved in MIDR PerCPUData[i].ProductName = ProductNames::ARM_UNKNOWN; PerCPUData[i].MIDR = NewMIDR; MIDR = NewMIDR; } if (HideHybrid()) { // Hide the hybrid flag. 
Hybrid = false; } struct CPUMIDR { uint8_t Implementer; uint16_t Part; bool DefaultBig; // Defaults to a big core const char* ProductName {}; }; // CPU priority order // This is mostly arbitrary but will sort by some sort of CPU priority by performance // Relative list so things they will commonly end up in big.little configurations sort of relate static constexpr std::array CPUMIDRs = {{ // Typically big CPU cores {0x51, 0x001, 1, ProductNames::ARM_ORYON_1}, // Qualcomm Oryon-1 {0x61, 0x039, 1, ProductNames::ARM_Avalanche_M2Max}, // Apple Avalanche (M2 Max) {0x61, 0x035, 1, ProductNames::ARM_Avalanche_M2Pro}, // Apple Avalanche (M2 Pro) {0x61, 0x033, 1, ProductNames::ARM_Avalanche_M2}, // Apple Avalanche (M2) {0x61, 0x029, 1, ProductNames::ARM_Firestorm_M1Max}, // Apple Firestorm (M1 Max) {0x61, 0x025, 1, ProductNames::ARM_Firestorm_M1Pro}, // Apple Firestorm (M1 Pro) {0x61, 0x023, 1, ProductNames::ARM_Firestorm_M1}, // Apple Firestorm (M1) {0x61, 0, 1, ProductNames::ARM_AppleSilicon}, // QEmu Apple Silicon {0x41, 0xd8c, 1, ProductNames::ARM_C1Ultra}, // C1-Ultra {0x41, 0xd90, 1, ProductNames::ARM_C1Premium}, // C1-Premium {0x41, 0xd8b, 1, ProductNames::ARM_C1Pro}, // C1-Pro {0x41, 0xd85, 1, ProductNames::ARM_X925}, // X925 {0x41, 0xd87, 1, ProductNames::ARM_A725}, // A725 {0x41, 0xd84, 1, ProductNames::ARM_V3}, // V3 {0x41, 0xd83, 1, ProductNames::ARM_V3AE}, // V3AE {0x41, 0xd8e, 1, ProductNames::ARM_N3}, // N3 {0x41, 0xd82, 1, ProductNames::ARM_X4}, // X4 {0x41, 0xd81, 1, ProductNames::ARM_A720}, // A720 {0x41, 0xd4e, 1, ProductNames::ARM_X3}, // X3 {0x41, 0xd4d, 1, ProductNames::ARM_A715}, // A715 {0x41, 0xd4f, 1, ProductNames::ARM_V2}, // V2 {0x41, 0xd4b, 1, ProductNames::ARM_A78C}, // A78C {0x41, 0xd4a, 1, ProductNames::ARM_E1}, // E1 {0x41, 0xd49, 1, ProductNames::ARM_N2}, // N2 {0x41, 0xd48, 1, ProductNames::ARM_X2}, // X2 {0x41, 0xd47, 1, ProductNames::ARM_A710}, // A710 {0x41, 0xd4C, 1, ProductNames::ARM_X1C}, // X1C {0x41, 0xd44, 1, 
ProductNames::ARM_X1}, // X1 {0x41, 0xd42, 1, ProductNames::ARM_A78AE}, // A78AE {0x41, 0xd41, 1, ProductNames::ARM_A78}, // A78 {0x41, 0xd40, 1, ProductNames::ARM_V1}, // V1 {0x41, 0xd0e, 1, ProductNames::ARM_A76AE}, // A76AE {0x41, 0xd0d, 1, ProductNames::ARM_A77}, // A77 {0x41, 0xd0c, 1, ProductNames::ARM_N1}, // N1 {0x41, 0xd0b, 1, ProductNames::ARM_A76}, // A76 {0x51, 0x804, 1, ProductNames::ARM_Kryo400}, // Kryo 4xx Gold (A76 based) {0x41, 0xd0a, 1, ProductNames::ARM_A75}, // A75 {0x51, 0x802, 1, ProductNames::ARM_Kryo300}, // Kryo 3xx Gold (A75 based) {0x41, 0xd09, 1, ProductNames::ARM_A73}, // A73 {0x51, 0x800, 1, ProductNames::ARM_Kryo200}, // Kryo 2xx Gold (A73 based) {0x41, 0xd08, 1, ProductNames::ARM_A72}, // A72 {0xc0, 0xac3, 1, ProductNames::ARM_Ampere_1}, // AmpereOne {0xc0, 0xac4, 1, ProductNames::ARM_Ampere_1A}, // AmpereOneA {0xc0, 0xac5, 1, ProductNames::ARM_Ampere_1B}, // AmpereOneB {0xc0, 0xac7, 1, ProductNames::ARM_Ampere_1C}, // AmpereOneC {0x4e, 0x010, 1, ProductNames::ARM_Olympus}, // Olympus {0x4e, 0x004, 1, ProductNames::ARM_Carmel}, // Carmel // Denver rated above A57 to match TX2 weirdness {0x4e, 0x003, 1, ProductNames::ARM_Denver}, // Denver {0x41, 0xd07, 1, ProductNames::ARM_A57}, // A57 // Typically Little CPU cores {0x61, 0x038, 0, ProductNames::ARM_Blizzard_M2Max}, // Apple Blizzard (M2 Max) {0x61, 0x034, 0, ProductNames::ARM_Blizzard_M2Pro}, // Apple Blizzard (M2 Pro) {0x61, 0x032, 0, ProductNames::ARM_Blizzard_M2}, // Apple Blizzard (M2) {0x61, 0x028, 0, ProductNames::ARM_Icestorm_M1Max}, // Apple Icestorm (M1 Max) {0x61, 0x024, 0, ProductNames::ARM_Icestorm_M1Pro}, // Apple Icestorm (M1 Pro) {0x61, 0x022, 0, ProductNames::ARM_Icestorm_M1}, // Apple Icestorm (M1) {0x41, 0xd8a, 1, ProductNames::ARM_C1Nano}, // C1-Nano {0x41, 0xd80, 0, ProductNames::ARM_A520}, // A520 {0x41, 0xd46, 0, ProductNames::ARM_A510}, // A510 {0x41, 0xd06, 0, ProductNames::ARM_A65}, // A65 {0x41, 0xd05, 0, ProductNames::ARM_A55}, // A55 {0x51, 0x805, 0, 
ProductNames::ARM_Kryo400S}, // Kryo 4xx/5xx Silver (A55 based) {0x51, 0x803, 0, ProductNames::ARM_Kryo300S}, // Kryo 3xx Silver (A55 based) {0x41, 0xd03, 0, ProductNames::ARM_A53}, // A53 {0x51, 0x801, 0, ProductNames::ARM_Kryo200S}, // Kryo 2xx Silver (A53 based) {0x41, 0xd04, 0, ProductNames::ARM_A35}, // A35 {0x41, 0, 0, ProductNames::ARM_UNKNOWN}, // Invalid CPU or Apple CPU inside Parallels VM {0x0, 0, 0, ProductNames::ARM_UNKNOWN}, // Invalid starting point is lowest ranked }}; auto FindDefinedMIDR = [](uint32_t MIDR) -> const CPUMIDR* { uint8_t Implementer = MIDR >> 24; uint16_t Part = (MIDR >> 4) & 0xFFF; for (auto& MIDROption : CPUMIDRs) { if (MIDROption.Implementer == Implementer && MIDROption.Part == Part) { return &MIDROption; } } return nullptr; }; if (Hybrid) { // Walk the MIDRs and calculate big little designs fextl::vector BigCores; fextl::vector LittleCores; // Separate CPU cores out to big or little selected for (size_t i = 0; i < Cores; ++i) { uint32_t MIDR = PerCPUData[i].MIDR; auto MIDROption = FindDefinedMIDR(MIDR); if (MIDROption) { // Found one if (MIDROption->DefaultBig) { BigCores.emplace_back(MIDROption); } else { LittleCores.emplace_back(MIDROption); } } else { // If we didn't insert this MIDR then claim it is a little core. 
LittleCores.emplace_back(&CPUMIDRs.back()); } } if (LittleCores.empty()) { // If we only ended up with big cores then we need to move some to be little cores uint32_t LowestMIDR = ~0U; uint32_t LowestMIDRIdx = 0; // Walk all the big cores for (size_t i = 0; i < BigCores.size(); ++i) { uint8_t Implementer = BigCores[i]->Implementer; uint16_t Part = BigCores[i]->Part; // Walk our list of CPUMIDRs to find the most little core for (size_t j = LowestMIDRIdx; j < CPUMIDRs.size(); ++j) { auto& MIDROption = CPUMIDRs[i]; if ((MIDROption.Implementer == Implementer && MIDROption.Part == Part) || (MIDROption.Implementer == 0 && MIDROption.Part == 0)) { LowestMIDRIdx = j; LowestMIDR = MIDR; break; } } } // Now we WILL have found a big core to demote to little status // Demote them std::erase_if(BigCores, [&LittleCores, LowestMIDR](auto* Entry) { // Demote by erase copy to little array uint8_t Implementer = LowestMIDR >> 24; uint16_t Part = (LowestMIDR >> 4) & 0xFFF; if (Entry->Implementer == Implementer && Entry->Part == Part) { // Add it to the BigCore list LittleCores.emplace_back(Entry); return true; } return false; }); } if (BigCores.empty()) { // We never found a CPU core we understand // Grab the first core, consider it as little, move everything else to Big uint32_t LittleMIDR = PerCPUData[0].MIDR; // Now walk the little cores and move them to Big if they don't match std::erase_if(LittleCores, [&BigCores, LittleMIDR](auto* Entry) { // You're promoted now uint8_t Implementer = LittleMIDR >> 24; uint16_t Part = (LittleMIDR >> 4) & 0xFFF; if (Entry->Implementer != Implementer || Entry->Part != Part) { // Add it to the BigCore list BigCores.emplace_back(Entry); return true; } return false; }); } // Now walk the per CPU data one more time and set if it is big or little for (auto& Data : PerCPUData) { uint8_t Implementer = Data.MIDR >> 24; uint16_t Part = (Data.MIDR >> 4) & 0xFFF; bool FoundBig {}; const CPUMIDR* MIDR {}; for (auto Big : BigCores) { if (Big->Implementer == 
Implementer && Big->Part == Part) { FoundBig = true; MIDR = Big; break; } } if (!FoundBig) { for (auto Little : LittleCores) { if (Little->Implementer == Implementer && Little->Part == Part) { MIDR = Little; break; } } } Data.IsBig = FoundBig; if (MIDR) { Data.ProductName = MIDR->ProductName ?: ProductNames::ARM_UNKNOWN; } else { Data.ProductName = ProductNames::ARM_UNKNOWN; } } } else { // If we aren't hybrid then just claim everything is big for (size_t i = 0; i < Cores; ++i) { const auto MIDRIndex = HideHybrid() ? 0 : i; uint32_t MIDR = PerCPUData[MIDRIndex].MIDR; auto MIDROption = FindDefinedMIDR(MIDR); PerCPUData[i].IsBig = true; if (MIDROption) { PerCPUData[i].ProductName = MIDROption->ProductName ?: ProductNames::ARM_UNKNOWN; } else { PerCPUData[i].ProductName = ProductNames::ARM_UNKNOWN; } } } } #else uint64_t GetCycleCounterFrequency() { return 0; } void CPUIDEmu::SetupHostHybridFlag() {} #endif void CPUIDEmu::SetupFeatures() { if (CTX->HostFeatures.SupportsAVX) { XCR0 |= XCR0_AVX; } Features.SHA = CTX->HostFeatures.SupportsSHA; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_0h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // EBX, EDX, ECX become the manufacturer id string #ifdef CPUID_AMD Res.eax = 0x0D; // Let's say we are a Zen+ Res.ebx = CPUID_VENDOR_AMD1; Res.edx = CPUID_VENDOR_AMD2; Res.ecx = CPUID_VENDOR_AMD3; #else Res.eax = 0x16; // Let's say we are a Skylake Res.ebx = CPUID_VENDOR_INTEL1; Res.edx = CPUID_VENDOR_INTEL2; Res.ecx = CPUID_VENDOR_INTEL3; #endif return Res; } // Processor Info and Features bits FEXCore::CPUID::FunctionResults CPUIDEmu::Function_01h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // Hypervisor bit is normally set but some applications have issues with it. uint32_t Hypervisor = HideHypervisorBit() ? 
0 : 1; Res.eax = FAMILY_IDENTIFIER; Res.ebx = 0 | // Brand index (8 << 8) | // Cache line size in bytes (Cores << 16) | // Number of addressable IDs for the logical cores in the physical CPU (GetCPUID() << 24); // Local APIC ID Res.ecx = (1 << 0) | // SSE3 (CTX->HostFeatures.SupportsPMULL_128Bit << 1) | // PCLMULQDQ (1 << 2) | // DS area supports 64bit layout (1 << 3) | // MWait (0 << 4) | // DS-CPL (0 << 5) | // VMX (0 << 6) | // SMX (0 << 7) | // Intel SpeedStep (1 << 8) | // Thermal Monitor 2 (1 << 9) | // SSSE3 (0 << 10) | // L1 context ID (0 << 11) | // Silicon debug (SupportsAVX() << 12) | // FMA3 (1 << 13) | // CMPXCHG16B (0 << 14) | // xTPR update control (0 << 15) | // Perfmon and debug capability (0 << 16) | // Reserved (0 << 17) | // Process-context identifiers (0 << 18) | // Prefetching from memory mapped device (1 << 19) | // SSE4.1 (CTX->HostFeatures.SupportsCRC << 20) | // SSE4.2 (0 << 21) | // X2APIC (1 << 22) | // MOVBE (1 << 23) | // POPCNT (0 << 24) | // APIC TSC-Deadline (CTX->HostFeatures.SupportsAES << 25) | // AES (SupportsAVX() << 26) | // XSAVE (SupportsAVX() << 27) | // OSXSAVE (SupportsAVX() << 28) | // AVX (SupportsAVX() << 29) | // F16C (CTX->HostFeatures.SupportsRAND << 30) | // RDRAND (Hypervisor << 31); Res.edx = (1 << 0) | // FPU (1 << 1) | // Virtual 8086 mode enhancements (0 << 2) | // Debugging extensions (0 << 3) | // Page size extension (1 << 4) | // RDTSC supported (1 << 5) | // MSR supported (1 << 6) | // PAE (1 << 7) | // Machine Check exception (1 << 8) | // CMPXCHG8B (1 << 9) | // APIC on-chip (0 << 10) | // Reserved (1 << 11) | // SYSENTER/SYSEXIT (1 << 12) | // Memory Type Range registers, MTRRs are supported (1 << 13) | // Page Global bit (1 << 14) | // Machine Check architecture (1 << 15) | // CMOV (1 << 16) | // Page Attribute Table (1 << 17) | // 36bit page size extension (0 << 18) | // Processor serial number (1 << 19) | // CLFLUSH (0 << 20) | // Reserved (0 << 21) | // Debug store (0 << 22) | // Thermal monitor and 
software controled clock (1 << 23) | // MMX (1 << 24) | // FXSAVE/FXRSTOR (1 << 25) | // SSE (1 << 26) | // SSE2 (0 << 27) | // Self Snoop (0 << 28) | // (HTT) Max APIC IDs reserved field is valid (1 << 29) | // Thermal monitor (0 << 30) | // Reserved (0 << 31); // Pending break enable return Res; } // 2: Cache and TLB information FEXCore::CPUID::FunctionResults CPUIDEmu::Function_02h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // returns default values from i7 model 1Ah Res.eax = 0x1 | // Number of iterations needed for all descriptors (0x5A << 8) | (0x03 << 16) | (0x55 << 24); Res.ebx = 0xE4 | (0xB2 << 8) | (0xF0 << 16) | (0 << 24); Res.ecx = 0; // null descriptors Res.edx = 0x2C | (0x21 << 8) | (0xCA << 16) | (0x09 << 24); return Res; } // 4: Deterministic cache parameters for each level FEXCore::CPUID::FunctionResults CPUIDEmu::Function_04h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; constexpr uint32_t CacheType_Data = 1; constexpr uint32_t CacheType_Instruction = 2; constexpr uint32_t CacheType_Unified = 3; if (Leaf == 0) { // Report L1D uint32_t CoreCount = Cores - 1; Res.eax = CacheType_Data | // Cache type (0b001 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14) | // Maximum number of addressable IDs for logical processors sharing this cache (With SMT this would be 1) (CoreCount << 26); // Maximum number of addressable IDs for processor cores in the physical package Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 32KB Res.ecx = 63; // Number of sets - 1 : Claiming 64 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1) | // Cache inclusiveness - Includes lower caches (0 << 2); // Complex cache indexing - 0: Direct, 1: Complex } else if (Leaf == 1) { // Report L1I uint32_t CoreCount = Cores - 1; Res.eax = CacheType_Instruction | // Cache type (0b001 << 5) 
| // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14) | // Maximum number of addressable IDs for logical processors sharing this cache (With SMT this would be 1) (CoreCount << 26); // Maximum number of addressable IDs for processor cores in the physical package Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 32KB Res.ecx = 63; // Number of sets - 1 : Claiming 64 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1) | // Cache inclusiveness - Includes lower caches (0 << 2); // Complex cache indexing - 0: Direct, 1: Complex } else if (Leaf == 2) { // Report L2 uint32_t CoreCount = Cores - 1; Res.eax = CacheType_Unified | // Cache type (0b010 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14) | // Maximum number of addressable IDs for logical processors sharing this cache (CoreCount << 26); // Maximum number of addressable IDs for processor cores in the physical package Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 512KB Res.ecx = 0x3FF; // Number of sets - 1 : Claiming 1024 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1) | // Cache inclusiveness - Includes lower caches (0 << 2); // Complex cache indexing - 0: Direct, 1: Complex } else if (Leaf == 3) { // Report L3 uint32_t CoreCount = Cores - 1; Res.eax = CacheType_Unified | // Cache type (0b011 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (CoreCount << 14) | // Maximum number of addressable IDs for logical processors sharing this cache (CoreCount << 26); // Maximum number of addressable IDs for processor cores in the physical package Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 
22); // Associativity - 1 : Claiming 8 way // 8MB Res.ecx = 0x4000; // Number of sets - 1 : Claiming 16384 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1) | // Cache inclusiveness - Includes lower caches (1 << 2); // Complex cache indexing - 0: Direct, 1: Complex } return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_06h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; Res.eax = (1 << 2); // Always running APIC Res.ecx = (0 << 3); // Intel performance energy bias preference (EPB) return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_07h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; if (Leaf == 0) { // Disable Enhanced REP MOVS when TSO is enabled. // vcruntime140 memmove will use `rep movsb` in this case which completely destroys perf in Hades(appId 1145360) // This is due to LRCPC performance on Cortex being abysmal. // Only enable EnhancedREPMOVS if atomic memcpy tso emulation isn't enabled. const uint32_t SupportsEnhancedREPMOVS = CTX->IsMemcpyAtomicTSOEnabled() == false; const uint32_t SupportsVPCLMULQDQ = CTX->HostFeatures.SupportsPMULL_128Bit && SupportsAVX(); const uint32_t SupportsWFXT = CTX->HostFeatures.SupportsWFXT; // Number of subfunctions Res.eax = 0x0; Res.ebx = (1 << 0) | // FS/GS support (0 << 1) | // TSC adjust MSR (0 << 2) | // SGX (SupportsAVX() << 3) | // BMI1 (0 << 4) | // Intel Hardware Lock Elison (SupportsAVX() << 5) | // AVX2 support (1 << 6) | // FPU data pointer updated only on exception (1 << 7) | // SMEP support (SupportsAVX() << 8) | // BMI2 (SupportsEnhancedREPMOVS << 9) | // Enhanced REP MOVSB/STOSB (1 << 10) | // INVPCID for system software control of process-context (0 << 11) | // Restricted transactional memory (0 << 12) | // Intel resource directory technology Monitoring (1 << 13) | // Deprecates FPU CS and DS (0 << 14) | // Intel MPX (0 << 15) | // Intel Resource Directory Technology Allocation (0 << 16) | // AVX512-F (0 << 17) | // AVX512-DQ 
(CTX->HostFeatures.SupportsRAND << 18) | // RDSEED (1 << 19) | // ADCX and ADOX instructions (0 << 20) | // SMAP Supervisor mode access prevention and CLAC/STAC instructions (0 << 21) | // AVX512-IFMA (0 << 22) | // PCOMMIT (deprecated?) (1 << 23) | // CLFLUSHOPT instruction (1 << 24) | // CLWB instruction (0 << 25) | // Intel processor trace (0 << 26) | // AVX512-PF (0 << 27) | // AVX512-ER (0 << 28) | // AVX512-CD (Features.SHA << 29) | // SHA instructions (0 << 30) | // AVX512-BW (0 << 31); // AVX512-VL Res.ecx = (1 << 0) | // PREFETCHWT1 (0 << 1) | // AVX512VBMI (0 << 2) | // Usermode instruction prevention (0 << 3) | // Protection keys for user mode pages (0 << 4) | // OS protection keys (SupportsWFXT << 5) | // waitpkg (0 << 6) | // AVX512-VBMI2 (0 << 7) | // CET shadow stack (0 << 8) | // GFNI (CTX->HostFeatures.SupportsAES256 << 9) | // VAES (SupportsVPCLMULQDQ << 10) | // VPCLMULQDQ (0 << 11) | // AVX512-VNNI (0 << 12) | // AVX512-BITALG (0 << 13) | // Intel Total Memory Encryption (0 << 14) | // AVX512-VPOPCNTDQ (0 << 15) | // FZM (TDX) (0 << 16) | // 5 Level page tables (0 << 17) | // MPX MAWAU (0 << 18) | // MPX MAWAU (0 << 19) | // MPX MAWAU (0 << 20) | // MPX MAWAU (0 << 21) | // MPX MAWAU (1 << 22) | // RDPID Read Processor ID (0 << 23) | // AES Key Locker (1 << 24) | // bus-lock-detect (0 << 25) | // CLDEMOTE (0 << 26) | // MPRR (TDX) (0 << 27) | // MOVDIRI (0 << 28) | // MOVDIR64B (0 << 29) | // ENQCMD (0 << 30) | // SGX Launch configuration (0 << 31); // PKS Res.edx = (0 << 0) | // SGX-TEM (TDX) (0 << 1) | // SGX-KEYS (0 << 2) | // AVX512-4VNNIW (0 << 3) | // AVX512-4FMAPS (1 << 4) | // Fast Short Rep Mov (0 << 5) | // UINTR (0 << 6) | // Reserved (0 << 7) | // Reserved (0 << 8) | // AVX512-VP2INTERSECT (0 << 9) | // SRBDS_CTRL (Special Register Buffer Data Sampling Mitigations) (0 << 10) | // VERW clears CPU buffers (0 << 11) | // rtm-always-abort (0 << 12) | // Reserved (0 << 13) | // TSX Force Abort (TSX will force abort if attempted) (0 << 14) 
| // SERIALIZE instruction ((Hybrid ? 1U : 0U) << 15) | // Hybrid (0 << 16) | // TSXLDTRK (TSX Suspend load address tracking) - Allows untracked memory loads inside TSX region (0 << 17) | // Reserved (0 << 18) | // Intel PCONFIG (0 << 19) | // Intel Architectural LBR (0 << 20) | // Intel CET (0 << 21) | // Reserved (0 << 22) | // AMX-BF16 - Tile computation on bfloat16 (0 << 23) | // AVX512-FP16 - FP16 AVX512 instructions (0 << 24) | // AMX-tile - If AMX is implemented (0 << 25) | // AMX-int8 - AMX on 8-bit integers (0 << 26) | // IBRS_IBPB - Speculation control (0 << 27) | // STIBP - Single Thread Indirect Branch Predictor, Part of IBC (0 << 28) | // L1D Flush (0 << 29) | // Arch capabilities - Speculative side channel mitigations (0 << 30) | // Arch capabilities - MSR module specific (0 << 31); // SSBD - Speculative Store Bypass Disable } return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_0Dh(uint32_t Leaf) const { // Leaf 0 FEXCore::CPUID::FunctionResults Res {}; uint32_t XFeatureSupportedSizeMax = SupportsAVX() ? 0x0000'0340 : 0x0000'0240; // XFeatureEnabledSizeMax: Legacy Header + FPU/SSE + AVX if (Leaf == 0) { // XFeatureSupportedMask[31:0] Res.eax = (1 << 0) | // X87 support (1 << 1) | // 128-bit SSE support (SupportsAVX() << 2) | // 256-bit AVX support (0b00 << 3) | // MPX State (0b000 << 5) | // AVX-512 state (0 << 8) | // "Used for IA32_XSS" ... Used for what? 
(0 << 9); // PKRU state // EBX and ECX doesn't need to match if a feature is supported but not enabled Res.ebx = XFeatureSupportedSizeMax; Res.ecx = XFeatureSupportedSizeMax; // XFeatureSupportedSizeMax: Size in bytes of XSAVE/XRSTOR area // XFeatureSupportedMask[63:32] Res.edx = 0; // Upper 32-bits of XFeatureSupportedMask } else if (Leaf == 1) { Res.eax = (1 << 0) | // XSAVEOPT (0 << 1) | // XSAVEC (and XRSTOR) (0 << 2) | // XGETBV - XGETBV with ECX=1 supported (0 << 3); // XSAVES - XSAVES, XRSTORS, and IA32_XSS supported // Same information as Leaf 0 for ebx Res.ebx = XFeatureSupportedSizeMax; // Lower supported 32bits of IA32_XSS MSR. IA32_XSS[n] can only be set to 1 if ECX[n] is 1 Res.ecx = (0b0000'0000 << 0) | // Used for XCR0 (0 << 8) | // PT state (0 << 9); // Used for XCR0 // Upper supported 32bits of IA32_XSS MSR. IA32_XSS[n+32] can only be set to 1 if EDX[n] is 1 // Entirely reserved atm Res.edx = 0; } else if (Leaf == 2) { Res.eax = SupportsAVX() ? 0x0000'0100 : 0; // YmmSaveStateSize Res.ebx = SupportsAVX() ? 0x0000'0240 : 0; // YmmSaveStateOffset // Reserved Res.ecx = 0; Res.edx = 0; } return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_15h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // TSC frequency = ECX * EBX / EAX uint64_t FrequencyHz = GetCycleCounterFrequency(); if (FrequencyHz) { Res.eax = 1; Res.ebx = 1U << CTX->Config.TSCScale; Res.ecx = FrequencyHz; } return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_1Ah(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; if (Hybrid) { uint32_t CPU = GetCPUID(); auto& Data = PerCPUData[CPU]; // 0x40 is a big CPU // 0x20 is a little CPU Res.eax |= (Data.IsBig ? 
0x40 : 0x20) << 24; } return Res; } // Hypervisor CPUID information leaf FEXCore::CPUID::FunctionResults CPUIDEmu::Function_4000_0000h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // Maximum supported hypervisor leafs // We only expose the information leaf // // Common courtesy to follow VMWare's "Hypervisor CPUID Interface proposal" // 4000_0000h - Information leaf. Advertising to the software which hypervisor this is // 4000_0001h - 4000_000Fh - Hypervisor specific leafs. FEX can use these for anything // 4000_0010h - 4000_00FFh - "Generic Leafs" - Try not to overwrite, other hypervisors might expect information in these // // CPUID documentation information: // 4000_0000h - 4FFF_FFFFh - No existing or future CPU will return information in this range // Reserved entirely for VMs to do whatever they want. Res.eax = 0x40000001; // EBX, EDX, ECX become the hypervisor ID signature constexpr static char HypervisorID[12] = "FEXIFEXIEMU"; memcpy(&Res.ebx, HypervisorID, sizeof(HypervisorID)); return Res; } constexpr std::array::length(GIT_DESCRIBE_STRING) + 1> GitString = {GIT_DESCRIBE_STRING}; static_assert(GitString.size() < 32); // Hypervisor CPUID information leaf FEXCore::CPUID::FunctionResults CPUIDEmu::Function_4000_0001h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; constexpr uint32_t MaximumSubLeafNumber = 2; if (Leaf == 0) { // EAX[3:0] Is the host architecture that FEX is running under #ifdef ARCHITECTURE_x86_64 // EAX[3:0] = 1 = x86_64 host architecture Res.eax |= 0b0001; #elif defined(ARCHITECTURE_arm64) // EAX[3:0] = 2 = AArch64 host architecture Res.eax |= 0b0010; #else // EAX[3:0] = 0 = Unknown architecture #endif // EAX[15:4] = Reserved // EAX[31:16] = Maximum sub-leaf value. 
Res.eax |= MaximumSubLeafNumber << 16; } else if (Leaf == 1) { memcpy(&Res, GitString.data(), std::min(GitString.size(), sizeof(FEXCore::CPUID::FunctionResults))); } else if (Leaf == 2) { memcpy(&Res, GitString.data() + 16, std::min(std::max(0, GitString.size() - 16), sizeof(FEXCore::CPUID::FunctionResults))); } return Res; } // Highest extended function implemented FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0000h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; Res.eax = 0x8000001F; // EBX, EDX, ECX become the manufacturer id string // Just like cpuid function 0 #ifdef CPUID_AMD Res.ebx = CPUID_VENDOR_AMD1; Res.edx = CPUID_VENDOR_AMD2; Res.ecx = CPUID_VENDOR_AMD3; #else Res.ebx = CPUID_VENDOR_INTEL1; Res.edx = CPUID_VENDOR_INTEL2; Res.ecx = CPUID_VENDOR_INTEL3; #endif return Res; } // Extended processor and feature bits FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0001h(uint32_t Leaf) const { #ifndef _WIN32 constexpr uint32_t SUPPORTS_RDTSCP = 1; #else // RDTSCP under WIN32 is only supported if CPUIndex is available in TPIDRRO. 
const uint32_t SUPPORTS_RDTSCP = SupportsCPUIndexInTPIDRRO; #endif FEXCore::CPUID::FunctionResults Res {}; Res.eax = FAMILY_IDENTIFIER; Res.ecx = (1 << 0) | // LAHF/SAHF (1 << 1) | // 0 = Single core product, 1 = multi core product (0 << 2) | // SVM (1 << 3) | // Extended APIC register space (0 << 4) | // LOCK MOV CR0 means MOV CR8 (1 << 5) | // ABM instructions (CTX->HostFeatures.SupportsSSE4a << 6) | // SSE4a (0 << 7) | // Misaligned SSE mode (1 << 8) | // PREFETCHW (0 << 9) | // OS visible workaround support (0 << 10) | // Instruction based sampling support (0 << 11) | // XOP (0 << 12) | // SKINIT (0 << 13) | // Watchdog timer support (0 << 14) | // Reserved (0 << 15) | // Lightweight profiling support (0 << 16) | // FMA4 (1 << 17) | // Translation cache extension (0 << 18) | // Reserved (0 << 19) | // Reserved (0 << 20) | // Reserved (0 << 21) | // XOP-TBM (0 << 22) | // Topology extensions support (0 << 23) | // Core performance counter extensions (0 << 24) | // NB performance counter extensions (0 << 25) | // Reserved (0 << 26) | // Data breakpoints extensions (0 << 27) | // Performance TSC (0 << 28) | // L2 perf counter extensions (0 << 29) | // MONITORX (0 << 30) | // Reserved (0 << 31); // Reserved Res.edx = (1 << 0) | // FPU (1 << 1) | // Virtual mode extensions (1 << 2) | // Debugging extensions (1 << 3) | // Page size extensions (1 << 4) | // TSC (1 << 5) | // MSR support (1 << 6) | // PAE (1 << 7) | // Machine Check Exception (1 << 8) | // CMPXCHG8B (1 << 9) | // APIC (0 << 10) | // Reserved (1 << 11) | // SYSCALL/SYSRET (1 << 12) | // MTRR (1 << 13) | // Page global extension (1 << 14) | // Machine Check architecture (1 << 15) | // CMOV (1 << 16) | // Page attribute table (1 << 17) | // Page-size extensions (0 << 18) | // Reserved (0 << 19) | // Reserved (1 << 20) | // NX (0 << 21) | // Reserved (1 << 22) | // MMXExt (1 << 23) | // MMX (1 << 24) | // FXSAVE/FXRSTOR (1 << 25) | // FXSAVE/FXRSTOR Optimizations (0 << 26) | // 1 gigabit pages 
(SUPPORTS_RDTSCP << 27) | // RDTSCP (0 << 28) | // Reserved (1 << 29) | // Long Mode (CTX->HostFeatures.Supports3DNow << 30) | // 3DNow! Extensions (CTX->HostFeatures.Supports3DNow << 31); // 3DNow! return Res; } // Processor brand string FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0002h(uint32_t Leaf) const { return Function_8000_0002h(Leaf, GetCPUID()); } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0003h(uint32_t Leaf) const { return Function_8000_0003h(Leaf, GetCPUID()); } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0004h(uint32_t Leaf) const { return Function_8000_0004h(Leaf, GetCPUID()); } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0002h(uint32_t Leaf, uint32_t CPU) const { FEXCore::CPUID::FunctionResults Res {}; auto& Data = PerCPUData[CPU]; memcpy(&Res, Data.ProductName, std::min(strlen(Data.ProductName), sizeof(FEXCore::CPUID::FunctionResults))); return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0003h(uint32_t Leaf, uint32_t CPU) const { FEXCore::CPUID::FunctionResults Res {}; auto& Data = PerCPUData[CPU]; const auto RemainingStringSize = std::max(0, strlen(Data.ProductName) - 16); memcpy(&Res, Data.ProductName + 16, std::min(RemainingStringSize, sizeof(FEXCore::CPUID::FunctionResults))); return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0004h(uint32_t Leaf, uint32_t CPU) const { FEXCore::CPUID::FunctionResults Res {}; auto& Data = PerCPUData[CPU]; const auto RemainingStringSize = std::max(0, strlen(Data.ProductName) - 32); memcpy(&Res, Data.ProductName + 32, std::min(RemainingStringSize, sizeof(FEXCore::CPUID::FunctionResults))); return Res; } // L1 Cache and TLB identifiers FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0005h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // L1 TLB Information for 2MB and 4MB pages Res.eax = (64 << 0) | // Number of TLB instruction entries (255 << 8) | // instruction TLB associativity type (full) (64 << 16) | 
// Number of TLB data entries (255 << 24); // data TLB associativity type (full) // L1 TLB Information for 4KB pages Res.ebx = (64 << 0) | // Number of TLB instruction entries (255 << 8) | // instruction TLB associativity type (full) (64 << 16) | // Number of TLB data entries (255 << 24); // data TLB associativity type (full) // L1 data cache identifiers Res.ecx = (64 << 0) | // L1 data cache size line in bytes (1 << 8) | // L1 data cachelines per tag (8 << 16) | // L1 data cache associativity (32 << 24); // L1 data cache size in KB // L1 instruction cache identifiers Res.edx = (64 << 0) | // L1 instruction cache line size in bytes (1 << 8) | // L1 instruction cachelines per tag (4 << 16) | // L1 instruction cache associativity (64 << 24); // L1 instruction cache size in KB return Res; } // L2 Cache identifiers FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0006h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; // L2 TLB Information for 2MB and 4MB pages Res.eax = (1024 << 0) | // Number of TLB instruction entries (6 << 12) | // instruction TLB associativity type (1536 << 16) | // Number of TLB data entries (3 << 28); // data TLB associativity type // L2 TLB Information for 4KB pages Res.ebx = (1024 << 0) | // Number of TLB instruction entries (6 << 12) | // instruction TLB associativity type (1536 << 16) | // Number of TLB data entries (5 << 28); // data TLB associativity type // L2 cache identifiers Res.ecx = (64 << 0) | // cacheline size (1 << 8) | // cachelines per tag (6 << 12) | // cache associativity (512 << 16); // L2 cache size in KB // L3 cache identifiers Res.edx = (64 << 0) | // cacheline size (1 << 8) | // cachelines per tag (6 << 12) | // cache associativity (16 << 18); // L2 cache size in KB return Res; } // Advanced power management FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0007h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; Res.eax = (1 << 2); // APIC timer not affected by p-state Res.edx = (1 
<< 8); // Invariant TSC return Res; } // Virtual and physical address sizes FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0008h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; Res.eax = (48 << 0) | // PhysAddrSize = 48-bit (48 << 8) | // LinAddrSize = 48-bit (0 << 16); // GuestPhysAddrSize == PhysAddrSize Res.ebx = (0 << 2) | // XSaveErPtr: Saving and restoring error pointers (0 << 1) | // IRPerf: Instructions retired count support (CTX->HostFeatures.SupportsCLZERO << 0); // CLZERO support uint32_t CoreCount = Cores - 1; Res.ecx = (0 << 16) | // PerfTscSize: Performance timestamp count size (std::bit_ceil(Cores) << 12) | // ApicIdSize: Number of bits in ApicID (CoreCount << 0); // Count count subtract one return Res; } // TLB 1GB page identifiers FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_0019h(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; Res.eax = (0xF << 28) | // L1 DTLB associativity for 1GB pages (64 << 16) | // L1 DTLB entry count for 1GB pages (0xF << 12) | // L1 ITLB associativity for 1GB pages (64 << 0); // L1 ITLB entry count for 1GB pages Res.ebx = (0 << 28) | // L2 DTLB associativity for 1GB pages (0 << 16) | // L2 DTLB entry count for 1GB pages (0 << 12) | // L2 ITLB associativity for 1GB pages (0 << 0); // L2 ITLB entry count for 1GB pages return Res; } // Deterministic cache parameters for each level FEXCore::CPUID::FunctionResults CPUIDEmu::Function_8000_001Dh(uint32_t Leaf) const { // This is nearly a copy of CPUID function 4h // There are some minor changes though FEXCore::CPUID::FunctionResults Res {}; constexpr uint32_t CacheType_Data = 1; constexpr uint32_t CacheType_Instruction = 2; constexpr uint32_t CacheType_Unified = 3; if (Leaf == 0) { // Report L1D Res.eax = CacheType_Data | // Cache type (0b001 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14); // Maximum number of addressable IDs for logical processors sharing this cache 
(With SMT this would be 1) Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 32KB Res.ecx = 63; // Number of sets - 1 : Claiming 64 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1); // Cache inclusiveness - Includes lower caches } else if (Leaf == 1) { // Report L1I Res.eax = CacheType_Instruction | // Cache type (0b001 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14); // Maximum number of addressable IDs for logical processors sharing this cache (With SMT this would be 1) Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 32KB Res.ecx = 63; // Number of sets - 1 : Claiming 64 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1); // Cache inclusiveness - Includes lower caches } else if (Leaf == 2) { // Report L2 Res.eax = CacheType_Unified | // Cache type (0b010 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (0 << 14); // Maximum number of addressable IDs for logical processors sharing this cache Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // Associativity - 1 : Claiming 8 way // 512KB Res.ecx = 0x3FF; // Number of sets - 1 : Claiming 1024 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1); // Cache inclusiveness - Includes lower caches } else if (Leaf == 3) { // Report L3 uint32_t CoreCount = Cores - 1; Res.eax = CacheType_Unified | // Cache type (0b011 << 5) | // Cache level (1 << 8) | // Self initializing cache level (0 << 9) | // Fully associative (CoreCount << 14); // Maximum number of addressable IDs for logical processors sharing this cache Res.ebx = (63 << 0) | // Line Size - 1 : Claiming 64 byte (0 << 12) | // Physical Line partitions (7 << 22); // 
Associativity - 1 : Claiming 8 way // 8MB Res.ecx = 0x4000; // Number of sets - 1 : Claiming 16384 sets Res.edx = (0 << 0) | // Write-back invalidate (0 << 1); // Cache inclusiveness - Includes lower caches } return Res; } FEXCore::CPUID::FunctionResults CPUIDEmu::Function_Reserved(uint32_t Leaf) const { FEXCore::CPUID::FunctionResults Res {}; return Res; } FEXCore::CPUID::XCRResults CPUIDEmu::XCRFunction_0h() const { // This just returns XCR0 FEXCore::CPUID::XCRResults Res { .eax = static_cast(XCR0), .edx = static_cast(XCR0 >> 32), }; return Res; } CPUIDEmu::CPUIDEmu(const FEXCore::Context::ContextImpl* ctx) : CTX {ctx} , SupportsCPUIndexInTPIDRRO {CTX->HostFeatures.SupportsCPUIndexInTPIDRRO} , GetCPUID {GetCPUID_Syscall} { Cores = CTX->HostFeatures.CPUMIDRs.size(); // Setup some state tracking SetupHostHybridFlag(); SetupFeatures(); #ifdef ARCHITECTURE_arm64 if (SupportsCPUIndexInTPIDRRO) { GetCPUID = GetCPUID_TPIDRRO; } #endif } } // namespace FEXCore ================================================ FILE: FEXCore/Source/Interface/Core/CPUID.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include namespace FEXCore { namespace Context { class ContextImpl; } uint64_t GetCycleCounterFrequency(); // Debugging define to switch what family of CPU we execute as. // Might be useful if an application makes an assumption about a CPU. 
// #define CPUID_AMD class CPUIDEmu final { private: constexpr static uint32_t CPUID_VENDOR_INTEL1 = 0x756E6547; // "Genu" constexpr static uint32_t CPUID_VENDOR_INTEL2 = 0x49656E69; // "ineI" constexpr static uint32_t CPUID_VENDOR_INTEL3 = 0x6C65746E; // "ntel" constexpr static uint32_t CPUID_VENDOR_AMD1 = 0x68747541; // "Auth" constexpr static uint32_t CPUID_VENDOR_AMD2 = 0x69746E65; // "enti" constexpr static uint32_t CPUID_VENDOR_AMD3 = 0x444D4163; // "cAMD" public: CPUIDEmu(const FEXCore::Context::ContextImpl* ctx); // X86 cacheline size effectively has to be hardcoded to 64 // if we report anything differently then applications are likely to break constexpr static uint64_t CACHELINE_SIZE = 64; FEXCore::CPUID::FunctionResults RunFunction(uint32_t Function, uint32_t Leaf) const { if (Function < Primary.size()) { const auto Handler = Primary[Function]; return (this->*Handler)(Leaf); } constexpr uint32_t HypervisorBase = 0x4000'0000; if (Function >= HypervisorBase && Function < (HypervisorBase + Hypervisor.size())) { const auto Handler = Hypervisor[Function - HypervisorBase]; return (this->*Handler)(Leaf); } constexpr uint32_t ExtendedBase = 0x8000'0000; if (Function >= ExtendedBase && Function < (ExtendedBase + Extended.size())) { const auto Handler = Extended[Function - ExtendedBase]; return (this->*Handler)(Leaf); } return Function_Reserved(Leaf); } FEXCore::CPUID::FunctionResults RunFunctionName(uint32_t Function, uint32_t Leaf, uint32_t CPU) const { if (Function == 0x8000'0002U) { return Function_8000_0002h(Leaf, CPU % PerCPUData.size()); } else if (Function == 0x8000'0003U) { return Function_8000_0003h(Leaf, CPU % PerCPUData.size()); } else { return Function_8000_0004h(Leaf, CPU % PerCPUData.size()); } } FEXCore::CPUID::XCRResults RunXCRFunction(uint32_t Function) const { if (Function >= 1) { // XCR function 1 is not yet supported. 
return {}; } return XCRFunction_0h(); } bool DoesXCRFunctionReportConstantData(uint32_t Function) const { // Every function currently returns constant data. return true; } enum class SupportsConstant { CONSTANT, NONCONSTANT, }; enum class NeedsLeafConstant { NEEDSLEAFCONSTANT, NOLEAFCONSTANT, }; struct FunctionConstant { SupportsConstant SupportsConstantFunction; NeedsLeafConstant NeedsLeaf; }; static constexpr FunctionConstant DoesFunctionReportConstantData(uint32_t Function) { if (Function < Primary.size()) { return Primary_Constant[Function]; } constexpr uint32_t HypervisorBase = 0x4000'0000; if (Function >= HypervisorBase && Function < (HypervisorBase + Hypervisor.size())) { return Hypervisor_Constant[Function - HypervisorBase]; } constexpr uint32_t ExtendedBase = 0x8000'0000; if (Function >= ExtendedBase && Function < (ExtendedBase + Extended.size())) { return Extended_Constant[Function - ExtendedBase]; } // Anything unsupported is known constant return of reserved data. return {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}; } private: const FEXCore::Context::ContextImpl* CTX; [[maybe_unused]] bool SupportsCPUIndexInTPIDRRO {}; bool Hybrid {}; uint32_t Cores {}; FEX_CONFIG_OPT(HideHypervisorBit, HIDEHYPERVISORBIT); // XFEATURE_ENABLED_MASK // Mask that configures what features are enabled on the CPU. // Affects XSAVE and XRSTOR when modified. // Bit layout is as follows. // [0] - x87 enabled // [1] - SSE enabled // [2] - YMM enabled (256-bit SSE) // [8:3] - Reserved. MBZ. // [9] - MPK // [10] - Reserved. MBZ. // [11] - CET_U // [12] - CET_S // [61:13] - Reserved. MBZ. // [62] - LWP (Lightweight profiling) // [63] - Reserved for XCR bit vector expansion. MBZ. // Always enable x87 and SSE by default. 
constexpr static uint64_t XCR0_X87 = 1ULL << 0; constexpr static uint64_t XCR0_SSE = 1ULL << 1; constexpr static uint64_t XCR0_AVX = 1ULL << 2; struct FeaturesConfig { uint64_t SHA : 1; uint64_t _pad : 63; }; FeaturesConfig Features { .SHA = 1, }; uint64_t XCR0 {XCR0_X87 | XCR0_SSE}; uint32_t SupportsAVX() const { return (XCR0 & XCR0_AVX) ? 1 : 0; } using FunctionHandler = FEXCore::CPUID::FunctionResults (CPUIDEmu::*)(uint32_t Leaf) const; struct CPUData { const char* ProductName {}; #ifdef ARCHITECTURE_arm64 uint32_t MIDR {}; #endif bool IsBig {}; }; fextl::vector PerCPUData {}; // Functions FEXCore::CPUID::FunctionResults Function_0h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_01h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_02h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_04h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_06h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_07h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_0Dh(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_15h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_1Ah(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_4000_0000h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_4000_0001h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0000h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0001h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0002h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0003h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0004h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0002h(uint32_t Leaf, uint32_t CPU) const; FEXCore::CPUID::FunctionResults Function_8000_0003h(uint32_t Leaf, uint32_t CPU) const; FEXCore::CPUID::FunctionResults Function_8000_0004h(uint32_t Leaf, uint32_t CPU) const; 
FEXCore::CPUID::FunctionResults Function_8000_0005h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0006h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0007h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0008h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_0019h(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_8000_001Dh(uint32_t Leaf) const; FEXCore::CPUID::FunctionResults Function_Reserved(uint32_t Leaf) const; FEXCore::CPUID::XCRResults XCRFunction_0h() const; void SetupHostHybridFlag(); void SetupFeatures(); static constexpr size_t PRIMARY_FUNCTION_COUNT = 27; static constexpr size_t HYPERVISOR_FUNCTION_COUNT = 2; static constexpr size_t EXTENDED_FUNCTION_COUNT = 32; static constexpr std::array Primary = { // 0: Highest function parameter and ID &CPUIDEmu::Function_0h, // 1: Processor info &CPUIDEmu::Function_01h, // 2: Cache and TLB info &CPUIDEmu::Function_02h, // 3: Serial Number(previously), now reserved &CPUIDEmu::Function_Reserved, #ifndef CPUID_AMD // 4: Deterministic cache parameters for each level &CPUIDEmu::Function_04h, #else &CPUIDEmu::Function_Reserved, #endif // 5: Monitor/mwait &CPUIDEmu::Function_Reserved, // 6: Thermal and power management &CPUIDEmu::Function_06h, // 7: Extended feature flags &CPUIDEmu::Function_07h, // 0x08: Reserved? &CPUIDEmu::Function_Reserved, // 9: Direct Cache Access information &CPUIDEmu::Function_Reserved, // 0x0A: Architectural performance monitoring &CPUIDEmu::Function_Reserved, // 0x0B: Extended topology enumeration &CPUIDEmu::Function_Reserved, // 0x0C: Reserved? &CPUIDEmu::Function_Reserved, // 0x0D: Processor extended state enumeration &CPUIDEmu::Function_0Dh, // 0x0E: Reserved? &CPUIDEmu::Function_Reserved, // 0x0F: Intel RDT monitoring &CPUIDEmu::Function_Reserved, // 0x10: Intel RDT allocation enumeration &CPUIDEmu::Function_Reserved, // 0x12: Reserved? 
&CPUIDEmu::Function_Reserved, // 0x12: Intel SGX capability enumeration &CPUIDEmu::Function_Reserved, // 0x13: Reserved &CPUIDEmu::Function_Reserved, // 0x14: Intel Processor trace &CPUIDEmu::Function_Reserved, #ifndef CPUID_AMD // Timestamp counter information // Doesn't exist on AMD hardware &CPUIDEmu::Function_15h, #else &CPUIDEmu::Function_Reserved, #endif // 0x16: Processor frequency information &CPUIDEmu::Function_Reserved, // 0x17: SoC vendor attribute enumeration &CPUIDEmu::Function_Reserved, // 0x18: Reserved? &CPUIDEmu::Function_Reserved, // 0x19: Reserved? &CPUIDEmu::Function_Reserved, #ifndef CPUID_AMD // 0x1A: Hybrid Information Sub-leaf &CPUIDEmu::Function_1Ah, #else &CPUIDEmu::Function_Reserved, #endif }; static constexpr std::array Primary_Constant = {{ // 0: Highest function parameter and ID {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 1: Processor info {SupportsConstant::NONCONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 2: Cache and TLB info {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 3: Serial Number(previously), now reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #ifndef CPUID_AMD // 4: Deterministic cache parameters for each level {SupportsConstant::CONSTANT, NeedsLeafConstant::NEEDSLEAFCONSTANT}, #else // 4: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #endif // 5: Monitor/mwait {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 6: Thermal and power management {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 7: Extended feature flags {SupportsConstant::CONSTANT, NeedsLeafConstant::NEEDSLEAFCONSTANT}, // 0x08: Reserved? 
{SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 9: Direct Cache Access information {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x0A: Architectural performance monitoring {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x0B: Extended topology enumeration {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x0C: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x0D: Processor extended state enumeration {SupportsConstant::CONSTANT, NeedsLeafConstant::NEEDSLEAFCONSTANT}, // 0x0E: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x0F: Intel RDT monitoring {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x10: Intel RDT allocation enumeration {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x12: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x12: Intel SGX capability enumeration {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x13: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x14: Intel Processor trace {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #ifndef CPUID_AMD // 0x15: Timestamp counter information // Doesn't exist on AMD hardware {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #else // 0x15: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #endif // 0x16: Processor frequency information {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x17: SoC vendor attribute enumeration {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x18: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x19: Reserved? 
{SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #ifndef CPUID_AMD // 0x1A: Hybrid Information Sub-leaf {SupportsConstant::NONCONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #else // 0x1A: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #endif }}; static constexpr std::array Hypervisor = { // Hypervisor CPUID information leaf &CPUIDEmu::Function_4000_0000h, // FEX-Emu specific leaf &CPUIDEmu::Function_4000_0001h, }; static constexpr std::array Hypervisor_Constant = {{ // Hypervisor CPUID information leaf {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // FEX-Emu specific leaf {SupportsConstant::CONSTANT, NeedsLeafConstant::NEEDSLEAFCONSTANT}, }}; static constexpr std::array Extended = { // Largest extended function number &CPUIDEmu::Function_8000_0000h, // Processor vendor &CPUIDEmu::Function_8000_0001h, // Processor brand string &CPUIDEmu::Function_8000_0002h, // Processor brand string continued &CPUIDEmu::Function_8000_0003h, // Processor brand string continued &CPUIDEmu::Function_8000_0004h, #ifdef CPUID_AMD // 0x8000'0005: L1 Cache and TLB identifiers &CPUIDEmu::Function_8000_0005h, #else &CPUIDEmu::Function_Reserved, #endif // 0x8000'0006: L2 Cache identifiers &CPUIDEmu::Function_8000_0006h, // 0x8000'0007: Advanced power management information &CPUIDEmu::Function_8000_0007h, // 0x8000'0008: Virtual and physical address sizes &CPUIDEmu::Function_8000_0008h, // 0x8000'0009: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'000A: SVM Revision &CPUIDEmu::Function_Reserved, // 0x8000'000B: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'000C: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'000D: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'000E: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'000F: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0010: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0011: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0012: Reserved? 
&CPUIDEmu::Function_Reserved, // 0x8000'0013: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0014: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0015: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0016: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0017: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0018: Reserved? &CPUIDEmu::Function_Reserved, // 0x8000'0019: TLB 1GB page identifiers &CPUIDEmu::Function_8000_0019h, // 0x8000'001A: Performance optimization identifiers &CPUIDEmu::Function_Reserved, // 0x8000'001B: Instruction based sampling identifiers &CPUIDEmu::Function_Reserved, // 0x8000'001C: Lightweight profiling capabilities &CPUIDEmu::Function_Reserved, #ifdef CPUID_AMD // 0x8000'001D: Cache properties &CPUIDEmu::Function_8000_001Dh, #else &CPUIDEmu::Function_Reserved, #endif // 0x8000'001E: Extended APIC ID &CPUIDEmu::Function_Reserved, // 0x8000'001F: AMD Secure Encryption &CPUIDEmu::Function_Reserved, }; static constexpr std::array Extended_Constant = {{ // Largest extended function number {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // Processor vendor {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // Processor brand string {SupportsConstant::NONCONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // Processor brand string continued {SupportsConstant::NONCONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // Processor brand string continued {SupportsConstant::NONCONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #ifdef CPUID_AMD // 0x8000'0005: L1 Cache and TLB identifiers {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #else // 0x8000'0005: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #endif // 0x8000'0006: L2 Cache identifiers {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0007: Advanced power management information {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0008: Virtual and physical address sizes 
{SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0009: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000A: SVM Revision {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000B: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000C: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000D: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000E: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'000F: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0010: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0011: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0012: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0013: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0014: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0015: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0016: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0017: Reserved? {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0018: Reserved? 
{SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'0019: TLB 1GB page identifiers {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'001A: Performance optimization identifiers {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'001B: Instruction based sampling identifiers {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'001C: Lightweight profiling capabilities {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #ifdef CPUID_AMD // 0x8000'001D: Cache properties {SupportsConstant::CONSTANT, NeedsLeafConstant::NEEDSLEAFCONSTANT}, #else // 0x8000'001D: Reserved {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, #endif // 0x8000'001E: Extended APIC ID {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, // 0x8000'001F: AMD Secure Encryption {SupportsConstant::CONSTANT, NeedsLeafConstant::NOLEAFCONSTANT}, }}; using GetCPUIDPtr = uint32_t (*)(); GetCPUIDPtr GetCPUID; }; } // namespace FEXCore ================================================ FILE: FEXCore/Source/Interface/Core/CodeCache.cpp ================================================ // SPDX-License-Identifier: MIT #include "Utils/SpinWaitLock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore { #if __clang_major__ < 16 ExecutableFileInfo::ExecutableFileInfo(fextl::unique_ptr Map, uint64_t FileId, fextl::string Filename) : SourcecodeMap(std::move(Map)) , FileId(FileId) , Filename(Filename) {} #endif ExecutableFileInfo::~ExecutableFileInfo() = default; fextl::string CodeMap::GetBaseFilename(const ExecutableFileInfo& MainExecutable, bool AddNombSuffix) { auto FileId = MainExecutable.FileId; std::string_view base_filename = FHU::Filesystem::GetFilename(std::string_view {MainExecutable.Filename}); if (FileId != 0xffff'ffff'ffff'ffff) { return fextl::fmt::format("{}-{:016x}{}", 
base_filename, MainExecutable.FileId, AddNombSuffix ? "-nomb" : ""); } return ""; } fextl::map CodeMap::ParseCodeMap(std::ifstream& File) { fextl::map Ret; while (true) { Entry Entry; File.read(reinterpret_cast(&Entry), sizeof(Entry)); if (!File) { break; } if (Entry.FileId == LoadExternalLibrary.FileId && Entry.BlockOffset == LoadExternalLibrary.BlockOffset) { ExternalLibraryInfo Info; File.read(reinterpret_cast(&Info), sizeof(Info)); fextl::string Filename; std::getline(File, Filename, '\0'); // Align to 4-byte boundary char Null[4]; File.read(Null, AlignUp(Filename.size() + 1, 4) - Filename.size() - 1); if (!File) { break; } Ret[Info.ExternalFileId].Filename = std::move(Filename); } else if (Entry.FileId == SetExecutableFileId {}.Marker.FileId && Entry.BlockOffset == SetExecutableFileId {}.Marker.BlockOffset) { CodeMapFileId ExecutableFileId; File.read(reinterpret_cast(&ExecutableFileId), sizeof(ExecutableFileId)); if (!File) { break; } Ret[ExecutableFileId].IsExecutable = true; } else { if (!Ret.contains(Entry.FileId)) { LogMan::Msg::EFmt("Code map referenced unknown file id {:016x}", Entry.FileId); } else { Ret[Entry.FileId].Blocks.insert(Entry.BlockOffset); } } if (!File) { break; } } return Ret; } CodeMapWriter::CodeMapWriter(CodeMapOpener& Opener, bool OpenEagerly) : Buffer(4096) , FileOpener(Opener) { if (OpenEagerly) { CodeMapFD = FileOpener.OpenCodeMapFile(); } } CodeMapWriter::~CodeMapWriter() { if (CodeMapFD.value_or(-1) != -1) { Flush(BufferOffset); close(*CodeMapFD); } } bool CodeMapWriter::IsWriteEnabled(const ExecutableFileSectionInfo& Section) { if (CodeMapFD == -1) { return false; } // PV libraries can't yet be read by FEXServer, so skip dumping them if (Section.FileInfo.Filename.starts_with("/run/pressure-vessel")) { return false; } if (CodeMapFD) { return true; } // Acquire mutex and re-check CodeMapFD to avoid race conditions auto lk = std::unique_lock {Mutex}; if (!CodeMapFD) { CodeMapFD = FileOpener.OpenCodeMapFile(); } return CodeMapFD != 
-1; } void CodeMapWriter::Flush(size_t Offset) { // Acquire exclusive lock and flush circular buffer std::unique_lock Lock {Mutex}; Flush(Offset, Lock); } void CodeMapWriter::Flush(size_t Offset, std::unique_lock&) { write(*CodeMapFD, Buffer.data(), Offset); BufferOffset = 0; } void CodeMapWriter::AppendBlock(const FEXCore::ExecutableFileSectionInfo& SectionInfo, uint64_t BlockEntry) { if (!IsWriteEnabled(SectionInfo)) { return; } BlockEntry -= SectionInfo.FileStartVA; if (BlockEntry > std::numeric_limits::max()) { ERROR_AND_DIE_FMT("Cannot write code map"); } // Register new library if not already known bool NewLibraryLoad = false; { // Check prior registration with shared lock std::shared_lock Lock {Mutex}; NewLibraryLoad = !KnownFileIds.contains(SectionInfo.FileInfo.FileId); } if (NewLibraryLoad) { // Register to map with exclusive lock std::unique_lock Lock {Mutex}; NewLibraryLoad &= KnownFileIds.insert(SectionInfo.FileInfo.FileId).second; } if (NewLibraryLoad) { // Add entry to code map AppendLibraryLoad(SectionInfo.FileInfo); } // Register the actual code block CodeMap::Entry DataEntry {SectionInfo.FileInfo.FileId, static_cast(BlockEntry)}; AppendData(std::as_bytes(std::span {&DataEntry, 1})); } void CodeMapWriter::AppendLibraryLoad(const FEXCore::ExecutableFileInfo& FileInfo) { // See CodeMap::ExternalLibraryInfo auto ExternalFileId = FileInfo.FileId; auto TotalSize = AlignUp(sizeof(CodeMap::LoadExternalLibrary) + sizeof(ExternalFileId) + FileInfo.Filename.size() + 1, 4); const auto Data = reinterpret_cast(alloca(TotalSize)); auto WritePtr = std::copy_n(reinterpret_cast(&CodeMap::LoadExternalLibrary), sizeof(CodeMap::LoadExternalLibrary), Data); WritePtr = std::copy_n(reinterpret_cast(&ExternalFileId), sizeof(ExternalFileId), WritePtr); WritePtr = std::copy(FileInfo.Filename.begin(), FileInfo.Filename.end(), WritePtr); std::fill(WritePtr, Data + TotalSize, 0); AppendData(std::as_bytes(std::span {Data, TotalSize})); } void 
CodeMapWriter::AppendSetMainExecutable(const FEXCore::ExecutableFileInfo& FileInfo) { CodeMap::SetExecutableFileId Data {.ExecutableFileId = FileInfo.FileId}; AppendData(std::span {reinterpret_cast(&Data), sizeof(Data)}); } void CodeMapWriter::AppendData(std::span Data) { std::shared_lock Lock {Mutex}; auto Offset = BufferOffset.fetch_add(Data.size_bytes()); if (Offset + Data.size_bytes() > Buffer.size()) { // Acquire exclusive lock and flush the buffer. // Under heavy pressure, multiple threads may observe an exhausted buffer simultaneously. // The thread with the last in-bounds Offset is responsible for flushing the buffer. Lock.unlock(); bool IsResponsibleForFlush = false; { std::unique_lock ExclusiveLock {Mutex}; IsResponsibleForFlush = (Offset <= Buffer.size()); if (IsResponsibleForFlush) { Flush(Offset, ExclusiveLock); } } if (!IsResponsibleForFlush) { // Wait for the buffer to be flushed on the responsible thread Utils::SpinWaitLock::WaitPred, size_t>(reinterpret_cast(&BufferOffset), Buffer.size()); } AppendData(Data); return; } memcpy(&Buffer.at(Offset), Data.data(), Data.size_bytes()); } } // namespace FEXCore namespace FEXCore::Context { CodeCache::CodeCache(ContextImpl& CTX_) : CTX(CTX_) {} CodeCache::~CodeCache() = default; uint64_t CodeCache::ComputeCodeMapId(std::string_view Filename, int FD) { if (Filename.empty()) { return 0xffff'ffff'ffff'ffff; } // For now, we just use the file path as an identifier. 
// TODO: Ensure the hash is unique enough to distinguish executables while remaining independent of the installation location
  return XXH3_64bits(Filename.data(), Filename.size());
}

// Header at the start of a serialized code cache file. SaveData writes it
// with a single ::write of sizeof(header) and LoadData reads it back with
// memcpy, so the in-memory layout of this struct is the file layout.
struct CodeCacheHeader {
  std::array Magic = ExpectedMagic;
  uint32_t FormatVersion = 1;
  // Filled from GIT_HASH on save and compared against GIT_HASH on load.
  uint8_t FEXVersion[20] = {};

  uint32_t NumBlocks;
  uint32_t NumCodePages;
  uint32_t CodeBufferSize;
  uint32_t NumRelocations;
  uint32_t padding;

  uint64_t SerializedBaseAddress;

  // TODO: Consider including information from LookupCache.BlockLinks

  static constexpr std::array ExpectedMagic = {'F', 'X', 'C', 'C'};
};

// Satisfied by container types that expose a key_compare member type. Used in
// static_asserts below to state whether a container is already ordered.
template
concept OrderedContainer = requires { typename T::key_compare; };

// Serializes the current code buffer and its lookup data to `fd`.
// Sections are written in this order: header, guest<->host block mappings
// (sorted by guest address), relocations, zero padding up to the next
// FEX_PAGE_SIZE boundary, host code, and finally the code page table.
// Guest addresses are stored relative to SourceBinary.FileStartVA and host
// code addresses relative to the code buffer base.
// Returns false only if applying the relocations for storage fails.
// NOTE(review): none of the ::write / lseek results are checked.
bool CodeCache::SaveData(Core::InternalThreadState& Thread, int fd, const ExecutableFileSectionInfo& SourceBinary, uint64_t SerializedBaseAddress) {
  auto CodeBuffer = CTX.GetLatest();

  auto& LookupCache = *Thread.LookupCache->Shared;

  auto Relocations = Thread.CPUBackend->TakeRelocations(SourceBinary.FileStartVA);

  // Write file header
  CodeCacheHeader header {};
  static_assert(GIT_HASH.size() == sizeof(header.FEXVersion));
  std::ranges::copy(GIT_HASH, header.FEXVersion);
  header.NumBlocks = LookupCache.BlockList.size();
  header.NumCodePages = LookupCache.CodePages.size();
  header.CodeBufferSize = CTX.LatestOffset;
  header.NumRelocations = Relocations.size();
  header.SerializedBaseAddress = SerializedBaseAddress;
  ::write(fd, &header, sizeof(header));

  // Dump guest<->host block mappings
  {
    // Cache contents must be deterministic, so copy the unordered block list and then sort by key
    static_assert(!OrderedContainer, "Already deterministic; drop temporary container");
    fextl::vector> BlockList;
    BlockList.reserve(LookupCache.BlockList.size());
    for (auto& [Guest, BlockEntry] : LookupCache.BlockList) {
      static_assert(sizeof(Guest) == 8, "Breaking change in code cache data layout");
      BlockList.emplace_back(Guest, &BlockEntry);
    }
    std::ranges::sort(BlockList);
    // Per block: guest offset, host code offset, page count, then the pages.
    for (auto [Guest, Host] : BlockList) {
      static_assert(sizeof(Host->HostCode) == 8, "Breaking change in code cache data layout");
      static_assert(sizeof(Host->CodePages[0]) == 8, "Breaking change in code cache data layout");

      Guest -= SourceBinary.FileStartVA;
      ::write(fd, &Guest, sizeof(Guest));
      uint64_t HostCode = Host->HostCode - reinterpret_cast(CodeBuffer->Ptr);
      ::write(fd, &HostCode, sizeof(HostCode));
      uint64_t NumCodePages = Host->CodePages.size();
      ::write(fd, &NumCodePages, sizeof(NumCodePages));
      LOGMAN_THROW_A_FMT(std::ranges::is_sorted(Host->CodePages), "Code pages aren't sorted");
      for (auto CodePage : Host->CodePages) {
        CodePage -= SourceBinary.FileStartVA;
        ::write(fd, &CodePage, sizeof(CodePage));
      }
    }
  }

  // Dump relocations
  static_assert(sizeof(Relocations[0]) == 48, "Breaking change in code cache data layout");
  ::write(fd, Relocations.data(), Relocations.size() * sizeof(Relocations[0]));

  // Pad to next page in file so that the CodeBuffer can be mmap'ed into process on load
  // Zeros are written in chunks of at most sizeof(Zero) bytes.
  char Zero[64] {};
  auto Off = lseek(fd, 0, SEEK_CUR);
  while (Off != AlignUp(Off, Utils::FEX_PAGE_SIZE)) {
    auto BytesToWrite = std::min(AlignUp(Off, Utils::FEX_PAGE_SIZE) - Off, sizeof(Zero));
    ::write(fd, Zero, BytesToWrite);
    Off += BytesToWrite;
  }

  // Dump the host code (relocated for position-independent serialization)
  // The span aliases the code buffer itself, so the relocations are applied
  // in place before the bytes are written out.
  std::span CodeBufferData(reinterpret_cast(CodeBuffer->Ptr), reinterpret_cast(CodeBuffer->Ptr) + CTX.LatestOffset);
  if (!ApplyCodeRelocations(SerializedBaseAddress, CodeBufferData, Relocations, true)) {
    LOGMAN_THROW_A_FMT(false, "Failed to apply code relocations");
    return false;
  }
  ::write(fd, CodeBufferData.data(), CodeBufferData.size());

  // Dump code pages
  static_assert(OrderedContainer, "Non-deterministic data source");
  // Per page: page offset, entrypoint count, then the entrypoint offsets.
  for (const auto& [PageIndex, Entrypoints] : LookupCache.CodePages) {
    uint64_t PageAddr = (PageIndex << 12) - SourceBinary.FileStartVA;
    ::write(fd, &PageAddr, sizeof(PageAddr));
    uint64_t NumEntrypoints = Entrypoints.size();
    ::write(fd, &NumEntrypoints, sizeof(NumEntrypoints));
    for (uint64_t Entrypoint : Entrypoints) {
      Entrypoint -=
SourceBinary.FileStartVA;
      ::write(fd, &Entrypoint, sizeof(Entrypoint));
    }
  }

  return true;
}

// Loads a serialized code cache from MappedCacheFile into the current code
// buffer and registers the blocks that fall inside BinarySection.
// The file is consumed in the order SaveData wrote it: header, block
// mappings, relocations, page padding, host code, code page table.
// Returns true immediately when code caching is disabled, and also when no
// cached block lies inside the requested section. Returns false for a bad
// magic, a FEX version mismatch or an empty cache.
// Thread may be null; in that case the code buffer cannot be grown and the
// process is aborted if the cached code does not fit.
bool CodeCache::LoadData(Core::InternalThreadState* Thread, std::byte* MappedCacheFile, const ExecutableFileSectionInfo& BinarySection) {
  if (!EnableCodeCaching) {
    return true;
  }

  namespace ranges = std::ranges;

  // Read file header
  CodeCacheHeader header {};
  ::memcpy(&header, MappedCacheFile, sizeof(header));
  MappedCacheFile += sizeof(header);

  // Logged before the header is validated, so the values may be garbage for
  // an invalid file.
  LogMan::Msg::IFmt("Cache load: {:5} blocks; base={:#14x}; off={:#9x}-{:#09x}; {:016x} {}", header.NumBlocks, BinarySection.FileStartVA,
                    BinarySection.BeginVA - BinarySection.FileStartVA, BinarySection.EndVA - BinarySection.FileStartVA,
                    BinarySection.FileInfo.FileId, BinarySection.FileInfo.Filename);

  if (!ranges::equal(header.Magic, header.ExpectedMagic)) {
    LogMan::Msg::EFmt("Invalid cache file header");
    return false;
  }

  if (!ranges::equal(header.FEXVersion, GIT_HASH)) {
    LogMan::Msg::IFmt("Cache generated from old FEX version {:02x}, current is {:02x}; skipping", fmt::join(header.FEXVersion, ""),
                      fmt::join(GIT_HASH, ""));
    return false;
  }

  if (header.NumBlocks == 0) {
    // Valid caches are never empty
    LogMan::Msg::IFmt("Code cache empty, aborting");
    return false;
  }

  // Read guest<->host block mappings
  using BlockListEntry = decltype(GuestToHostMap::BlockList)::value_type;
  fextl::vector BlockList(header.NumBlocks);
  {
    // Each entry is: guest offset, host code offset, page count, pages.
    for (auto& BlockPtr : BlockList) {
      ::memcpy(&BlockPtr.first, MappedCacheFile, sizeof(BlockPtr.first));
      MappedCacheFile += sizeof(BlockPtr.first);
      ::memcpy(&BlockPtr.second.HostCode, MappedCacheFile, sizeof(BlockPtr.second.HostCode));
      MappedCacheFile += sizeof(BlockPtr.second.HostCode);

      uint64_t NumGuestPages;
      ::memcpy(&NumGuestPages, MappedCacheFile, sizeof(NumGuestPages));
      MappedCacheFile += sizeof(NumGuestPages);
      BlockPtr.second.CodePages.resize(NumGuestPages);
      ::memcpy(BlockPtr.second.CodePages.data(), MappedCacheFile, std::span {BlockPtr.second.CodePages}.size_bytes());
      MappedCacheFile += std::span {BlockPtr.second.CodePages}.size_bytes();
    }

    // Consistency check: VMA regions at the top and end should belong to the same file
    auto [min_val, max_val] = ranges::minmax_element(BlockList, std::less {}, &decltype(BlockList)::value_type::first);
    auto MinBound = CTX.SyscallHandler->LookupExecutableFileSection(Thread, min_val->first + BinarySection.FileStartVA);
    auto MaxBound = CTX.SyscallHandler->LookupExecutableFileSection(Thread, max_val->first + BinarySection.FileStartVA);
    // NOTE(review): MinBound/MaxBound are dereferenced without a check;
    // confirm the lookup cannot return an empty result here.
    if (&MinBound->FileInfo != &BinarySection.FileInfo || &MaxBound->FileInfo != &BinarySection.FileInfo) {
      ERROR_AND_DIE_FMT("Cached blocks offsets {:#x}-{:#x} out of bounds for guest library {} ({:016x} @ {:#x}) while trying to load "
                        "section {:#x}-{:#x}!",
                        min_val->first, max_val->first, BinarySection.FileInfo.Filename, BinarySection.FileInfo.FileId,
                        BinarySection.FileStartVA, BinarySection.BeginVA, BinarySection.EndVA);
    }

    // Constrain BlockList to the given ExecutableFileSectionInfo
    // The binary searches below rely on the list being sorted by guest offset.
    LOGMAN_THROW_A_FMT(ranges::is_sorted(BlockList, [](auto& a, auto& b) { return a.first < b.first; }), "Expected sorted block list");
    auto begin = ranges::lower_bound(BlockList, BinarySection.BeginVA - BinarySection.FileStartVA, std::less {}, &BlockListEntry::first);
    auto end = ranges::upper_bound(begin, BlockList.end(), BinarySection.EndVA - BinarySection.FileStartVA - 1, std::less {},
                                   &BlockListEntry::first);
    // Erase the tail first so that `begin` is still valid for the second erase.
    BlockList.erase(end, BlockList.end());
    BlockList.erase(BlockList.begin(), begin);
    if (BlockList.empty()) {
      // Not an error since there is just no data to load
      LogMan::Msg::IFmt("No blocks cached in this range, aborting");
      return true;
    }
  }

  // Read relocations
  fextl::vector Relocations(header.NumRelocations, FEXCore::CPU::Relocation::Default());
  ::memcpy(Relocations.data(), MappedCacheFile, Relocations.size() * sizeof(Relocations[0]));
  MappedCacheFile += Relocations.size() * sizeof(Relocations[0]);

  // Pad to next page in file, which contains CodeBuffer data
  MappedCacheFile = reinterpret_cast(AlignUp(reinterpret_cast(MappedCacheFile), Utils::FEX_PAGE_SIZE));

  // Prepare CodeBuffer: Page aligned and big enough to hold all cached data
  auto Lock = std::unique_lock {CTX.CodeBufferWriteMutex};
  if (Thread) {
    // If the backend reports a previous code buffer, move this thread's
    // lookup cache over to the latest buffer before loading into it.
    if (auto Prev = Thread->CPUBackend->CheckCodeBufferUpdate()) {
      Allocator::VirtualDontNeed(Thread->CallRetStackBase, FEXCore::Core::InternalThreadState::CALLRET_STACK_SIZE);
      auto lk = Thread->LookupCache->AcquireWriteLock();
      Thread->LookupCache->ChangeGuestToHostMapping(*Prev, *CTX.GetLatest()->LookupCache, lk);
    }
  }
  auto CodeBuffer = CTX.GetLatest();
  LOGMAN_THROW_A_FMT(reinterpret_cast(CodeBuffer->Ptr) % 0x1000 == 0, "Expected CodeBuffer base to be page-aligned");
  // Round the write offset up to the next 0x1000 boundary.
  const auto Delta = AlignUp(CTX.LatestOffset, 0x1000) - CTX.LatestOffset;
  CTX.LatestOffset += Delta;
  while (CTX.LatestOffset + header.CodeBufferSize > CodeBuffer->UsableSize()) {
    if (Thread) {
      CTX.ClearCodeCache(Thread);
      CodeBuffer = CTX.GetLatest();
      LogMan::Msg::IFmt("Increased code buffer size to {} MiB for cache load", CodeBuffer->AllocatedSize / 1024 / 1024);
    } else {
      ERROR_AND_DIE_FMT("Cannot extend codebuffer without thread!");
    }
  }

  // Read CodeBuffer data from file. Make sure the destination is page-aligned.
  // TODO: Only load the data needed for the selected section
  auto CodeBufferRange =
    std::as_writable_bytes(std::span {CodeBuffer->Ptr, CodeBuffer->UsableSize()}).subspan(CTX.LatestOffset, header.CodeBufferSize);
  ::memcpy(CodeBufferRange.data(), MappedCacheFile, header.CodeBufferSize);
  MappedCacheFile += header.CodeBufferSize;
  CTX.LatestOffset += header.CodeBufferSize;

  // Apply FEX relocations
  auto Ret = ApplyCodeRelocations(BinarySection.FileStartVA, CodeBufferRange, Relocations, false);
  LOGMAN_THROW_A_FMT(Ret == true, "Failed to apply code cache relocations");

  {
    auto& LookupCache = *CodeBuffer->LookupCache;
    auto WriteLock = LookupCache.AcquireWriteLock();

    // Register blocks to LookupCache
    // Stored offsets are rebased: guest values by FileStartVA, host code by
    // the start of the range the cached code was copied into.
    for (auto& [Guest, Host] : BlockList) {
      for (auto& CodePage : Host.CodePages) {
        CodePage += BinarySection.FileStartVA;
      }

      auto HostCode = reinterpret_cast(Host.HostCode + reinterpret_cast(CodeBufferRange.data()));
      LookupCache.AddBlockMapping(Guest + BinarySection.FileStartVA, std::move(Host.CodePages), HostCode, WriteLock);
    }

    // Register loaded code ranges
    // Entrypoints is reused across iterations; resize() sets it to the
    // current page's entry count before it is overwritten.
    fextl::vector Entrypoints;
    for (uint32_t i = 0; i < header.NumCodePages; ++i) {
      uint64_t CodePage;
      memcpy(&CodePage, MappedCacheFile, sizeof(CodePage));
      CodePage += BinarySection.FileStartVA;
      MappedCacheFile += sizeof(CodePage);
      uint64_t NumEntrypoints;
      memcpy(&NumEntrypoints, MappedCacheFile, sizeof(NumEntrypoints));
      MappedCacheFile += sizeof(NumEntrypoints);
      Entrypoints.resize(NumEntrypoints);
      memcpy(Entrypoints.data(), MappedCacheFile, NumEntrypoints * sizeof(Entrypoints[0]));
      MappedCacheFile += NumEntrypoints * sizeof(Entrypoints[0]);
      for (auto& Entrypoint : Entrypoints) {
        Entrypoint += BinarySection.FileStartVA;
      }
      if (LookupCache.AddBlockExecutableRange(Entrypoints, CodePage, FEXCore::Utils::FEX_PAGE_SIZE, WriteLock)) {
        CTX.SyscallHandler->MarkGuestExecutableRange(Thread, CodePage, FEXCore::Utils::FEX_PAGE_SIZE);
      }
    }
  }

  if (EnableCodeCacheValidation) {
    // Collect absolute guest addresses and file-relative host offsets of the
    // loaded blocks for Validate().
    fextl::set GuestBlocks, HostBlocks;
    for (auto& [Guest, Host] : BlockList)
{ GuestBlocks.insert(Guest + BinarySection.FileStartVA); HostBlocks.insert(Host.HostCode); } Validate(BinarySection, std::move(GuestBlocks), HostBlocks, CodeBufferRange); } return true; } void CodeCache::Validate(const ExecutableFileSectionInfo& Section, fextl::set GuestBlocks, const fextl::set& HostBlocks, std::span CachedCode) { LOGMAN_THROW_A_FMT(!HostBlocks.empty(), "Tried to validate without any host blocks"); // Skip any cached data before the first host block CachedCode = CachedCode.subspan(*HostBlocks.begin() - sizeof(CPU::CPUBackend::JITCodeHeader)); if (!ValidationCTX) { ValidationCTX.reset(static_cast(FEXCore::Context::Context::CreateNewContext(CTX.HostFeatures).release())); ValidationCTX->SetSignalDelegator(CTX.SignalDelegation); ValidationCTX->SetSyscallHandler(CTX.SyscallHandler); ValidationCTX->SetThunkHandler(CTX.ThunkHandler); if (!ValidationCTX->InitCore()) { ERROR_AND_DIE_FMT("Failed to create cache load validation context"); } ValidationThread.reset(ValidationCTX->CreateThread(0, 0, nullptr)); auto Frame = ValidationThread->CurrentFrame; Frame->State.segment_arrays[FEXCore::Core::CPUState::SEGMENT_ARRAY_INDEX_GDT] = &ValidationGDT[0]; Frame->State.segment_arrays[FEXCore::Core::CPUState::SEGMENT_ARRAY_INDEX_LDT] = &ValidationGDT[0]; Frame->State.cs_idx = 0; Frame->State.cs_cached = 0; if (ValidationCTX->Config.Is64BitMode()) { ValidationGDT[0].L = 1; // L = Long Mode = 64-bit ValidationGDT[0].D = 0; // D = Default Operand Size = Reserved } else { ValidationGDT[0].L = 0; // L = Long Mode = 32-bit ValidationGDT[0].D = 1; // D = Default Operand Size = 32-bit } } auto NewCodeBuffer = ValidationCTX->GetLatest(); while (CachedCode.size_bytes() > NewCodeBuffer->UsableSize()) { ValidationCTX->ClearCodeCache(ValidationThread.get()); NewCodeBuffer = ValidationCTX->GetLatest(); LogMan::Msg::IFmt("Increased cache validation code buffer size to {} MiB", NewCodeBuffer->AllocatedSize / 1024 / 1024); } std::span CodeBufferRangeRef = 
std::as_writable_bytes(std::span {NewCodeBuffer->Ptr, NewCodeBuffer->Ptr + NewCodeBuffer->UsableSize()}).subspan(0, CachedCode.size_bytes()); while (!GuestBlocks.empty()) { auto [CompiledBlocks, _, _2, _3, _4] = ValidationCTX->CompileCode(ValidationThread.get(), *GuestBlocks.begin(), 0 /* TODO: Set MaxInst? */); for (auto& Entry : CompiledBlocks.EntryPoints) { GuestBlocks.erase(Entry.first); } } // Patch FEX-internal function addresses with values from the main Context to ensure the code blocks are comparable auto NewRelocations = ValidationThread->CPUBackend->TakeRelocations(Section.FileStartVA); NewRelocations.erase(std::remove_if(NewRelocations.begin(), NewRelocations.end(), [](const CPU::Relocation& Reloc) { return Reloc.Header.Type != CPU::RelocationTypes::RELOC_NAMED_SYMBOL_LITERAL && Reloc.Header.Type != CPU::RelocationTypes::RELOC_NAMED_THUNK_MOVE; })); (void)ApplyCodeRelocations(Section.FileStartVA, CodeBufferRangeRef, NewRelocations, false); if (ValidationCTX->LatestOffset <= CodeBufferRangeRef.size()) { // Reference compilation produced fewer bytes than our cache, so validation is going to fail. // Make sure we don't output any garbage bytes though. CodeBufferRangeRef = CodeBufferRangeRef.subspan(0, ValidationCTX->LatestOffset); } auto [Mismatch, _] = std::mismatch(CodeBufferRangeRef.begin(), CodeBufferRangeRef.end(), CachedCode.begin()); if (Mismatch != CodeBufferRangeRef.end()) { // Align down to instruction size auto Idx = AlignDown(std::distance(CodeBufferRangeRef.begin(), Mismatch), 4); auto BlockIt = std::prev(HostBlocks.lower_bound(*HostBlocks.begin() + Idx + 1)); std::optional GuestBlockAddr; std::optional GuestBlockAddrRef; if (BlockIt != HostBlocks.end()) { for (int i : {0, 1}) { std::span Buffer = (i == 0 ? 
CachedCode : CodeBufferRangeRef); // Second instruction is always a constant load for relative offset to the (multi)block start int32_t addr = (*reinterpret_cast(&Buffer[*BlockIt - *HostBlocks.begin() + 4]) & 0x3ff'ffe0) << 11; addr >>= 14; auto header = reinterpret_cast(&Buffer[*BlockIt - *HostBlocks.begin() + 4 + addr]); auto tail = reinterpret_cast(reinterpret_cast(header) + header->OffsetToBlockTail); (i == 0 ? GuestBlockAddr : GuestBlockAddrRef) = tail->RIP - Section.FileStartVA; LogMan::Msg::EFmt("Recorded rip {}: {:#x} (offset {:#x})", i, tail->RIP, tail->RIP - Section.FileStartVA); if (i == 1) { if (tail->RIP >= Section.BeginVA && tail->RIP < Section.EndVA) { auto [IRView, TotalInstructions, TotalInstructionsLength, StartAddr, Length, _] = ValidationCTX->GenerateIR(ValidationThread.get(), tail->RIP, false, FEXCore::Config::Get_MAXINST()); fextl::stringstream ss; FEXCore::IR::Dump(&ss, &*IRView); LogMan::Msg::EFmt("IR:\n{}", ss.str()); } else { LogMan::Msg::EFmt("Can't dump IR for out-of-range RIP {:#x}", tail->RIP); } } } } fextl::string GuestBlockInfo = "UNKNOWN"; if (GuestBlockAddr) { GuestBlockInfo = fextl::fmt::format("{:#x}", GuestBlockAddr.value()); } if (GuestBlockAddr != GuestBlockAddrRef) { GuestBlockInfo += " (MISMATCH)"; } ERROR_AND_DIE_FMT("Cache validation failed at offset {:#x}: {:02x} <-> {:02x} (at {} <-> {}, guest block {})", Idx, fmt::join(CachedCode.subspan(Idx, 4), ""), fmt::join(CodeBufferRangeRef.subspan(Idx, 4), ""), fmt::ptr(CachedCode.data()), fmt::ptr(CodeBufferRangeRef.data()), GuestBlockInfo); } // Reset Context state for next validation ValidationThread->LookupCache->ClearCache(ValidationThread->LookupCache->AcquireWriteLock()); ValidationCTX->LatestOffset = 0; LogMan::Msg::IFmt("\tSuccessfully validated cache"); } bool CodeCache::ApplyCodeRelocations(uint64_t GuestEntry, std::span Code, std::span EntryRelocations, bool ForStorage) { CPU::Arm64Emitter Emitter(&CTX, Code.data(), Code.size_bytes()); for (size_t j = 0; j < 
EntryRelocations.size(); ++j) {
    const FEXCore::CPU::Relocation& Reloc = EntryRelocations[j];
    // Every relocation is re-emitted at the offset recorded in its header.
    Emitter.SetCursorOffset(Reloc.Header.Offset);

    switch (Reloc.Header.Type) {
    case FEXCore::CPU::RelocationTypes::RELOC_NAMED_SYMBOL_LITERAL: {
      // Generate a literal so we can place it
      // When serializing (ForStorage) the host pointer is written as zero.
      uint64_t Pointer = ForStorage ? 0 : GetNamedSymbolLiteral(CTX, Reloc.NamedSymbolLiteral.Symbol);
      Emitter.dc64(Pointer);
      break;
    }
    case FEXCore::CPU::RelocationTypes::RELOC_NAMED_THUNK_MOVE: {
      uint64_t Pointer = ForStorage ? 0 : reinterpret_cast(CTX.ThunkHandler->LookupThunk(Reloc.NamedThunkMove.Symbol));
      // An all-ones lookup result is treated as failure of the whole call.
      if (Pointer == ~0ULL) {
        return false;
      }

      // TODO: Pointers are required to fit within 48-bit VA space.
      // But forcing 6-byte broke relocations.
      Emitter.LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Register(Reloc.NamedThunkMove.RegisterIndex), Pointer,
                           CPU::Arm64Emitter::PadType::DOPAD);
      break;
    }
    case FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_LITERAL: {
      // Guest RIP relocations store an offset that is rebased onto GuestEntry.
      Emitter.dc64(GuestEntry + Reloc.GuestRIP.GuestRIP);
      break;
    }
    case FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_MOVE: {
      uint64_t Pointer = Reloc.GuestRIP.GuestRIP + GuestEntry;
      // TODO: Pointers are required to fit within 48-bit VA space.
      // But forcing 6-byte broke relocations.
Emitter.LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Register(Reloc.GuestRIP.RegisterIndex), Pointer, CPU::Arm64Emitter::PadType::DOPAD); break; } default: ERROR_AND_DIE_FMT("Unknown relocation type {}", ToUnderlying(Reloc.Header.Type)); } } return true; } } // namespace FEXCore::Context ================================================ FILE: FEXCore/Source/Interface/Core/Core.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ category: glue ~ Logic that binds various parts together meta: glue|driver ~ Emulation mainloop related glue logic tags: glue|driver desc: Glues Frontend, OpDispatcher and IR Opts & Compilation, LookupCache, Dispatcher and provides the Execution loop entrypoint $end_info$ */ #include #ifdef ZYDIS_DISASSEMBLER #include #endif #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/LookupCache.h" #include "Interface/Core/CPUBackend.h" #include "Interface/Core/CPUID.h" #include "Interface/Core/Frontend.h" #include "Interface/Core/OpcodeDispatcher.h" #include "Interface/Core/JIT/JITClass.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Core/X86Tables/X86Tables.h" #include #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/Passes/RegisterAllocationPass.h" #include "Interface/IR/Passes.h" #include "Interface/IR/PassManager.h" #include "Interface/IR/RegisterAllocationData.h" #include "Utils/Allocator.h" #include "Utils/Allocator/HostAllocator.h" #include "Utils/SpinWaitLock.h" #include "Utils/variable_length_integer.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "FEXCore/Utils/SignalScopeGuards.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include 
#include

namespace FEXCore::Context {

// Builds the context: records host features and applies configuration
// adjustments that depend on the guest bitness and the host cycle counter.
ContextImpl::ContextImpl(const FEXCore::HostFeatures& Features)
  : HostFeatures {Features}
  , CPUID {this}
  , CodeCache {*this} {
  if (!Config.Is64BitMode()) {
    // When operating in 32-bit mode, the virtual memory we care about is only the lower 32-bits.
    Config.VirtualMemSize = 1ULL << 32;
  }

  if (Config.BlockJITNaming() || Config.GlobalJITNaming() || Config.LibraryJITNaming()) {
    // Only initialize symbols file if enabled. Ensures we don't pollute /tmp with empty files.
    Symbols.InitFile();
  }

  uint64_t FrequencyCounter = FEXCore::GetCycleCounterFrequency();
  if (FrequencyCounter && FrequencyCounter < FEXCore::Context::TSC_SCALE_MAXIMUM && Config.SmallTSCScale()) {
    // Scale TSC until it is at the minimum required.
    // Each doubling of the frequency bumps Config.TSCScale by one.
    while (FrequencyCounter < FEXCore::Context::TSC_SCALE_MAXIMUM) {
      FrequencyCounter <<= 1;
      ++Config.TSCScale;
    }
  }

  // Track atomic TSO emulation configuration.
  UpdateAtomicTSOEmulationConfig();
}

// Header/tail pair of the JIT block a frame is currently associated with.
// Both pointers are null when the frame has no inline block header.
struct GetFrameBlockInfoResult {
  const CPU::CPUBackend::JITCodeHeader* InlineHeader;
  const CPU::CPUBackend::JITCodeTail* InlineTail;
};

// Reads Frame->State.InlineJITBlockHeader and, if it is non-zero, locates the
// block tail at header + OffsetToBlockTail.
static GetFrameBlockInfoResult GetFrameBlockInfo(FEXCore::Core::CpuStateFrame* Frame) {
  const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader;
  auto InlineHeader = reinterpret_cast(BlockBegin);

  if (InlineHeader) {
    auto InlineTail = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail);
    return {InlineHeader, InlineTail};
  }

  return {InlineHeader, nullptr};
}

// Returns true when the guest range [Address, Address + Size) overlaps the
// guest range [RIP, RIP + GuestSize) of the thread's current block. Returns
// false when there is no current block.
bool ContextImpl::IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) {
  auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);
  return InlineTail && (Address + Size > InlineTail->RIP && Address < InlineTail->RIP + InlineTail->GuestSize);
}

// Returns the SingleInst flag of the current block, or false without a block.
bool ContextImpl::IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) {
  auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);
  return InlineTail && InlineTail->SingleInst;
}

// Returns the guest RIP recorded in the current block's tail, or 0 when the
// thread has no current block.
uint64_t ContextImpl::GetGuestBlockEntry(FEXCore::Core::InternalThreadState* Thread) {
  auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);
  return InlineTail ? InlineTail->RIP : 0;
}

// Maps a host program counter back to a guest RIP. When HostPC lies inside
// the thread's current JIT block, the block's RIP entries (pairs of host-PC
// and guest-RIP deltas) are walked and accumulated until the next entry would
// pass HostPC. Otherwise the RIP stored in the CPU state is returned.
uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) {
  const auto Frame = Thread->CurrentFrame;
  const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader;
  auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);

  if (InlineHeader) {
    // Check if the host PC is currently within a code block.
    // If it is then RIP can be reconstructed from the beginning of the code block.
    // This is currently as close as FEX can get RIP reconstructions.
    if (HostPC >= reinterpret_cast(BlockBegin) && HostPC < reinterpret_cast(BlockBegin + InlineTail->Size)) {

      auto RIPEntry = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

      // Reconstruct RIP from JIT entries for this block.
      uint64_t StartingHostPC = BlockBegin;
      uint64_t StartingGuestRIP = InlineTail->RIP;

      for (uint32_t i = 0; i < InlineTail->NumberOfRIPEntries; ++i) {
        // Entries are variable-length encoded; Size is the encoded length.
        auto Offset = FEXCore::Utils::vl64pair::Decode(RIPEntry);
        RIPEntry += Offset.Size;

        if (HostPC >= (StartingHostPC + Offset.IntegerARMPC)) {
          // We are beyond this entry, keep going forward.
          StartingHostPC += Offset.IntegerARMPC;
          StartingGuestRIP += Offset.IntegerX86RIP;
        } else {
          // Passed where the Host PC is at. Break now.
          break;
        }
      }
      return StartingGuestRIP;
    }
  }

  // Fallback to what is stored in the RIP currently.
  return Frame->State.rip;
}

// Rebuilds the architectural 32-bit EFLAGS value from FEX's internal flag
// representation. WasInJIT selects where NZCV, PF and AF come from: PSTATE
// and HostGPRs when true, the CPU state frame otherwise. Note that in the
// WasInJIT case pf_raw/af_raw in the frame are updated as a side effect.
uint32_t ContextImpl::ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) {
  const auto Frame = Thread->CurrentFrame;
  uint32_t EFLAGS {};

  // Currently these flags just map 1:1 inside of the resulting value.
for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_EFLAG_BITS; ++i) {
    switch (i) {
    case X86State::RFLAG_CF_RAW_LOC:
    case X86State::RFLAG_PF_RAW_LOC:
    case X86State::RFLAG_AF_RAW_LOC:
    case X86State::RFLAG_TF_RAW_LOC:
    case X86State::RFLAG_ZF_RAW_LOC:
    case X86State::RFLAG_SF_RAW_LOC:
    case X86State::RFLAG_OF_RAW_LOC:
    case X86State::RFLAG_DF_RAW_LOC:
      // Intentionally do nothing.
      // These contain multiple bits which can corrupt other members when compacted.
      // They are handled individually after this loop.
      break;
    default: EFLAGS |= uint32_t {Frame->State.flags[i]} << i; break;
    }
  }

  uint32_t Packed_NZCV {};
  if (WasInJIT) {
    // If we were in the JIT then NZCV is in the CPU's PSTATE object.
    // Packed in to the same bit locations as RFLAG_NZCV_LOC.
    Packed_NZCV = PSTATE;

    // If we were in the JIT then PF and AF are in registers.
    // Move them to the CPUState frame now.
    Frame->State.pf_raw = HostGPRs[CPU::REG_PF.Idx()];
    Frame->State.af_raw = HostGPRs[CPU::REG_AF.Idx()];
  } else {
    // If we were not in the JIT then the NZCV state is stored in the CPUState RFLAG_NZCV_LOC.
    // SF/ZF/CF/OF are packed in a 32-bit value in RFLAG_NZCV_LOC.
    memcpy(&Packed_NZCV, &Frame->State.flags[X86State::RFLAG_NZCV_LOC], sizeof(Packed_NZCV));
  }

  uint32_t OF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_OF_RAW_LOC)) & 1;
  uint32_t CF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_CF_RAW_LOC)) & 1;
  uint32_t ZF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_ZF_RAW_LOC)) & 1;
  uint32_t SF = (Packed_NZCV >> IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_SF_RAW_LOC)) & 1;

  // CF is inverted in our representation, undo the invert here.
  CF ^= 1;

  // Pack in to EFLAGS
  EFLAGS |= OF << X86State::RFLAG_OF_RAW_LOC;
  EFLAGS |= CF << X86State::RFLAG_CF_RAW_LOC;
  EFLAGS |= ZF << X86State::RFLAG_ZF_RAW_LOC;
  EFLAGS |= SF << X86State::RFLAG_SF_RAW_LOC;

  // PF calculation is deferred, calculate it now.
  // Popcount the 8-bit flag and then extract the lower bit.
  // The XOR with 1 flips the resulting parity bit, matching the inverted
  // storage used by SetFlagsFromCompactedEFLAGS (pf_raw == 0 means PF set).
  uint32_t PFByte = Frame->State.pf_raw & 0xff;
  uint32_t PF = std::popcount(PFByte ^ 1) & 1;
  EFLAGS |= PF << X86State::RFLAG_PF_RAW_LOC;

  // AF calculation is deferred, calculate it now.
  // XOR with PF byte and extract bit 4.
  uint32_t AF = ((Frame->State.af_raw ^ PFByte) & (1 << 4)) ? 1 : 0;
  EFLAGS |= AF << X86State::RFLAG_AF_RAW_LOC;

  // Only the lowest bit of the stored TF byte is architectural.
  uint8_t TFByte = Frame->State.flags[X86State::RFLAG_TF_RAW_LOC];
  EFLAGS |= (TFByte & 1) << X86State::RFLAG_TF_RAW_LOC;

  // DF is pretransformed, undo the transform from 1/-1 back to 0/1
  // A set sign bit in the stored byte means -1, i.e. DF = 1.
  uint8_t DFByte = Frame->State.flags[X86State::RFLAG_DF_RAW_LOC];
  if (DFByte & 0x80) {
    EFLAGS |= 1 << X86State::RFLAG_DF_RAW_LOC;
  }

  return EFLAGS;
}

// Copies the guest vector registers out of the thread's CPU state.
// XMM_Low receives the low 128 bits of each register. YMM_High receives the
// upper 128 bits, but only when it is non-null and the host supports AVX;
// otherwise only the SSE state is copied into XMM_Low.
// The register count is NUM_XMMS in 64-bit mode and 8 otherwise.
void ContextImpl::ReconstructXMMRegisters(const FEXCore::Core::InternalThreadState* Thread, __uint128_t* XMM_Low, __uint128_t* YMM_High) {
  const size_t MaximumRegisters = Config.Is64BitMode ? FEXCore::Core::CPUState::NUM_XMMS : 8;

  if (YMM_High != nullptr && HostFeatures.SupportsAVX) {
    const bool SupportsConvergedRegisters = HostFeatures.SupportsSVE256;

    if (SupportsConvergedRegisters) {
      ///< Output wants to de-interleave
      // Low half is read from element 0, high half from element 2 of each
      // avx.data row.
      for (size_t i = 0; i < MaximumRegisters; ++i) {
        memcpy(&XMM_Low[i], &Thread->CurrentFrame->State.xmm.avx.data[i][0], sizeof(__uint128_t));
        memcpy(&YMM_High[i], &Thread->CurrentFrame->State.xmm.avx.data[i][2], sizeof(__uint128_t));
      }
    } else {
      ///< Matches what FEX wants with non-converged registers
      for (size_t i = 0; i < MaximumRegisters; ++i) {
        memcpy(&XMM_Low[i], &Thread->CurrentFrame->State.xmm.sse.data[i][0], sizeof(__uint128_t));
        memcpy(&YMM_High[i], &Thread->CurrentFrame->State.avx_high[i][0], sizeof(__uint128_t));
      }
    }
  } else {
    // Only support SSE, no AVX here, even if requested.
    memcpy(XMM_Low, Thread->CurrentFrame->State.xmm.sse.data, MaximumRegisters * sizeof(__uint128_t));
  }
}

// Inverse of ReconstructXMMRegisters: writes the supplied register halves
// into the thread's CPU state using the same layout selection.
void ContextImpl::SetXMMRegistersFromState(FEXCore::Core::InternalThreadState* Thread, const __uint128_t* XMM_Low, const __uint128_t* YMM_High) {
  const size_t MaximumRegisters = Config.Is64BitMode ?
FEXCore::Core::CPUState::NUM_XMMS : 8;
  if (YMM_High != nullptr && HostFeatures.SupportsAVX) {
    const bool SupportsConvergedRegisters = HostFeatures.SupportsSVE256;

    if (SupportsConvergedRegisters) {
      ///< Output wants to de-interleave
      for (size_t i = 0; i < MaximumRegisters; ++i) {
        memcpy(&Thread->CurrentFrame->State.xmm.avx.data[i][0], &XMM_Low[i], sizeof(__uint128_t));
        memcpy(&Thread->CurrentFrame->State.xmm.avx.data[i][2], &YMM_High[i], sizeof(__uint128_t));
      }
    } else {
      ///< Matches what FEX wants with non-converged registers
      for (size_t i = 0; i < MaximumRegisters; ++i) {
        memcpy(&Thread->CurrentFrame->State.xmm.sse.data[i][0], &XMM_Low[i], sizeof(__uint128_t));
        memcpy(&Thread->CurrentFrame->State.avx_high[i][0], &YMM_High[i], sizeof(__uint128_t));
      }
    }
  } else {
    // Only support SSE, no AVX here, even if requested.
    memcpy(Thread->CurrentFrame->State.xmm.sse.data, XMM_Low, MaximumRegisters * sizeof(__uint128_t));
  }
}

// Writes an architectural EFLAGS value into FEX's internal flag
// representation (the inverse of ReconstructCompactedEFLAGS). OF/CF/ZF/SF are
// stored as a packed NZCV word, AF/PF/DF use transformed encodings, and the
// reserved and interrupt flags are always forced to 1.
void ContextImpl::SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) {
  const auto Frame = Thread->CurrentFrame;
  for (size_t i = 0; i < FEXCore::Core::CPUState::NUM_EFLAG_BITS; ++i) {
    switch (i) {
    case X86State::RFLAG_OF_RAW_LOC:
    case X86State::RFLAG_CF_RAW_LOC:
    case X86State::RFLAG_ZF_RAW_LOC:
    case X86State::RFLAG_SF_RAW_LOC:
      // Intentionally do nothing.
      // These four are written below as part of the packed NZCV value.
      break;
    case X86State::RFLAG_AF_RAW_LOC:
      // AF stored in bit 4 in our internal representation. It is also
      // XORed with bit 4 of the PF byte, but we write that as zero here so
      // we don't need any special handling for that.
      Frame->State.af_raw = (EFLAGS & (1U << i)) ? (1 << 4) : 0;
      break;
    case X86State::RFLAG_PF_RAW_LOC:
      // PF is inverted in our internal representation.
      Frame->State.pf_raw = (EFLAGS & (1U << i)) ? 0 : 1;
      break;
    case X86State::RFLAG_DF_RAW_LOC:
      // DF is encoded as 1/-1
      Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 0xff : 1;
      break;
    default: Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 1 : 0; break;
    }
  }

  // Calculate packed NZCV. Note CF is inverted.
  uint32_t Packed_NZCV {};
  Packed_NZCV |= (EFLAGS & (1U << X86State::RFLAG_OF_RAW_LOC)) ? 1U << IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_OF_RAW_LOC) : 0;
  Packed_NZCV |= (EFLAGS & (1U << X86State::RFLAG_CF_RAW_LOC)) ? 0 : 1U << IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_CF_RAW_LOC);
  Packed_NZCV |= (EFLAGS & (1U << X86State::RFLAG_ZF_RAW_LOC)) ? 1U << IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_ZF_RAW_LOC) : 0;
  Packed_NZCV |= (EFLAGS & (1U << X86State::RFLAG_SF_RAW_LOC)) ? 1U << IR::OpDispatchBuilder::IndexNZCV(X86State::RFLAG_SF_RAW_LOC) : 0;
  memcpy(&Frame->State.flags[X86State::RFLAG_NZCV_LOC], &Packed_NZCV, sizeof(Packed_NZCV));

  // Reserved, Read-As-1, Write-as-1
  Frame->State.flags[X86State::RFLAG_RESERVED_LOC] = 1;
  // Interrupt Flag. Can't be written by CPL-3 userland.
  Frame->State.flags[X86State::RFLAG_IF_LOC] = 1;
}

// Creates the dispatcher, hands its configuration to the signal delegator and
// enables the pending-interrupt fault check where required.
// Always returns true in the visible implementation.
bool ContextImpl::InitCore() {
  // Initialize the CPU core signal handlers & DispatcherConfig
  Dispatcher = FEXCore::CPU::Dispatcher::Create(this);

  // Set up the SignalDelegator config since core is initialized.
  SignalDelegation->SetConfig(Dispatcher->MakeSignalDelegatorConfig());

#if defined(_WIN32) && !defined(ARCHITECTURE_arm64ec)
  // WOW64 always needs the interrupt fault check to be enabled.
  Config.NeedsPendingInterruptFaultCheck = true;
#endif

  if (Config.GdbServer) {
    // If gdbserver is enabled then this needs to be enabled.
    Config.NeedsPendingInterruptFaultCheck = true;
  }

  return true;
}

// Runs a JIT callback at guest address RIP on the thread's current frame,
// using the dispatcher of the context the thread belongs to.
void ContextImpl::HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) {
  static_cast(Thread->CTX)->Dispatcher->ExecuteJITCallback(Thread->CurrentFrame, RIP);
}

// Enters the dispatcher loop for Thread on its current frame.
void ContextImpl::ExecuteThread(FEXCore::Core::InternalThreadState* Thread) {
  // Update the thread pointer for Thunk return to the latest.
Thread->CurrentFrame->Pointers.ThunkCallbackRet = SignalDelegation->GetThunkCallbackRET();

  Dispatcher->ExecuteDispatch(Thread->CurrentFrame);

  // If it is the parent thread that died then just leave
  // TODO: This doesn't make sense when the parent thread doesn't outlive its children
}

// Creates the per-thread compilation pipeline: op dispatcher, lookup cache,
// frontend decoder, pass manager and the Arm64 JIT backend. Also publishes
// the lookup cache pointers into the thread's frame.
void ContextImpl::InitializeCompiler(FEXCore::Core::InternalThreadState* Thread) {
  Thread->OpDispatcher = fextl::make_unique(this);
  Thread->OpDispatcher->SetMultiblock(Config.Multiblock);
  Thread->LookupCache = fextl::make_unique(this);
  Thread->FrontendDecoder = fextl::make_unique(Thread);
  Thread->PassManager = fextl::make_unique();

  Thread->CurrentFrame->State.L1Pointer = Thread->LookupCache->GetL1Pointer();
  Thread->CurrentFrame->State.L1Mask = Thread->LookupCache->GetScaledL1PointerMask();
  Thread->CurrentFrame->Pointers.L2Pointer = Thread->LookupCache->GetPagePointer();

  Dispatcher->InitThreadPointers(Thread);

  Thread->PassManager->AddDefaultPasses(this);
  Thread->PassManager->AddDefaultValidationPasses();

  Thread->PassManager->RegisterSyscallHandler(SyscallHandler);

  // Create CPU backend
  Thread->PassManager->InsertRegisterAllocationPass(this);
  Thread->CPUBackend = FEXCore::CPU::CreateArm64JITCore(this, Thread);

  Thread->PassManager->Finalize();
}

// Allocates and initializes a new thread state owned by this context.
// RSP and RIP are seeded from the arguments first; if NewThreadState is
// non-null the whole CPU state is then overwritten from it. The returned
// pointer is released with DestroyThread.
FEXCore::Core::InternalThreadState*
ContextImpl::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState) {
  FEXCore::Core::InternalThreadState* Thread = new FEXCore::Core::InternalThreadState {
    .CTX = this,
  };

  FEXCore::Allocator::VirtualName("FEXMem_ThreadState", Thread, sizeof(*Thread));

  Thread->CurrentFrame->State.gregs[X86State::REG_RSP] = StackPointer;
  Thread->CurrentFrame->State.rip = InitialRIP;

  // Copy over the new thread state to the new object
  if (NewThreadState) {
    memcpy(&Thread->CurrentFrame->State, NewThreadState, sizeof(FEXCore::Core::CPUState));
  }

  // Set up the thread manager state
  Thread->CurrentFrame->Thread = Thread;

  InitializeCompiler(Thread);

  Thread->CurrentFrame->State.DeferredSignalRefCount.Store(0);

  if (Config.BlockJITNaming() || Config.GlobalJITNaming() || Config.LibraryJITNaming()) {
    // Allocate a JIT symbol buffer only if enabled.
    Thread->SymbolBuffer = JITSymbols::AllocateBuffer();
  }

  return Thread;
}

// Restores read/write protection on the thread's interrupt fault page and
// then deletes the thread state.
void ContextImpl::DestroyThread(FEXCore::Core::InternalThreadState* Thread) {
  FEXCore::Allocator::VirtualProtect(&Thread->InterruptFaultPage, sizeof(Thread->InterruptFaultPage),
                                     Allocator::ProtectOptions::Read | Allocator::ProtectOptions::Write);
  delete Thread;
}

#ifndef _WIN32
// Counterpart of LockBeforeFork, run after fork() returns. In the child the
// code map writer is reset and the locks are forcibly dropped or reset; in
// the parent the locks taken by LockBeforeFork are released normally.
void ContextImpl::UnlockAfterFork(FEXCore::Core::InternalThreadState* LiveThread, bool Child) {
  Allocator::UnlockAfterFork(LiveThread, Child);

  Profiler::PostForkAction(Child);

  if (Child) {
    if (CodeMapWriter) {
      CodeMapWriter->ResetAfterFork();
    }

    CodeInvalidationMutex.StealAndDropActiveLocks();
    if (Config.StrictInProcessSplitLocks) {
      StrictSplitLockMutex = 0;
    }
  } else {
    CodeInvalidationMutex.unlock();
    if (Config.StrictInProcessSplitLocks) {
      FEXCore::Utils::SpinWaitLock::unlock(&StrictSplitLockMutex);
    }
    return;
  }
}

// Takes the code invalidation mutex, the allocator fork lock and (when
// strict split locks are enabled) the split-lock mutex ahead of fork().
void ContextImpl::LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) {
  CodeInvalidationMutex.lock();
  Allocator::LockBeforeFork(Thread);
  if (Config.StrictInProcessSplitLocks) {
    FEXCore::Utils::SpinWaitLock::lock(&StrictSplitLockMutex);
  }
}
#endif

// Records a newly allocated code buffer: optionally registers its memory
// range with the JIT symbol writer and appends it to CodeBufferList under
// CodeBufferListLock.
void ContextImpl::OnCodeBufferAllocated(const fextl::shared_ptr& Buffer) {
  if (Config.GlobalJITNaming()) {
    Symbols.RegisterJITSpace(Buffer->Ptr, Buffer->AllocatedSize);
  }

  {
    std::scoped_lock lk {CodeBufferListLock};
    CodeBufferList.emplace_back(Buffer);
  }
}

// Clears cached translations for Thread. With NewCodeBuffer set the backend's
// cache is cleared (which allocates a new code buffer per the comment below);
// otherwise only the lookup cache is cleared. In both cases the thread's
// call/return stack memory is released with VirtualDontNeed.
void ContextImpl::ClearCodeCache(FEXCore::Core::InternalThreadState* Thread, bool NewCodeBuffer) {
  FEXCORE_PROFILE_INSTANT("ClearCodeCache");

  if (NewCodeBuffer) {
    // Allocate new CodeBuffer + L3 LookupCache and clear L1+L2 caches
    Thread->CPUBackend->ClearCache();
  } else {
    // Clear L1+L2 cache of this thread, and clear L3 cache across any threads using it
    auto lk = Thread->LookupCache->AcquireWriteLock();
    Thread->LookupCache->ClearCache(lk);
  }

  Allocator::VirtualDontNeed(Thread->CallRetStackBase, FEXCore::Core::InternalThreadState::CALLRET_STACK_SIZE);
}

// Dumps the emitter's current IR to stderr, tagged with whether the IR is
// pre- or post-register-allocation and with the guest RIP.
static void IRDumper(FEXCore::Core::InternalThreadState* Thread, IR::IREmitter* IREmitter, uint64_t GuestRIP) {
  FEXCore::File::File FD = FEXCore::File::File::GetStdERR();
  fextl::stringstream out;
  auto NewIR = IREmitter->ViewIR();
  FEXCore::IR::Dump(&out, &NewIR);
  fextl::fmt::print(FD, "IR-ShouldDump-{} 0x{:x}:\n{}\n@@@@@\n", NewIR.PostRA() ? "post" : "pre", GuestRIP, out.str());
};

// Forwards to the frontend decoder's cacheability check, treating GuestRIP
// both as the code pointer and as the guest address.
bool ContextImpl::CheckIfBlockIsCacheable(FEXCore::Core::InternalThreadState& Thread, uint64_t GuestRIP, uint64_t MaxInst) {
  return Thread.FrontendDecoder->CheckIfCacheable(Thread, reinterpret_cast(GuestRIP), GuestRIP, MaxInst);
}

ContextImpl::GenerateIRResult
ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, bool ExtendedDebugInfo, uint64_t MaxInst) {
  FEXCORE_PROFILE_SCOPED("GenerateIR");

  Thread->OpDispatcher->ResetWorkingList();

  uint64_t TotalInstructions {0};
  uint64_t TotalInstructionsLength {0};

  bool HasCustomIR {};

  if (HasCustomIRHandlers.load(std::memory_order_relaxed)) {
    std::shared_lock lk(CustomIRMutex);
    auto Handler = CustomIRHandlers.find(GuestRIP);
    if (Handler != CustomIRHandlers.end()) {
      TotalInstructions = 1;
      TotalInstructionsLength = 1;
      Handler->second.Handler(GuestRIP, Thread->OpDispatcher.get());
      HasCustomIR = true;
    }
  }

  if (!HasCustomIR) {
    const uint8_t* GuestCode {};
    GuestCode = reinterpret_cast(GuestRIP);

    bool HadDispatchError {false};
    bool HadInvalidInst {false};

    Thread->FrontendDecoder->DecodeInstructionsAtEntry(Thread, GuestCode, GuestRIP, MaxInst);

    auto BlockInfo = Thread->FrontendDecoder->GetDecodedBlockInfo();
    auto CodeBlocks = &BlockInfo->Blocks;

    Thread->OpDispatcher->BeginFunction(GuestRIP, CodeBlocks, BlockInfo->TotalInstructionCount, BlockInfo->Is64BitMode,
                                        AreMonoHacksActive() && MonoBackpatcherBlock.load(std::memory_order_relaxed) == GuestRIP);

    const auto GPRSize =
Thread->OpDispatcher->GetGPROpSize(); #ifdef ZYDIS_DISASSEMBLER const auto ZydisMachineMode = Config.Is64BitMode ? ZYDIS_MACHINE_MODE_LONG_64 : ZYDIS_MACHINE_MODE_LEGACY_32; if (FEXCore::Config::Get_X86DISASSEMBLE()) { const uint64_t DecodedMin = Thread->FrontendDecoder->DecodedMinAddress; const uint64_t DecodedMax = Thread->FrontendDecoder->DecodedMaxAddress; LogMan::Msg::IFmt("Guest x86 Begin (RIP={:#x}, {:#x}-{:#x})", GuestRIP, DecodedMin, DecodedMax); } #endif for (size_t j = 0; j < CodeBlocks->size(); ++j) { const FEXCore::Frontend::Decoder::DecodedBlocks& Block = CodeBlocks->at(j); #ifdef ZYDIS_DISASSEMBLER if (FEXCore::Config::Get_X86DISASSEMBLE() && CodeBlocks->size() > 1) { LogMan::Msg::IFmt(" Block {} Entry={:#x} NumInsts={}", j, Block.Entry, Block.NumInstructions); } #endif bool BlockInForceTSOValidRange = false; auto InstForceTSOIt = ForceTSOInstructions.end(); if (ForceTSOValidRanges.Contains({Block.Entry, Block.Entry + Block.Size})) { if (auto It = ForceTSOInstructions.lower_bound(Block.Entry); *It < Block.Entry + Block.Size) { InstForceTSOIt = It; BlockInForceTSOValidRange = true; } } // Set the block entry point Thread->OpDispatcher->SetNewBlockIfChanged(Block.Entry); uint64_t BlockInstructionsLength {}; // Reset any block-specific state Thread->OpDispatcher->StartNewBlock(); uint64_t InstsInBlock = Block.NumInstructions; if (InstsInBlock == 0) { // Special case for an empty instruction block. 
Thread->OpDispatcher->ExitFunction(Thread->OpDispatcher->_InlineEntrypointOffset(GPRSize, Block.Entry - GuestRIP)); } for (size_t i = 0; i < InstsInBlock; ++i) { uint64_t InstAddress = Block.Entry + BlockInstructionsLength; const FEXCore::X86Tables::X86InstInfo* TableInfo {nullptr}; const FEXCore::X86Tables::DecodedInst* DecodedInfo {nullptr}; TableInfo = Block.DecodedInstructions[i].TableInfo; DecodedInfo = &Block.DecodedInstructions[i]; #ifdef ZYDIS_DISASSEMBLER if (FEXCore::Config::Get_X86DISASSEMBLE()) { const uint8_t* InstBytes = reinterpret_cast(InstAddress); ZydisDisassembledInstruction ZydisInst; if (ZYAN_SUCCESS(ZydisDisassembleIntel(ZydisMachineMode, InstAddress, InstBytes, DecodedInfo->InstSize, &ZydisInst))) { LogMan::Msg::IFmt(" {:#x}: {}", InstAddress, ZydisInst.text); } else { LogMan::Msg::IFmt(" {:#x}: (decode failed, {} bytes)", InstAddress, DecodedInfo->InstSize); } } #endif bool IsLocked = DecodedInfo->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK; // Do a partial register cache flush before every instruction. This // prevents cross-instruction static register caching, while allowing // context load/stores to be optimized within a block. Theoretically, // this flush is not required for correctness, all mandatory flushes are // included in instruction-specific handlers. Instead, this is a blunt // heuristic to make the register cache less aggressive, as the current // RA generates bad code in common cases with tied registers otherwise. // // However, it makes our exception handling behaviour more predictable. // It is potentially correctness bearing in that sense, but that is a // side effect here and (if that behaviour is required) we should handle // that more explicitly later. 
Thread->OpDispatcher->FlushRegisterCache(true); if (ExtendedDebugInfo || Thread->OpDispatcher->CanHaveSideEffects(TableInfo, DecodedInfo)) { Thread->OpDispatcher->_GuestOpcode(InstAddress - GuestRIP); } if (Config.SMCChecks == FEXCore::Config::CONFIG_SMC_FULL || Block.ForceFullSMCDetection) { auto ExistingCodePtr = reinterpret_cast(Block.Entry + BlockInstructionsLength); auto InstAddressReg = Thread->OpDispatcher->_EntrypointOffset(GPRSize, InstAddress - GuestRIP); std::array CodeOriginal; memcpy(CodeOriginal.data(), ExistingCodePtr, DecodedInfo->InstSize); auto CodeChanged = Thread->OpDispatcher->_ValidateCode(CodeOriginal, InstAddressReg, DecodedInfo->InstSize); auto InvalidateCodeCond = Thread->OpDispatcher->CondJump(CodeChanged); auto CurrentBlock = Thread->OpDispatcher->GetCurrentBlock(); auto CodeWasChangedBlock = Thread->OpDispatcher->CreateNewCodeBlockAtEnd(); Thread->OpDispatcher->SetTrueJumpTarget(InvalidateCodeCond, CodeWasChangedBlock); Thread->OpDispatcher->SetCurrentCodeBlock(CodeWasChangedBlock); Thread->OpDispatcher->_ThreadRemoveCodeEntry(); Thread->OpDispatcher->ExitFunction(Thread->OpDispatcher->_InlineEntrypointOffset(GPRSize, InstAddress - GuestRIP)); auto NextOpBlock = Thread->OpDispatcher->CreateNewCodeBlockAfter(CurrentBlock); Thread->OpDispatcher->SetFalseJumpTarget(InvalidateCodeCond, NextOpBlock); Thread->OpDispatcher->SetCurrentCodeBlock(NextOpBlock); } if (TableInfo && TableInfo->OpcodeDispatcher.OpDispatch) { auto Fn = TableInfo->OpcodeDispatcher.OpDispatch; Thread->OpDispatcher->ResetHandledLock(); Thread->OpDispatcher->ResetDecodeFailure(); IR::ForceTSOMode ForceTSO = IR::ForceTSOMode::NoOverride; if (BlockInForceTSOValidRange) { if (InstForceTSOIt != ForceTSOInstructions.end() && *InstForceTSOIt == InstAddress) { ForceTSO = IR::ForceTSOMode::ForceEnabled; } else { ForceTSO = IR::ForceTSOMode::ForceDisabled; } } else if (DecodedInfo->Flags & X86Tables::DecodeFlags::FLAG_FORCE_TSO) { ForceTSO = IR::ForceTSOMode::ForceEnabled; } 
Thread->OpDispatcher->SetForceTSO(ForceTSO); std::invoke(Fn, Thread->OpDispatcher, DecodedInfo); if (Thread->OpDispatcher->HadDecodeFailure()) { HadDispatchError = true; } else { if (Thread->OpDispatcher->HasHandledLock() != IsLocked) { HadDispatchError = true; LogMan::Msg::EFmt("Missing LOCK HANDLER at 0x{:x}{{'{}'}}", InstAddress, TableInfo->Name ?: "UND"); } BlockInstructionsLength += DecodedInfo->InstSize; TotalInstructionsLength += DecodedInfo->InstSize; ++TotalInstructions; // Walk InstForceTSOIt forward past the handled instruction InstForceTSOIt = std::find_if(InstForceTSOIt, ForceTSOInstructions.end(), [&](auto Val) { return Val >= Block.Entry + BlockInstructionsLength; }); } } else { // Invalid instruction if (!BlockInstructionsLength) { // SMC can modify block contents and patch invalid instructions to valid ones inline. // End blocks upon encountering them and only emit an invalid opcode exception if there are no prior instructions in the block (that could have modified it to be valid). if (TableInfo) { LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP); } if (Block.BlockStatus == Frontend::Decoder::DecodedBlockStatus::INVALID_INST || Block.BlockStatus == Frontend::Decoder::DecodedBlockStatus::BAD_RELOCATION) { Thread->OpDispatcher->InvalidOp(DecodedInfo); } else { Thread->OpDispatcher->NoExecOp(DecodedInfo); } } HadInvalidInst = true; } const bool NeedsBlockEnd = (HadDispatchError && TotalInstructions > 0) || (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock) || HadInvalidInst; // If we had a dispatch error then leave early if (HadDispatchError && TotalInstructions == 0) { // Couldn't handle any instruction in op dispatcher Thread->OpDispatcher->DelayedDisownBuffer(); return {std::nullopt, 0, 0, 0, 0}; } if (NeedsBlockEnd) { // We had some instructions. 
Early exit Thread->OpDispatcher->ExitFunction( Thread->OpDispatcher->_InlineEntrypointOffset(GPRSize, Block.Entry + BlockInstructionsLength - GuestRIP)); break; } if (Thread->OpDispatcher->FinishOp(DecodedInfo->PC + DecodedInfo->InstSize, i + 1 == InstsInBlock)) { break; } } } #ifdef ZYDIS_DISASSEMBLER if (FEXCore::Config::Get_X86DISASSEMBLE()) { LogMan::Msg::IFmt("Guest x86 End"); } #endif Thread->OpDispatcher->Finalize(); Thread->FrontendDecoder->DelayedDisownBuffer(); } IR::IREmitter* IREmitter = Thread->OpDispatcher.get(); auto ShouldDump = Thread->OpDispatcher->ShouldDumpIR(); // Debug if (ShouldDump) { IRDumper(Thread, IREmitter, GuestRIP); } // Run the passmanager over the IR from the dispatcher Thread->PassManager->Run(IREmitter); // Debug if (ShouldDump) { IRDumper(Thread, IREmitter, GuestRIP); } return { .IRView = IREmitter->ViewIR(), .TotalInstructions = TotalInstructions, .TotalInstructionsLength = TotalInstructionsLength, .StartAddr = Thread->FrontendDecoder->DecodedMinAddress, .Length = Thread->FrontendDecoder->DecodedMaxAddress - Thread->FrontendDecoder->DecodedMinAddress, .NeedsAddGuestCodeRanges = !HasCustomIR, }; } ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalThreadState* Thread, uint64_t GuestRIP, uint64_t MaxInst) { if (SourcecodeResolver && Config.GDBSymbols()) { auto MappedSection = SyscallHandler->LookupExecutableFileSection(Thread, GuestRIP); if (MappedSection) { MappedSection->FileInfo.SourcecodeMap = SourcecodeResolver->GenerateMap(MappedSection->FileInfo.Filename, CodeMap::GetBaseFilename(MappedSection->FileInfo, false)); } } // Generate IR + Meta Info auto [IRView, TotalInstructions, TotalInstructionsLength, StartAddr, Length, NeedsAddGuestCodeRanges] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst); if (!IRView) { // OpDispatcher IR already released in this case. 
// IR generation failed: hand back an empty result (no compiled code, null debug data).
    return {{}, nullptr, 0, 0, false};
  }

  // Attempt to get the CPU backend to compile this code

  // Re-check if another thread raced us in compiling this block.
  // We could lock CodeBufferWriteMutex earlier to prevent this from happening,
  // but this would increase lock contention. Redundant frontend runs aren't
  // as expensive and are easily reverted.
  // The re-check is skipped when MaxInst == 1, which is what CompileSingleStep() passes.
  if (MaxInst != 1) {
    if (auto Block = Thread->LookupCache->FindBlock(Thread, GuestRIP)) {
      // Raced to compile, release the OpDispatcher IR.
      Thread->OpDispatcher->DelayedDisownBuffer();
      // DebugData is left null here; CompileBlock() uses that to recognise the raced case.
      return {.CompiledCode = {.BlockBegin = reinterpret_cast(Block), .EntryPoints = {{GuestRIP, reinterpret_cast(Block)}}},
              .DebugData = nullptr,
              .StartAddr = 0,
              .Length = 0,
              .NeedsAddGuestCodeRanges = false};
    }
  }

  auto DebugData = fextl::make_unique();

  // If the trap flag is set we generate single instruction blocks that each check to generate a single step exception.
  bool TFSet = Thread->CurrentFrame->State.flags[X86State::RFLAG_TF_RAW_LOC];

  auto CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, TotalInstructions == 1, &*IRView, DebugData.get(), TFSet);

  // Release the IR
  Thread->OpDispatcher->DelayedDisownBuffer();

  return {
    .CompiledCode = std::move(CompiledCode),
    .DebugData = std::move(DebugData),
    .StartAddr = StartAddr,
    .Length = Length,
    .NeedsAddGuestCodeRanges = NeedsAddGuestCodeRanges,
  };
}

// Returns the host code address for the guest block at GuestRIP, compiling it if it is not already in the
// thread's lookup cache. Returns 0 when compilation produced no entry point for GuestRIP.
//
// On a fresh compile this also registers JIT symbols / GDB data when the matching options are enabled, tracks the
// guest pages the block covers, and inserts every entry point into the lookup cache.
uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst) {
  auto Thread = Frame->Thread;
  FEXCORE_PROFILE_SCOPED("CompileBlock");
  FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedJITTime);

  static_cast(Thread->CTX)->SyscallHandler->PreCompile();

  // Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
  auto lk = GuardSignalDeferringSection(CodeInvalidationMutex, Thread);

  // Is the code in the cache?
  // The backends only check L1 and L2, not L3
  if (auto HostCode = Thread->LookupCache->FindBlock(Thread, GuestRIP)) {
    return HostCode;
  }

  // Accumulate a JIT count now, as even if another thread raced us, it should count as a compile.
  FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedJITCount, 1);

  auto [CompiledCode, DebugData, StartAddr, Length, NeedsAddGuestCodeRanges] = CompileCode(Thread, GuestRIP, MaxInst);
  auto CodePtr = CompiledCode.EntryPoints[GuestRIP];
  if (CodePtr == nullptr) {
    return 0;
  } else if (!DebugData) {
    // DebugData wasn't populated, indicating another thread raced us for compiling this block
    return reinterpret_cast(CodePtr);
  }

  // The core managed to compile the code.
  if (Config.BlockJITNaming()) {
    auto FragmentBasePtr = CompiledCode.BlockBegin;

    auto GuestRIPLookup = SyscallHandler->LookupExecutableFileSection(Thread, GuestRIP);

    if (DebugData->Subblocks.size()) {
      for (auto& Subblock : DebugData->Subblocks) {
        auto BlockBasePtr = FragmentBasePtr + Subblock.HostCodeOffset;
        if (GuestRIPLookup) {
          // NOTE(review): this branch registers CompiledCode.Size for every subblock, while the branch below uses
          // Subblock.HostCodeSize - confirm which size is intended for file-backed subblocks.
          Symbols.Register(Thread->SymbolBuffer.get(), BlockBasePtr, CompiledCode.Size, GuestRIPLookup->FileInfo.Filename,
                           GuestRIP - GuestRIPLookup->FileStartVA);
        } else {
          Symbols.Register(Thread->SymbolBuffer.get(), BlockBasePtr, GuestRIP, Subblock.HostCodeSize);
        }
      }
    } else {
      if (GuestRIPLookup) {
        Symbols.Register(Thread->SymbolBuffer.get(), FragmentBasePtr, CompiledCode.Size, GuestRIPLookup->FileInfo.Filename,
                         GuestRIP - GuestRIPLookup->FileStartVA);
      } else {
        Symbols.Register(Thread->SymbolBuffer.get(), FragmentBasePtr, GuestRIP, CompiledCode.Size);
      }
    }
  }

  if (Config.LibraryJITNaming() || Config.GDBSymbols()) {
    auto MappedSection = SyscallHandler->LookupExecutableFileSection(Thread, GuestRIP);
    if (MappedSection) {
      if (Config.LibraryJITNaming()) {
        Symbols.RegisterNamedRegion(Thread->SymbolBuffer.get(), CodePtr, DebugData->HostCodeSize, MappedSection->FileInfo.Filename);
      }
      if (Config.GDBSymbols()) {
        GDBJITRegister(MappedSection->FileInfo, MappedSection->FileStartVA, GuestRIP, (uintptr_t)CodePtr, *DebugData);
      }
    }
  }

  // Clear any relocations that might have been generated
  if (!CodeCache.IsGeneratingCache) {
    Thread->CPUBackend->ClearRelocations();
  }

  // Stays empty when the block came from a custom IR handler (NeedsAddGuestCodeRanges == false).
  fextl::vector CodePages;
  if (NeedsAddGuestCodeRanges) {
    // Track in the guest to host map all entrypoints for all pages the compiled block touches, if any page didn't previously
    // contain code, inform the frontend so it can setup SMC detection.
    auto BlockInfo = Thread->FrontendDecoder->GetDecodedBlockInfo();
    CodePages.reserve(BlockInfo->CodePages.size());
    CodePages.insert(CodePages.end(), BlockInfo->CodePages.begin(), BlockInfo->CodePages.end());
    for (auto CodePage : BlockInfo->CodePages) {
      if (Thread->LookupCache->AddBlockExecutableRange(Thread, BlockInfo->EntryPoints, CodePage, FEXCore::Utils::FEX_PAGE_SIZE)) {
        SyscallHandler->MarkGuestExecutableRange(Thread, CodePage, FEXCore::Utils::FEX_PAGE_SIZE);
      }
    }
  }

  // Insert to lookup cache
  for (auto [GuestAddr, HostAddr] : CompiledCode.EntryPoints) {
    Thread->LookupCache->AddBlockMapping(Thread, GuestAddr, CodePages, HostAddr);
  }

  // Only blocks inside a file-backed region with a non-zero FileStartVA are appended to the code map.
  if (CodeMapWriter) {
    auto Region = SyscallHandler->LookupExecutableFileSection(Thread, GuestRIP);
    if (Region && Region->FileStartVA != 0) {
      CodeMapWriter->AppendBlock(*Region, GuestRIP);
    }
  }

  return (uintptr_t)CodePtr;
}

// Compiles exactly one guest instruction at GuestRIP (MaxInst == 1) and returns its host address, or 0 on failure.
// Unlike CompileBlock(), nothing is looked up in or inserted into the lookup cache here.
uintptr_t ContextImpl::CompileSingleStep(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP) {
  FEXCORE_PROFILE_SCOPED("CompileSingleStep");
  auto Thread = Frame->Thread;

  static_cast(Thread->CTX)->SyscallHandler->PreCompile();

  // Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
  auto lk = GuardSignalDeferringSection(CodeInvalidationMutex, Thread);

  auto [CompiledCode, DebugData, StartAddr, Length, _] = CompileCode(Thread, GuestRIP, 1);
  auto CodePtr = CompiledCode.EntryPoints[GuestRIP];
  if (CodePtr == nullptr) {
    return 0;
  }

  // Clear any relocations that might have been generated
  Thread->CPUBackend->ClearRelocations();

  return (uintptr_t)CodePtr;
}
void ContextImpl::InvalidateCodeBuffersCodeRange(uint64_t Start, uint64_t Length) { FEXCORE_PROFILE_SCOPED("InvalidateCodeBuffersCodeRange"); LOGMAN_THROW_A_FMT(CodeInvalidationMutex.try_lock() == false, "CodeInvalidationMutex needs to be unique_locked here"); std::scoped_lock lk {CodeBufferListLock}; auto it = CodeBufferList.begin(); while (it != CodeBufferList.end()) { if (auto Strong = it->lock()) { Strong->LookupCache->InvalidateRange(Start, Length); it++; } else { it = CodeBufferList.erase(it); } } } void ContextImpl::InvalidateThreadCachedCodeRange(FEXCore::Core::InternalThreadState* Thread, uint64_t Start, uint64_t Length) { LOGMAN_THROW_A_FMT(CodeInvalidationMutex.try_lock() == false, "CodeInvalidationMutex needs to be unique_locked here"); // Ensures now-modified mappings aren't cached as being in their previous non-executable state. // Accessing FrontendDecoder is safe as the thread's code invalidation mutex must be locked here. Thread->FrontendDecoder->ResetExecutableRangeCache(); if (Thread->LookupCache->InvalidateCacheRange(Start, Length)) { FEXCORE_PROFILE_SCOPED("InvalidateCallRet"); // This may cause access violations in the thread on Windows as zeroing is not atomic, this is handled by the frontend Allocator::VirtualDontNeed(Thread->CallRetStackBase, FEXCore::Core::InternalThreadState::CALLRET_STACK_SIZE); } } void ContextImpl::ThreadRemoveCodeEntryFromJit(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP) { static_cast(Frame->Thread->CTX)->SyscallHandler->InvalidateGuestCodeRange(Frame->Thread, GuestRIP, 1); } std::optional ContextImpl::AddCustomIREntrypoint(uintptr_t Entrypoint, CustomIREntrypointHandler Handler, void* Creator, void* Data) { LOGMAN_THROW_A_FMT(Config.Is64BitMode || !(Entrypoint >> 32), "64-bit Entrypoint in 32-bit mode {:x}", Entrypoint); std::unique_lock lk(CustomIRMutex); auto InsertedIterator = CustomIRHandlers.emplace(Entrypoint, CustomIRHandlerEntry {Handler, Creator, Data}); HasCustomIRHandlers = true; if 
(!InsertedIterator.second) { const auto& [fn, Creator, Data] = InsertedIterator.first->second; return CustomIRResult(Creator, Data); } return std::nullopt; } void ContextImpl::AddThunkTrampolineIRHandler(uintptr_t Entrypoint, uintptr_t GuestThunkEntrypoint) { LOGMAN_THROW_A_FMT(Entrypoint, "Tried to link null pointer address to guest function"); LOGMAN_THROW_A_FMT(GuestThunkEntrypoint, "Tried to link address to null pointer guest function"); if (!Config.Is64BitMode) { LOGMAN_THROW_A_FMT((Entrypoint >> 32) == 0, "Tried to link 64-bit address in 32-bit mode"); LOGMAN_THROW_A_FMT((GuestThunkEntrypoint >> 32) == 0, "Tried to link 64-bit address in 32-bit mode"); } LogMan::Msg::DFmt("Thunks: Adding guest trampoline from address {:#x} to guest function {:#x}", Entrypoint, GuestThunkEntrypoint); auto Result = AddCustomIREntrypoint( Entrypoint, [this, GuestThunkEntrypoint](uintptr_t Entrypoint, FEXCore::IR::IREmitter* emit) { auto IRHeader = emit->_IRHeader(emit->Invalid(), Entrypoint, 0, 0, 0, 0); auto Block = emit->CreateCodeNode(true, 0); IRHeader.first->Blocks = emit->WrapNode(Block); emit->SetCurrentCodeBlock(Block); const auto GPRSize = this->Config.Is64BitMode ? IR::OpSize::i64Bit : IR::OpSize::i32Bit; // Thunk entry-points don't get cached, don't need to be padded. 
if (GPRSize == IR::OpSize::i64Bit) { IR::Ref R = emit->_StoreRegister(emit->Constant(Entrypoint), GPRSize); R->Reg = IR::PhysicalRegister(IR::RegClass::GPRFixed, X86State::REG_R11).Raw; } else { emit->_StoreContextFPR(GPRSize, emit->_VCastFromGPR(IR::OpSize::i64Bit, IR::OpSize::i64Bit, emit->Constant(Entrypoint)), offsetof(Core::CPUState, mm[0][0])); } emit->_ExitFunction(IR::OpSize::i64Bit, emit->Constant(GuestThunkEntrypoint), IR::BranchHint::None, emit->Invalid(), emit->Invalid()); }, ThunkHandler, (void*)GuestThunkEntrypoint); if (Result.has_value()) { if (Result->Creator != ThunkHandler) { ERROR_AND_DIE_FMT("Input address for AddThunkTrampoline is already linked by another module"); } if (Result->Data != (void*)GuestThunkEntrypoint) { // NOTE: This may happen in Vulkan thunks if the Vulkan driver resolves two different symbols // to the same function (e.g. vkGetPhysicalDeviceFeatures2/vkGetPhysicalDeviceFeatures2KHR) LogMan::Msg::EFmt("Input address for AddThunkTrampoline is already linked elsewhere"); } } } void ContextImpl::AddForceTSOInformation(const IntervalList& ValidRanges, fextl::set&& Instructions) { LogMan::Throw::AFmt(CodeInvalidationMutex.try_lock() == false, "CodeInvalidationMutex needs to be unique_locked here"); ForceTSOValidRanges.Insert(ValidRanges); ForceTSOInstructions.merge(std::move(Instructions)); } void ContextImpl::RemoveForceTSOInformation(uint64_t Address, uint64_t Size) { LogMan::Throw::AFmt(CodeInvalidationMutex.try_lock() == false, "CodeInvalidationMutex needs to be unique_locked here"); ForceTSOValidRanges.Remove({Address, Address + Size}); ForceTSOInstructions.erase(ForceTSOInstructions.lower_bound(Address), ForceTSOInstructions.upper_bound(Address + Size)); } void ContextImpl::MarkMonoBackpatcherBlock(uint64_t BlockEntry) { MonoBackpatcherBlock.store(BlockEntry, std::memory_order_relaxed); } void ContextImpl::RemoveCustomIREntrypoint(FEXCore::Core::InternalThreadState* Thread, uintptr_t Entrypoint) { 
LOGMAN_THROW_A_FMT(Config.Is64BitMode || !(Entrypoint >> 32), "64-bit Entrypoint in 32-bit mode {:x}", Entrypoint); std::scoped_lock lk(CustomIRMutex); CustomIRHandlers.erase(Entrypoint); HasCustomIRHandlers = !CustomIRHandlers.empty(); SyscallHandler->InvalidateGuestCodeRange(Thread, Entrypoint, 1); } void ContextImpl::MonoBackpatcherWrite(FEXCore::Core::CpuStateFrame* Frame, uint8_t Size, uint64_t Address, uint64_t Value) { auto Thread = Frame->Thread; auto CTX = static_cast(Thread->CTX); { auto lk = GuardSignalDeferringSection(CTX->CodeInvalidationMutex, Thread); if (Size == 8) { *reinterpret_cast(Address) = Value; } else if (Size == 4) { *reinterpret_cast(Address) = Value; } else { ERROR_AND_DIE_FMT("Unexpected write size for backpatcher: {}", Size); } } CTX->SyscallHandler->InvalidateGuestCodeRange(Thread, Address, Size); } void ContextImpl::ConfigureAOTGen(FEXCore::Core::InternalThreadState* Thread, fextl::set* ExternalBranches, uint64_t SectionMaxAddress) { Thread->FrontendDecoder->SetExternalBranches(ExternalBranches); } } // namespace FEXCore::Context ================================================ FILE: FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp ================================================ // SPDX-License-Identifier: MIT #include "Common/VectorRegType.h" #include "Interface/Context/Context.h" #include "Interface/Core/CPUBackend.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Core/LookupCache.h" #include "Utils/MemberFunctionToPointer.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef VIXL_SIMULATOR #include #endif #include #include #include namespace FEXCore::CPU { static void SleepThread(FEXCore::Context::ContextImpl* CTX, FEXCore::Core::CpuStateFrame* Frame) { CTX->SyscallHandler->SleepThread(CTX, Frame); } constexpr size_t MAX_DISPATCHER_CODE_SIZE = FEXCore::Utils::FEX_PAGE_SIZE * 4; Dispatcher::Dispatcher(FEXCore::Context::ContextImpl* ctx) : 
Arm64Emitter(ctx, FEXCore::Allocator::VirtualAlloc(MAX_DISPATCHER_CODE_SIZE, true), MAX_DISPATCHER_CODE_SIZE) , CTX {ctx} { EmitDispatcher(); FEXCore::Allocator::VirtualName("FEXMem_Misc", reinterpret_cast(GetBufferBase()), MAX_DISPATCHER_CODE_SIZE); } Dispatcher::~Dispatcher() { auto BufferSize = GetBufferSize(); if (BufferSize) { FEXCore::Allocator::VirtualFree(GetBufferBase(), BufferSize); } } void Dispatcher::EmitDispatcher() { // Don't modify TMP3 since it contains our RIP once the block doesn't exist auto RipReg = TMP3; #ifdef VIXL_DISASSEMBLER const auto DisasmBegin = GetCursorAddress(); #endif DispatchPtr = GetCursorAddress(); // while (true) { // Ptr = FindBlock(RIP) // if (!Ptr) // Ptr = CTX->CompileBlock(RIP); // // Ptr(); // } ARMEmitter::ForwardLabel l_CTX; ARMEmitter::ForwardLabel l_Sleep; ARMEmitter::ForwardLabel l_CompileBlock; ARMEmitter::ForwardLabel l_CompileSingleStep; // Push all the register we need to save PushCalleeSavedRegisters(); // Push our memory base to the correct register // Move our thread pointer to the correct register // This is passed in to parameter 0 (x0) mov(STATE, ARMEmitter::XReg::x0); // Save this stack pointer so we can cleanly shutdown the emulation with a long jump // regardless of where we were in the stack add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, ARMEmitter::Reg::rsp, 0); str(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, ReturningStackLocation)); ARMEmitter::ForwardLabel CompileSingleStep; AbsoluteLoopTopAddressFillSRA = GetCursorAddress(); FillStaticRegs(); ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip)); (void)cbnz(ARMEmitter::Size::i32Bit, ENTRY_FILL_SRA_SINGLE_INST_REG, &CompileSingleStep); ARMEmitter::BiDirectionalLabel LoopTop {}; #ifdef ARCHITECTURE_arm64ec (void)b(&LoopTop); AbsoluteLoopTopAddressEnterECFillSRA = GetCursorAddress(); ldr(STATE, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_DATA_OFFSET); FillStaticRegs(); ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip)); // Force a single instruction block 
if ENTRY_FILL_SRA_SINGLE_INST_REG is nonzero entering the JIT, used for inline SMC handling. (void)cbnz(ARMEmitter::Size::i32Bit, ENTRY_FILL_SRA_SINGLE_INST_REG, &CompileSingleStep); // Enter JIT (void)b(&LoopTop); AbsoluteLoopTopAddressEnterEC = GetCursorAddress(); // Load ThreadState and write the target PC there ldr(STATE, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_DATA_OFFSET); str(EC_CALL_CHECKER_PC_REG, STATE_PTR(CpuStateFrame, State.rip)); // Swap stacks to the emulator stack ldr(TMP1, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_STACK_BASE_OFFSET); add(ARMEmitter::Size::i64Bit, StaticRegisters[X86State::REG_RSP], ARMEmitter::Reg::rsp, 0); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, TMP1, 0); ldr(REG_CALLRET_SP, STATE_PTR(CpuStateFrame, State.callret_sp)); FillSpecialRegs(TMP1, TMP2, false, true); // As ARM64EC uses this as an entrypoint for both guest calls and host returns, opportunistically try to return // using the call-ret stack to avoid unbalancing it. ldp(TMP1, TMP2, REG_CALLRET_SP); // EC_CALL_CHECKER_PC_REG is REG_PF which isn't touched by any of the above sub(ARMEmitter::Size::i64Bit, TMP1, EC_CALL_CHECKER_PC_REG, TMP1); (void)cbnz(ARMEmitter::Size::i64Bit, TMP1, &LoopTop); // If the entry at the TOS is for the target address, pop it and return to the JIT code add(ARMEmitter::Size::i64Bit, REG_CALLRET_SP, REG_CALLRET_SP, 0x10); ret(TMP2); // Enter JIT #endif // We want to ensure that we are 16 byte aligned at the top of this loop Align16B(); (void)Bind(&LoopTop); AbsoluteLoopTopAddress = GetCursorAddress(); // Load in our RIP ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip)); #ifdef ARCHITECTURE_arm64ec // Clobbers TMP1/2 // Check the EC code bitmap incase we need to exit the JIT to call into native code. 
ARMEmitter::ForwardLabel l_NotECCode; ldr(TMP1, ARMEmitter::XReg::x18, TEB_PEB_OFFSET); ldr(TMP1, TMP1, PEB_EC_CODE_BITMAP_OFFSET); lsr(ARMEmitter::Size::i64Bit, TMP2, RipReg, 15); and_(ARMEmitter::Size::i64Bit, TMP2, TMP2, 0x1fffffffffff8); ldr(TMP1, TMP1, TMP2, ARMEmitter::ExtendedType::LSL_64, 0); lsr(ARMEmitter::Size::i64Bit, TMP2, RipReg, 12); lsrv(ARMEmitter::Size::i64Bit, TMP1, TMP1, TMP2); (void)tbz(TMP1, 0, &l_NotECCode); str(REG_CALLRET_SP, STATE_PTR(CpuStateFrame, State.callret_sp)); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, StaticRegisters[X86State::REG_RSP], 0); mov(EC_CALL_CHECKER_PC_REG, RipReg); ldr(TMP2, STATE_PTR(CpuStateFrame, Pointers.ExitFunctionEC)); br(TMP2); (void)Bind(&l_NotECCode); #endif ldrb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC])); (void)cbnz(ARMEmitter::Size::i32Bit, TMP1, &CompileSingleStep); ARMEmitter::ForwardLabel NoBlock; if (DisableL2Cache()) { (void)b(&NoBlock); } else { // This is the block cache lookup routine // It matches what is going on it LookupCache.h::FindBlock ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.L2Pointer)); // Mask the address by the virtual address size so we can check for aliases uint64_t VirtualMemorySize = CTX->Config.VirtualMemSize; if (std::popcount(VirtualMemorySize) == 1) { and_(ARMEmitter::Size::i64Bit, TMP4, RipReg.R(), VirtualMemorySize - 1); } else { LoadConstant(ARMEmitter::Size::i64Bit, TMP4, VirtualMemorySize); and_(ARMEmitter::Size::i64Bit, TMP4, RipReg.R(), TMP4); } { // Offset the address and add to our page pointer lsr(ARMEmitter::Size::i64Bit, TMP2, TMP4, 12); // Load the pointer from the offset ldr(TMP1, TMP1, TMP2, ARMEmitter::ExtendedType::LSL_64, 3); // If page pointer is zero then we have no block (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &NoBlock); // Steal the page offset and_(ARMEmitter::Size::i64Bit, TMP2, TMP4, 0x0FFF); // Shift the offset by the size of the block cache entry add(TMP1, TMP1, TMP2, ARMEmitter::ShiftType::LSL, 
FEXCore::ilog2(sizeof(LookupCache::LookupCacheEntry))); // The the full LookupCacheEntry with a single LDP. // Check the guest address first to ensure it maps to the address we are currently at. // This fixes aliasing problems ldp(TMP4, TMP2, TMP1, 0); // If the guest address doesn't match, Compile the block. sub(TMP2, TMP2, RipReg); (void)cbnz(ARMEmitter::Size::i64Bit, TMP2, &NoBlock); // Check the host address to see if it matches, else compile the block. (void)cbz(ARMEmitter::Size::i64Bit, TMP4, &NoBlock); // If we've made it here then we have a real compiled block { // update L1 cache ldp(TMP1, TMP2, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.L1Pointer)); // Calculate (tmp1 + ((ripreg & L1_ENTRIES_MASK) << 4)) for the address // L1Mask is pre-shifted. and_(ARMEmitter::Size::i64Bit, TMP2, TMP2, RipReg.R(), ARMEmitter::ShiftType::LSL, FEXCore::ilog2(sizeof(LookupCache::LookupCacheEntry))); add(TMP1, TMP1, TMP2); stp(TMP4, RipReg, TMP1); // Jump to the block br(TMP4); } } } { ThreadStopHandlerAddressSpillSRA = GetCursorAddress(); SpillStaticRegs(TMP1); ThreadStopHandlerAddress = GetCursorAddress(); PopCalleeSavedRegisters(); // Return from the function // LR is set to the correct return location now ret(); } // Clobbers TMP1/2 auto EmitSignalGuardedRegion = [&](auto Body) { #ifndef _WIN32 ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1); str(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); #endif #ifdef ARCHITECTURE_arm64ec ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET); LoadConstant(ARMEmitter::Size::i32Bit, TMP1, 1); strb(TMP1.W(), TMP2, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET); #endif Body(); #ifdef ARCHITECTURE_arm64ec ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET); strb(ARMEmitter::WReg::zr, TMP2, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET); #endif #ifndef _WIN32 ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); 
sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1); str(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); // Trigger segfault if any deferred signals are pending strb(ARMEmitter::XReg::zr, STATE, offsetof(FEXCore::Core::InternalThreadState, InterruptFaultPage) - offsetof(FEXCore::Core::InternalThreadState, BaseFrameState)); #endif }; { ExitFunctionLinkerAddress = GetCursorAddress(); EmitSignalGuardedRegion([&]() { SpillStaticRegs(TMP1); mov(ARMEmitter::XReg::x0, STATE); mov(ARMEmitter::XReg::x1, ARMEmitter::XReg::lr); ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.ExitFunctionLink)); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2); } else { blr(ARMEmitter::Reg::r2); } if (!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); } FillStaticRegs(); }); br(TMP1); } // Need to create the block { (void)Bind(&NoBlock); EmitSignalGuardedRegion([&]() { SpillStaticRegs(TMP1); if (!TMP_ABIARGS) { mov(ARMEmitter::XReg::x2, RipReg); } ldr(ARMEmitter::XReg::x0, &l_CTX); mov(ARMEmitter::XReg::x1, STATE); // x2 contains guest RIP mov(ARMEmitter::XReg::x3, 0); ldr(ARMEmitter::XReg::x4, &l_CompileBlock); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r4); } else { blr(ARMEmitter::Reg::r4); // { CTX, Frame, RIP, MaxInst } } // Result is now in x0 if (!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); } FillStaticRegs(); }); // Jump to the compiled block br(TMP1); } { (void)Bind(&CompileSingleStep); EmitSignalGuardedRegion([&]() { SpillStaticRegs(TMP1); if (!TMP_ABIARGS) { mov(ARMEmitter::XReg::x2, RipReg); } ldr(ARMEmitter::XReg::x0, &l_CTX); mov(ARMEmitter::XReg::x1, STATE); // x2 contains guest RIP ldr(ARMEmitter::XReg::x4, &l_CompileSingleStep); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r4); } else { blr(ARMEmitter::Reg::r4); // { CTX, Frame, RIP } } // Result is now in x0 if 
(!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); } FillStaticRegs(); }); // Jump to the compiled block br(TMP1); } { SignalHandlerReturnAddress = GetCursorAddress(); // Now to get back to our old location we need to do a fault dance // We can't use SIGTRAP here since gdb catches it and never gives it to the application! hlt(0); } { SignalHandlerReturnAddressRT = GetCursorAddress(); // Now to get back to our old location we need to do a fault dance // We can't use SIGTRAP here since gdb catches it and never gives it to the application! hlt(0); } { // Guest SIGILL handler // Needs to be distinct from the SignalHandlerReturnAddress GuestSignal_SIGILL = GetCursorAddress(); SpillStaticRegs(TMP1); hlt(0); } { // Guest SIGTRAP handler // Needs to be distinct from the SignalHandlerReturnAddress GuestSignal_SIGTRAP = GetCursorAddress(); SpillStaticRegs(TMP1); brk(0); } { // Guest Overflow handler // Needs to be distinct from the SignalHandlerReturnAddress GuestSignal_SIGSEGV = GetCursorAddress(); SpillStaticRegs(TMP1); // hlt/udf = SIGILL // brk = SIGTRAP // ??? 
= SIGSEGV // Force a SIGSEGV by loading zero if (CTX->ExitOnHLTEnabled()) { ldr(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, ReturningStackLocation)); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::r0, 0); PopCalleeSavedRegisters(); ret(); } else { LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, 0); ldr(ARMEmitter::XReg::x1, ARMEmitter::Reg::r1); } } { ThreadPauseHandlerAddressSpillSRA = GetCursorAddress(); SpillStaticRegs(TMP1); ThreadPauseHandlerAddress = GetCursorAddress(); // We are pausing, this means the frontend should be waiting for this thread to idle // We will have faulted and jumped to this location at this point // Call our sleep handler ldr(ARMEmitter::XReg::x0, &l_CTX); mov(ARMEmitter::XReg::x1, STATE); ldr(ARMEmitter::XReg::x2, &l_Sleep); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2); } else { blr(ARMEmitter::Reg::r2); } PauseReturnInstruction = GetCursorAddress(); // Fault to start running again hlt(0); } { // The expectation here is that a thunked function needs to call back in to the JIT in a reentrant safe way // To do this safely we need to do some state tracking and register saving // // eg: // JIT Call-> // Thunk-> // Thunk callback-> // // The thunk callback needs to execute JIT code and when it returns, it needs to safely return to the thunk rather than JIT space // This is handled by pushing a return address trampoline to the stack so when the guest address returns it hits our custom thunk return // - This will safely return us to the thunk // // On return to the thunk, the thunk can get whatever its return value is from the thread context depending on ABI handling on its end // When the thunk itself returns, it'll do its regular return logic there // void ReentrantCallback(FEXCore::Core::InternalThreadState *Thread, uint64_t RIP); CallbackPtr = GetCursorAddress(); // We expect the thunk to have previously pushed the registers it was using 
PushCalleeSavedRegisters(); // First thing we need to move the thread state pointer back in to our register mov(STATE, ARMEmitter::XReg::x0); // Make sure to adjust the refcounter so we don't clear the cache now ldr(ARMEmitter::WReg::w2, STATE_PTR(CpuStateFrame, SignalHandlerRefCounter)); add(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r2, 1); str(ARMEmitter::WReg::w2, STATE_PTR(CpuStateFrame, SignalHandlerRefCounter)); // Now push the callback return trampoline to the guest stack // Guest will be misaligned because calling a thunk won't correct the guest's stack once we call the callback from the host ldr(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, Pointers.ThunkCallbackRet)); ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, State.gregs[X86State::REG_RSP])); sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r2, CTX->Config.Is64BitMode ? 16 : 12); str(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, State.gregs[X86State::REG_RSP])); // Store the trampoline to the guest stack // Guest stack is now correctly misaligned after a regular call instruction str(ARMEmitter::XReg::x0, ARMEmitter::Reg::r2, 0); // Store RIP to the context state str(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, State.rip)); // load static regs FillStaticRegs(); stp(ARMEmitter::XReg::zr, ARMEmitter::XReg::zr, REG_CALLRET_SP, -0x10); // Now go back to the regular dispatcher loop (void)b(&LoopTop); } auto EmitLongALUOpHandler = [&](auto R, auto Offset) { auto Address = GetCursorAddress(); PushDynamicRegs(TMP4); SpillStaticRegs(TMP4); if (!TMP_ABIARGS) { mov(ARMEmitter::XReg::x0, TMP1); mov(ARMEmitter::XReg::x1, TMP2); mov(ARMEmitter::XReg::x2, TMP3); } ldr(ARMEmitter::XReg::x3, R, Offset); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall<__uint128_t, uint64_t, uint64_t, uint64_t>(ARMEmitter::Reg::r3); } else { blr(ARMEmitter::Reg::r3); } // Result is now in x0, x1 if (!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); 
mov(TMP2, ARMEmitter::XReg::x1); } FillStaticRegs(); // Fix the stack and any values that were stepped on PopDynamicRegs(); // Go back to our code block ret(); return Address; }; LUDIVHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.LUDIV)); LDIVHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.LDIV)); EmitF64Sin(); EmitF64Cos(); EmitF64Tan(); // Interpreter fallbacks { constexpr static std::array ABIS {{ FABI_F80_I16_F32_PTR, FABI_F80_I16_F64_PTR, FABI_F80_I16_I16_PTR, FABI_F80_I16_I32_PTR, FABI_F32_I16_F80_PTR, FABI_F64_I16_F80_PTR, FABI_F64_F64_PTR, FABI_F64_F64_F64_PTR, FABI_I16_I16_F80_PTR, FABI_I32_I16_F80_PTR, FABI_I64_I16_F80_PTR, FABI_I64_I16_F80_F80_PTR, FABI_F80_I16_F80_PTR, FABI_F80_I16_F80_F80_PTR, FABI_F80x2_I16_F80_PTR, FABI_F64x2_F64_PTR, FABI_I32_I64_I64_V128_V128_I16, FABI_I32_V128_V128_I16, }}; for (auto ABI : ABIS) { ABIPointers[ABI] = GenerateABICall(ABI); } } (void)Bind(&l_CTX); dc64(reinterpret_cast(CTX)); (void)Bind(&l_Sleep); dc64(reinterpret_cast(SleepThread)); (void)Bind(&l_CompileBlock); FEXCore::Utils::MemberFunctionToPointerCast PMFCompileBlock(&FEXCore::Context::ContextImpl::CompileBlock); dc64(PMFCompileBlock.GetConvertedPointer()); (void)Bind(&l_CompileSingleStep); FEXCore::Utils::MemberFunctionToPointerCast PMFCompileSingleStep(&FEXCore::Context::ContextImpl::CompileSingleStep); dc64(PMFCompileSingleStep.GetConvertedPointer()); Start = reinterpret_cast(DispatchPtr); End = GetCursorAddress(); ClearICache(reinterpret_cast(DispatchPtr), End - reinterpret_cast(DispatchPtr)); if (CTX->Config.BlockJITNaming()) { fextl::string Name = fextl::fmt::format("Dispatch_{}", FHU::Syscalls::gettid()); CTX->Symbols.RegisterNamedRegion(reinterpret_cast(DispatchPtr), End - reinterpret_cast(DispatchPtr), Name); } if (CTX->Config.GlobalJITNaming()) { CTX->Symbols.RegisterJITSpace(reinterpret_cast(DispatchPtr), End - reinterpret_cast(DispatchPtr)); } #ifdef VIXL_DISASSEMBLER if (Disassemble() & 
FEXCore::Config::Disassemble::DISPATCHER) { const auto DisasmEnd = GetCursorAddress(); for (auto PCToDecode = DisasmBegin; PCToDecode < DisasmEnd; PCToDecode += 4) { DisasmDecoder->Decode(PCToDecode); auto Output = Disasm->GetOutput(); LogMan::Msg::IFmt("{}", Output); } } #endif } #ifdef VIXL_SIMULATOR void Dispatcher::ExecuteDispatch(FEXCore::Core::CpuStateFrame* Frame) { Simulator.WriteXRegister(0, reinterpret_cast(Frame)); Simulator.WriteXRegister(1, 0); Simulator.RunFrom(reinterpret_cast< const vixl::aarch64::Instruction*>(DispatchPtr)); } void Dispatcher::ExecuteJITCallback(FEXCore::Core::CpuStateFrame* Frame, uint64_t RIP) { Simulator.WriteXRegister(0, reinterpret_cast(Frame)); Simulator.WriteXRegister(1, RIP); Simulator.RunFrom(reinterpret_cast< const vixl::aarch64::Instruction*>(CallbackPtr)); } #endif void Dispatcher::EmitI32ToExtF80() { ARMEmitter::ForwardLabel ZeroCase; ARMEmitter::ForwardLabel Done; (void)cbz(ARMEmitter::Size::i32Bit, TMP2, &ZeroCase); lsr(ARMEmitter::Size::i32Bit, TMP4, TMP2, 31); tst(ARMEmitter::Size::i32Bit, TMP2, TMP2); neg(ARMEmitter::Size::i32Bit, TMP3, TMP2); csel(ARMEmitter::Size::i32Bit, TMP3, TMP3, TMP2, ARMEmitter::Condition::CC_MI); clz(ARMEmitter::Size::i32Bit, TMP1, TMP3); mov(ARMEmitter::Size::i32Bit, TMP2, 0x401E); sub(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP1); orr(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP4, ARMEmitter::ShiftType::LSL, 15); lslv(ARMEmitter::Size::i32Bit, TMP3, TMP3, TMP1); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&ZeroCase); movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); (void)Bind(&Done); } void Dispatcher::EmitI16ToExtF80() { sxth(ARMEmitter::Size::i32Bit, TMP2, TMP2); ARMEmitter::ForwardLabel ZeroCase; ARMEmitter::ForwardLabel Done; (void)cbz(ARMEmitter::Size::i32Bit, TMP2, &ZeroCase); lsr(ARMEmitter::Size::i32Bit, TMP4, TMP2, 31); tst(ARMEmitter::Size::i32Bit, 
TMP2, TMP2); neg(ARMEmitter::Size::i32Bit, TMP3, TMP2); csel(ARMEmitter::Size::i32Bit, TMP3, TMP3, TMP2, ARMEmitter::Condition::CC_MI); clz(ARMEmitter::Size::i32Bit, TMP1, TMP3); mov(ARMEmitter::Size::i32Bit, TMP2, 0x401E); sub(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP1); orr(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP4, ARMEmitter::ShiftType::LSL, 15); lslv(ARMEmitter::Size::i32Bit, TMP3, TMP3, TMP1); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&ZeroCase); movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); (void)Bind(&Done); } void Dispatcher::EmitF32ToExtF80() { ARMEmitter::ForwardLabel InfNaN; ARMEmitter::ForwardLabel ZeroDenormal; ARMEmitter::ForwardLabel Denormal; ARMEmitter::ForwardLabel NaN; ARMEmitter::ForwardLabel Done; ARMEmitter::BiDirectionalLabel NormalPath; ARMEmitter::ForwardLabel ZeroResult; fmov(ARMEmitter::Size::i32Bit, TMP1, VTMP1.S()); ubfx(ARMEmitter::Size::i32Bit, TMP2, TMP1, 23, 8); and_(ARMEmitter::Size::i32Bit, TMP3, TMP1, 0x007FFFFF); lsr(ARMEmitter::Size::i32Bit, TMP4, TMP1, 31); cmp(ARMEmitter::Size::i32Bit, TMP2, 0xFF); (void)b(ARMEmitter::Condition::CC_EQ, &InfNaN); (void)cbz(ARMEmitter::Size::i32Bit, TMP2, &ZeroDenormal); (void)Bind(&NormalPath); // Exponent bias adjustment, where bias is 0x3F80 LoadConstant(ARMEmitter::Size::i32Bit, TMP1, 0x3F80); add(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP1); orr(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP4, ARMEmitter::ShiftType::LSL, 15); // Set implicit bit and shift fraction to extF80 position LoadConstant(ARMEmitter::Size::i64Bit, TMP1, 0x00800000ULL); orr(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 40); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&ZeroDenormal); (void)cbz(ARMEmitter::Size::i32Bit, TMP3, &ZeroResult); (void)Bind(&Denormal); 
clz(ARMEmitter::Size::i32Bit, TMP1, TMP3); sub(ARMEmitter::Size::i32Bit, TMP1, TMP1, 8); mov(ARMEmitter::Size::i32Bit, TMP2, 1); sub(ARMEmitter::Size::i32Bit, TMP2, TMP2, TMP1); lslv(ARMEmitter::Size::i32Bit, TMP3, TMP3, TMP1); (void)b(&NormalPath); (void)Bind(&ZeroResult); lsl(ARMEmitter::Size::i32Bit, TMP2, TMP4, 15); movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&InfNaN); (void)cbnz(ARMEmitter::Size::i32Bit, TMP3, &NaN); lsl(ARMEmitter::Size::i32Bit, TMP2, TMP4, 15); orr(ARMEmitter::Size::i32Bit, TMP2, TMP2, 0x7FFF); LoadConstant(ARMEmitter::Size::i64Bit, TMP3, 0x8000000000000000ULL); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&NaN); lsl(ARMEmitter::Size::i32Bit, TMP2, TMP4, 15); orr(ARMEmitter::Size::i32Bit, TMP2, TMP2, 0x7FFF); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 40); LoadConstant(ARMEmitter::Size::i64Bit, TMP1, 0xC000000000000000ULL); orr(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)Bind(&Done); } void Dispatcher::EmitF64ToExtF80() { ARMEmitter::ForwardLabel InfNaN; ARMEmitter::ForwardLabel ZeroDenormal; ARMEmitter::ForwardLabel Denormal; ARMEmitter::ForwardLabel NaN; ARMEmitter::ForwardLabel Done; ARMEmitter::BiDirectionalLabel NormalPath; ARMEmitter::ForwardLabel ZeroResult; fmov(ARMEmitter::Size::i64Bit, TMP1, VTMP1.D()); lsr(ARMEmitter::Size::i64Bit, TMP4, TMP1, 63); ubfx(ARMEmitter::Size::i64Bit, TMP2, TMP1, 52, 11); LoadConstant(ARMEmitter::Size::i64Bit, TMP3, 0x000FFFFFFFFFFFFFULL); and_(ARMEmitter::Size::i64Bit, TMP3, TMP1, TMP3); cmp(ARMEmitter::Size::i64Bit, TMP2, 0x7FF); (void)b(ARMEmitter::Condition::CC_EQ, &InfNaN); (void)cbz(ARMEmitter::Size::i64Bit, TMP2, &ZeroDenormal); (void)Bind(&NormalPath); // Exponent bias adjustment where bias difference is 0x3C00 
add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 0x3000); add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 0xC00); orr(ARMEmitter::Size::i64Bit, TMP2, TMP2, TMP4, ARMEmitter::ShiftType::LSL, 15); LoadConstant(ARMEmitter::Size::i64Bit, TMP1, 0x0010000000000000ULL); orr(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 11); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&ZeroDenormal); (void)cbz(ARMEmitter::Size::i64Bit, TMP3, &ZeroResult); (void)Bind(&Denormal); clz(ARMEmitter::Size::i64Bit, TMP1, TMP3); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 11); mov(ARMEmitter::Size::i64Bit, TMP2, 1); sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, TMP1); lslv(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); (void)b(&NormalPath); (void)Bind(&ZeroResult); lsl(ARMEmitter::Size::i64Bit, TMP2, TMP4, 15); movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&InfNaN); (void)cbnz(ARMEmitter::Size::i64Bit, TMP3, &NaN); lsl(ARMEmitter::Size::i64Bit, TMP2, TMP4, 15); orr(ARMEmitter::Size::i64Bit, TMP2, TMP2, 0x7FFF); LoadConstant(ARMEmitter::Size::i64Bit, TMP3, 0x8000000000000000ULL); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)b(&Done); (void)Bind(&NaN); lsl(ARMEmitter::Size::i64Bit, TMP2, TMP4, 15); orr(ARMEmitter::Size::i64Bit, TMP2, TMP2, 0x7FFF); lsl(ARMEmitter::Size::i64Bit, TMP3, TMP3, 11); LoadConstant(ARMEmitter::Size::i64Bit, TMP1, 0xC000000000000000ULL); orr(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP3); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 4, TMP2); (void)Bind(&Done); } void Dispatcher::EmitF64Sin() { F64SinHandlerAddress = GetCursorAddress(); constexpr auto V2 = ARMEmitter::VReg::v2; constexpr auto V3 = ARMEmitter::VReg::v3; constexpr auto V4 = ARMEmitter::VReg::v4; constexpr auto V5 = 
ARMEmitter::VReg::v5; ARMEmitter::ForwardLabel Fallback, NonZero; ARMEmitter::ForwardLabel InvPiPi1Label, Pi23Label; ARMEmitter::ForwardLabel C0Label, C1Label, C2Label, C3Label, C4Label, C5Label, C6Label; ARMEmitter::ForwardLabel RangeLabel; // sin(+/-0) = +/-0 fmov(ARMEmitter::Size::i64Bit, TMP1, VTMP1.D()); lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1); (void)cbnz(ARMEmitter::Size::i64Bit, TMP1, &NonZero); ret(); (void)Bind(&NonZero); // Save q2-q5. stp(ARMEmitter::QReg::q2, ARMEmitter::QReg::q3, ARMEmitter::Reg::rsp, -64); stp(ARMEmitter::QReg::q4, ARMEmitter::QReg::q5, ARMEmitter::Reg::rsp, 32); // save nzcv mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); // Range check: fall back for |x| >= 2^23, NaN, and inf. fabs(VTMP2.D(), VTMP1.D()); ldr(V2.D(), &RangeLabel); fcmp(VTMP2.D(), V2.D()); (void)b(ARMEmitter::Condition::CC_HS, &Fallback); // n = rint(x/pi). ldr(V2.Q(), &InvPiPi1Label); // q2 = {inv_pi, pi_1} fmul(VTMP2.D(), VTMP1.D(), V2.D()); frinta(VTMP2.D(), VTMP2.D()); // odd = (int(n) & 1) << 63. fcvtzs(ARMEmitter::Size::i64Bit, TMP1, VTMP2.D()); lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, 63); // r = x - n*pi (range reduction) via .2D lane-indexed FMLS. ldr(V3.Q(), &Pi23Label); // q3 = {pi_2, pi_3} fmov(V4.D(), VTMP1.D()); // r = x fmls(ARMEmitter::SubRegSize::i64Bit, V4.Q(), VTMP2.Q(), V2.Q(), 1); // r -= n * pi_1 fmls(ARMEmitter::SubRegSize::i64Bit, V4.Q(), VTMP2.Q(), V3.Q(), 0); // r -= n * pi_2 fmls(ARMEmitter::SubRegSize::i64Bit, V4.Q(), VTMP2.Q(), V3.Q(), 1); // r -= n * pi_3 // r^2, r^4. fmul(V5.D(), V4.D(), V4.D()); fmov(ARMEmitter::Size::i64Bit, TMP2, V4.D()); fmul(V3.D(), V5.D(), V5.D()); // Estrin polynomial: p = c0 + r2*c1 + r4*(c2 + r2*c3) + r8*(c4 + r2*c5 + r4*c6). // Level 1 (independent FMAs). 
ldr(VTMP1.D(), &C0Label); ldr(VTMP2.D(), &C1Label); fmadd(VTMP1.D(), V5.D(), VTMP2.D(), VTMP1.D()); // p01 = c0 + r2*c1 ldr(VTMP2.D(), &C2Label); ldr(V2.D(), &C3Label); fmadd(VTMP2.D(), V5.D(), V2.D(), VTMP2.D()); // p23 = c2 + r2*c3 ldr(V2.D(), &C4Label); ldr(V4.D(), &C5Label); fmadd(V2.D(), V5.D(), V4.D(), V2.D()); // p45 = c4 + r2*c5 // Level 2 (serial). ldr(V4.D(), &C6Label); fmadd(V2.D(), V3.D(), V4.D(), V2.D()); // p46 = p45 + r4*c6 fmadd(VTMP2.D(), V3.D(), V2.D(), VTMP2.D()); // p26 = p23 + r4*p46 fmadd(VTMP1.D(), V3.D(), VTMP2.D(), VTMP1.D()); // p06 = p01 + r4*p26 // y = r + r^3 * p06. fmov(ARMEmitter::Size::i64Bit, V4.D(), TMP2); fmul(V5.D(), V5.D(), V4.D()); fmadd(VTMP1.D(), V5.D(), VTMP1.D(), V4.D()); // result = y XOR odd. fmov(ARMEmitter::Size::i64Bit, TMP2, VTMP1.D()); eor(ARMEmitter::Size::i64Bit, TMP2, TMP2, TMP1); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP2); // restore nzcv ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); // Restore q2-q5 and return. ldp(ARMEmitter::QReg::q4, ARMEmitter::QReg::q5, ARMEmitter::Reg::rsp, 32); ldp(ARMEmitter::QReg::q2, ARMEmitter::QReg::q3, ARMEmitter::Reg::rsp, 64); ret(); // Fallback path. (void)Bind(&Fallback); ldp(ARMEmitter::QReg::q4, ARMEmitter::QReg::q5, ARMEmitter::Reg::rsp, 32); ldp(ARMEmitter::QReg::q2, ARMEmitter::QReg::q3, ARMEmitter::Reg::rsp, 64); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64SIN].ABIHandler)); ldr(TMP4, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64SIN].Func)); blr(TMP1); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); ret(); // Constant pool. 
Align(16); (void)Bind(&InvPiPi1Label); dc64(0x3FD4'5F30'6DC9'C883ULL); // inv_pi dc64(0x4009'21FB'5444'2D18ULL); // pi_1 (void)Bind(&Pi23Label); dc64(0x3CA1'A626'3314'5C06ULL); // pi_2 dc64(0x395C'1CD1'2902'4E09ULL); // pi_3 (void)Bind(&C0Label); dc64(0xBFC5'5555'5555'547BULL); // c0 (void)Bind(&C1Label); dc64(0x3F81'1111'1110'8A4DULL); // c1 (void)Bind(&C2Label); dc64(0xBF2A'01A0'1993'6F27ULL); // c2 (void)Bind(&C3Label); dc64(0x3EC7'1DE3'7A97'D93EULL); // c3 (void)Bind(&C4Label); dc64(0xBE5A'E633'9199'87C6ULL); // c4 (void)Bind(&C5Label); dc64(0x3DE6'0E27'7AE0'7CECULL); // c5 (void)Bind(&C6Label); dc64(0xBD69'E954'0300'A100ULL); // c6 (void)Bind(&RangeLabel); dc64(0x4160'0000'0000'0000ULL); // 2^23 } void Dispatcher::EmitF64Cos() { F64CosHandlerAddress = GetCursorAddress(); constexpr auto Accum = ARMEmitter::VReg::v2; ARMEmitter::ForwardLabel Fallback; ARMEmitter::ForwardLabel RangeLabel, InvPiLabel; ARMEmitter::ForwardLabel Pi1Label, Pi2Label, Pi3Label; ARMEmitter::ForwardLabel C0Label, C1Label, C2Label, C3Label, C4Label, C5Label, C6Label; // Save q2 for use as accumulator str(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, -16); // save nzcv mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); // Range check: fall back for |x| >= 2^23, NaN, and inf. fabs(VTMP2.D(), VTMP1.D()); ldr(Accum.D(), &RangeLabel); fcmp(VTMP2.D(), Accum.D()); (void)b(ARMEmitter::Condition::CC_HS, &Fallback); // n = rint(x * (1/pi) + 0.5). ldr(Accum.D(), &InvPiLabel); fmov(ARMEmitter::ScalarRegSize::i64Bit, VTMP2, 0.5f); fmadd(VTMP2.D(), VTMP1.D(), Accum.D(), VTMP2.D()); frinta(VTMP2.D(), VTMP2.D()); // odd = (int(n) & 1) << 63. fcvtzs(ARMEmitter::Size::i64Bit, TMP1, VTMP2.D()); lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, 63); // Save input to Accum before overwriting VTMP1. fmov(Accum.D(), VTMP1.D()); // n = n - 0.5. 
fmov(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, 0.5f); fsub(VTMP2.D(), VTMP2.D(), VTMP1.D()); // r = x - n*pi (range reduction), in extended precision. ldr(VTMP1.D(), &Pi1Label); fmsub(Accum.D(), VTMP2.D(), VTMP1.D(), Accum.D()); ldr(VTMP1.D(), &Pi2Label); fmsub(Accum.D(), VTMP2.D(), VTMP1.D(), Accum.D()); ldr(VTMP1.D(), &Pi3Label); fmsub(Accum.D(), VTMP2.D(), VTMP1.D(), Accum.D()); // sin(r) poly approx. fmul(VTMP1.D(), Accum.D(), Accum.D()); fmov(ARMEmitter::Size::i64Bit, TMP2, Accum.D()); // Horner: p = c6 + r2*(c5 + r2*(... + r2*c0)). ldr(VTMP2.D(), &C6Label); ldr(Accum.D(), &C5Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C4Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C3Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C2Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C1Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C0Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); // y = r + r^3 * p. fmov(ARMEmitter::Size::i64Bit, Accum.D(), TMP2); fmul(VTMP1.D(), VTMP1.D(), Accum.D()); fmadd(Accum.D(), VTMP1.D(), VTMP2.D(), Accum.D()); // result = y XOR odd. fmov(ARMEmitter::Size::i64Bit, TMP2, Accum.D()); eor(ARMEmitter::Size::i64Bit, TMP2, TMP2, TMP1); fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), TMP2); // restore nzcv ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); // Restore q2 and return. ldr(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, 16); ret(); // Fallback path. 
(void)Bind(&Fallback); ldr(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, 16); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64COS].ABIHandler)); ldr(TMP4, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64COS].Func)); blr(TMP1); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); ret(); // Constant pool. Align(16); (void)Bind(&InvPiLabel); dc64(0x3FD4'5F30'6DC9'C883ULL); // inv_pi (void)Bind(&Pi1Label); dc64(0x4009'21FB'5444'2D18ULL); // pi_1 (void)Bind(&Pi2Label); dc64(0x3CA1'A626'3314'5C06ULL); // pi_2 (void)Bind(&Pi3Label); dc64(0x395C'1CD1'2902'4E09ULL); // pi_3 (void)Bind(&C0Label); dc64(0xBFC5'5555'5555'547BULL); // c0 (void)Bind(&C1Label); dc64(0x3F81'1111'1110'8A4DULL); // c1 (void)Bind(&C2Label); dc64(0xBF2A'01A0'1993'6F27ULL); // c2 (void)Bind(&C3Label); dc64(0x3EC7'1DE3'7A97'D93EULL); // c3 (void)Bind(&C4Label); dc64(0xBE5A'E633'9199'87C6ULL); // c4 (void)Bind(&C5Label); dc64(0x3DE6'0E27'7AE0'7CECULL); // c5 (void)Bind(&C6Label); dc64(0xBD69'E954'0300'A100ULL); // c6 (void)Bind(&RangeLabel); dc64(0x4160'0000'0000'0000ULL); // 2^23 } void Dispatcher::EmitF64Tan() { F64TanHandlerAddress = GetCursorAddress(); constexpr auto Accum = ARMEmitter::VReg::v2; ARMEmitter::ForwardLabel Fallback, NonZero; ARMEmitter::ForwardLabel RangeLabel, TwoOverPiLabel; ARMEmitter::ForwardLabel HalfPi0Label, HalfPi1Label; ARMEmitter::ForwardLabel C0Label, C1Label, C2Label, C3Label, C4Label, C5Label, C6Label, C7Label, C8Label; // tan(+/-0) = +/-0 fmov(ARMEmitter::Size::i64Bit, TMP1, VTMP1.D()); lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1); (void)cbnz(ARMEmitter::Size::i64Bit, TMP1, &NonZero); ret(); (void)Bind(&NonZero); // Save q2 for use as accumulator str(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, -16); // save nzcv mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); // 
Range check: fall back for |x| >= 2^23, NaN, and inf. fabs(VTMP2.D(), VTMP1.D()); ldr(Accum.D(), &RangeLabel); fcmp(VTMP2.D(), Accum.D()); (void)b(ARMEmitter::Condition::CC_HS, &Fallback); // q = nearest integer to 2 * x / pi. ldr(VTMP2.D(), &TwoOverPiLabel); fmul(VTMP2.D(), VTMP1.D(), VTMP2.D()); frinta(VTMP2.D(), VTMP2.D()); // qi = int(q). fcvtzs(ARMEmitter::Size::i64Bit, TMP1, VTMP2.D()); // r = x - q * pi/2 (range reduction), in extended precision. fmov(Accum.D(), VTMP1.D()); ldr(VTMP1.D(), &HalfPi0Label); fmsub(Accum.D(), VTMP2.D(), VTMP1.D(), Accum.D()); ldr(VTMP1.D(), &HalfPi1Label); fmsub(Accum.D(), VTMP2.D(), VTMP1.D(), Accum.D()); // Further reduce r to [-pi/8, pi/8]. fmov(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, 0.5f); fmul(Accum.D(), Accum.D(), VTMP1.D()); // Approximate tan(r) using order 8 polynomial. fmul(VTMP1.D(), Accum.D(), Accum.D()); fmov(ARMEmitter::Size::i64Bit, TMP2, Accum.D()); // Horner: p = C8 + r2*(C7 + r2*(... + r2*C0)). ldr(VTMP2.D(), &C8Label); ldr(Accum.D(), &C7Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C6Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C5Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C4Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C3Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C2Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C1Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); ldr(Accum.D(), &C0Label); fmadd(VTMP2.D(), VTMP1.D(), VTMP2.D(), Accum.D()); // p = r + r^3 * p. fmov(ARMEmitter::Size::i64Bit, Accum.D(), TMP2); fmul(VTMP1.D(), VTMP1.D(), Accum.D()); fmadd(Accum.D(), VTMP1.D(), VTMP2.D(), Accum.D()); // Double-angle reconstruction: tan(2x) = 2*tan(x) / (1 - tan^2(x)). 
fadd(VTMP1.D(), Accum.D(), Accum.D()); fmul(VTMP2.D(), Accum.D(), Accum.D()); fmov(ARMEmitter::ScalarRegSize::i64Bit, Accum, 1.0f); fsub(VTMP2.D(), VTMP2.D(), Accum.D()); ARMEmitter::ForwardLabel SkipSwap; (void)tbnz(TMP1, 0, &SkipSwap); fneg(Accum.D(), VTMP1.D()); fmov(VTMP1.D(), VTMP2.D()); fmov(VTMP2.D(), Accum.D()); (void)Bind(&SkipSwap); // result = numerator / denominator -> VTMP1. fdiv(VTMP1.D(), VTMP2.D(), VTMP1.D()); // restore nzcv ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); // Restore q2 and return. ldr(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, 16); ret(); // Fallback path. (void)Bind(&Fallback); ldr(ARMEmitter::QReg::q2, ARMEmitter::Reg::rsp, 16); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64TAN].ABIHandler)); ldr(TMP4, STATE_PTR(CpuStateFrame, Pointers.FallbackHandlerPointers[FEXCore::Core::OPINDEX_F64TAN].Func)); blr(TMP1); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); ret(); // Constant pool. 
Align(16); (void)Bind(&TwoOverPiLabel); dc64(0x3FE4'5F30'6DC9'C883ULL); // two_over_pi (void)Bind(&HalfPi0Label); dc64(0x3FF9'21FB'5444'2D18ULL); // half_pi[0] (void)Bind(&HalfPi1Label); dc64(0x3C91'A626'3314'5C07ULL); // half_pi[1] (void)Bind(&C0Label); dc64(0x3FD5'5555'5555'5556ULL); // C0 (void)Bind(&C1Label); dc64(0x3FC1'1111'1111'0A63ULL); // C1 (void)Bind(&C2Label); dc64(0x3FAB'A1BA'1BB4'6414ULL); // C2 (void)Bind(&C3Label); dc64(0x3F96'64F4'7E5B'5445ULL); // C3 (void)Bind(&C4Label); dc64(0x3F82'26E5'E5EC'DFA3ULL); // C4 (void)Bind(&C5Label); dc64(0x3F6D'6C7D'DBF8'7047ULL); // C5 (void)Bind(&C6Label); dc64(0x3F57'EA75'D05B'583EULL); // C6 (void)Bind(&C7Label); dc64(0x3F42'89F2'2964'A03CULL); // C7 (void)Bind(&C8Label); dc64(0x3F34'E4FD'1414'7622ULL); // C8 (void)Bind(&RangeLabel); dc64(0x4160'0000'0000'0000ULL); // 2^23 } uint64_t Dispatcher::GenerateABICall(FallbackABI ABI) { auto Address = GetCursorAddress(); constexpr static auto FallbackPointerReg = TMP4; constexpr static auto ABI1 = ARMEmitter::XReg::x0; constexpr static auto ABI2 = ARMEmitter::XReg::x1; constexpr static auto ABI3 = ARMEmitter::XReg::x2; constexpr static auto VABI1 = ARMEmitter::VReg::v0; constexpr static auto VABI2 = ARMEmitter::VReg::v1; auto FillF80x2Result = [&]() { if (!TMP_ABIARGS) { mov(VTMP1.Q(), VABI1.Q()); mov(VTMP2.Q(), VABI2.Q()); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillF64x2Result = [&]() { if (!TMP_ABIARGS) { fmov(VTMP1.D(), VABI1.D()); fmov(VTMP2.D(), VABI2.D()); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillF80Result = [&]() { if (VTMP1 != VABI1) { mov(VTMP1.Q(), VABI1.Q()); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillF64Result = [&]() { if (!TMP_ABIARGS) { fmov(VTMP1.D(), VABI1.D()); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillF32Result = [&]() { if (!TMP_ABIARGS) { fmov(VTMP1.S(), VABI1.S()); } 
FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillI64Result = [&]() { if (!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillI32Result = [&]() { if (!TMP_ABIARGS) { mov(TMP1.W(), ARMEmitter::WReg::w0); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; auto FillI16Result = [&]() { if (!TMP_ABIARGS) { mov(TMP1, ARMEmitter::XReg::x0); } FillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, true); }; switch (ABI) { case FABI_F80_I16_F32_PTR: { // Save NZCV - it's a static register (guest x86 flags) and the inline code clobbers it mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); EmitF32ToExtF80(); ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); } break; case FABI_F80_I16_F64_PTR: { mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); EmitF64ToExtF80(); ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); } break; case FABI_F80_I16_I16_PTR: { mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); EmitI16ToExtF80(); ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); } break; case FABI_F80_I16_I32_PTR: { mrs(TMP1, ARMEmitter::SystemRegister::NZCV); str(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); EmitI32ToExtF80(); ldr(TMP1.W(), STATE.R(), offsetof(FEXCore::Core::CpuStateFrame, State.flags[24])); msr(ARMEmitter::SystemRegister::NZCV, TMP1); } break; case FABI_F32_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): 
source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF32Result(); } break; case FABI_F64_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF64Result(); } break; case FABI_F64_F64_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); if (!TMP_ABIARGS) { fmov(VABI1.D(), VTMP1.D()); } mov(ARMEmitter::XReg::x0, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF64Result(); } break; case FABI_F64_F64_F64_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v17): vector source 2 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); if (!TMP_ABIARGS) { fmov(VABI1.D(), VTMP1.D()); fmov(VABI2.D(), VTMP2.D()); } mov(ARMEmitter::XReg::x0, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF64Result(); } break; case FABI_I16_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // 
x30: return // vtmp1 (v0/v16): source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI16Result(); } break; case FABI_I32_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI32Result(); } break; case FABI_I64_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): source SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI64Result(); } break; case FABI_I64_I16_F80_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v17): vector source 2 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); mov(VABI2.Q(), VTMP2.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { 
GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI64Result(); } break; case FABI_F80_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); mov(ARMEmitter::XReg::x1, STATE); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF80Result(); } break; case FABI_F80_I16_F80_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v17): vector source 2 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); mov(VABI2.Q(), VTMP2.Q()); } mov(ARMEmitter::XReg::x1, STATE); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF80Result(); } break; case FABI_F80x2_I16_F80_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v16): vector source 2 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); mov(ARMEmitter::XReg::x1, STATE); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); } if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { // GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF80x2Result(); } break; case FABI_F64x2_F64_PTR: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v16): vector source 2 
SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); mov(ARMEmitter::XReg::x0, STATE); if (!TMP_ABIARGS) { fmov(VABI1.D(), VTMP1.D()); } if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { // GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillF64x2Result(); } break; case FABI_I32_I64_I64_V128_V128_I16: { // Linux Reg/Win32 Reg: // stack: FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v17): vector source 2 // tmp1 (x0/x10): source 1 // tmp2 (x1/x11): source 2 // tmp3 (x2/x12): source 3 const size_t OriginalSPOffset = SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP4, true); // Load the Fallback handler pointer from the stack. ldr(FallbackPointerReg, ARMEmitter::XReg::rsp, OriginalSPOffset); if (!TMP_ABIARGS) { mov(ABI1, TMP1); mov(ABI2, TMP2); mov(ABI3, TMP3); mov(VABI1.Q(), VTMP1.Q()); mov(VABI2.Q(), VTMP2.Q()); } if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI32Result(); } break; case FABI_I32_V128_V128_I16: { // Linux Reg/Win32 Reg: // tmp4 (x4/x13): FallbackHandler // x30: return // vtmp1 (v0/v16): vector source 1 // vtmp2 (v1/v17): vector source 2 // tmp1 (x0/x10): source 1 SpillForABICall(CTX->HostFeatures.SupportsPreserveAllABI, TMP3, true); if (!TMP_ABIARGS) { mov(VABI1.Q(), VTMP1.Q()); mov(VABI2.Q(), VTMP2.Q()); mov(ABI1, TMP1); } if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(FallbackPointerReg); } else { blr(FallbackPointerReg); } FillI32Result(); } break; case FABI_UNKNOWN: default: #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED LOGMAN_MSG_A_FMT("Unhandled IR Fallback ABI: {}", ToUnderlying(ABI)); #endif break; } // Return to JIT ret(); return Address; } void Dispatcher::InitThreadPointers(FEXCore::Core::InternalThreadState* Thread) { // Setup dispatcher specific pointers that need to be accessed from JIT 
code { auto& Ptrs = Thread->CurrentFrame->Pointers; Ptrs.DispatcherLoopTop = AbsoluteLoopTopAddress; Ptrs.DispatcherLoopTopFillSRA = AbsoluteLoopTopAddressFillSRA; Ptrs.DispatcherLoopTopEnterEC = AbsoluteLoopTopAddressEnterEC; Ptrs.DispatcherLoopTopEnterECFillSRA = AbsoluteLoopTopAddressEnterECFillSRA; Ptrs.ExitFunctionLinker = ExitFunctionLinkerAddress; Ptrs.ThreadStopHandlerSpillSRA = ThreadStopHandlerAddressSpillSRA; Ptrs.ThreadPauseHandlerSpillSRA = ThreadPauseHandlerAddressSpillSRA; Ptrs.GuestSignal_SIGILL = GuestSignal_SIGILL; Ptrs.GuestSignal_SIGTRAP = GuestSignal_SIGTRAP; Ptrs.GuestSignal_SIGSEGV = GuestSignal_SIGSEGV; Ptrs.SignalReturnHandler = SignalHandlerReturnAddress; Ptrs.SignalReturnHandlerRT = SignalHandlerReturnAddressRT; Ptrs.LUDIVHandler = LUDIVHandlerAddress; Ptrs.LDIVHandler = LDIVHandlerAddress; Ptrs.F64SinHandler = F64SinHandlerAddress; Ptrs.F64CosHandler = F64CosHandlerAddress; Ptrs.F64TanHandler = F64TanHandlerAddress; // Fill in the fallback handlers InterpreterOps::FillFallbackIndexPointers(Ptrs.FallbackHandlerPointers, &ABIPointers[0]); } } SignalDelegatorConfig Dispatcher::MakeSignalDelegatorConfig() const { // PF/AF are the final two SRA registers. 
We only want GPRs const auto GPRCount = uint16_t(StaticRegisters.size() - 2); const auto FPRCount = uint16_t(StaticFPRegisters.size()); const auto GetSRAGPRMapping = [GPRCount, this] { SignalDelegatorConfig::SRAIndexMapping Mapping {}; for (size_t i = 0; i < GPRCount; ++i) { Mapping[i] = StaticRegisters[i].Idx(); } return Mapping; }; const auto GetSRAFPRMapping = [FPRCount, this] { SignalDelegatorConfig::SRAIndexMapping Mapping {}; for (size_t i = 0; i < FPRCount; ++i) { Mapping[i] = StaticFPRegisters[i].Idx(); } return Mapping; }; return FEXCore::SignalDelegatorConfig { .DispatcherBegin = Start, .DispatcherEnd = End, .AbsoluteLoopTopAddress = AbsoluteLoopTopAddress, .AbsoluteLoopTopAddressFillSRA = AbsoluteLoopTopAddressFillSRA, .SignalHandlerReturnAddress = SignalHandlerReturnAddress, .SignalHandlerReturnAddressRT = SignalHandlerReturnAddressRT, .PauseReturnInstruction = PauseReturnInstruction, .ThreadPauseHandlerAddressSpillSRA = ThreadPauseHandlerAddressSpillSRA, .ThreadPauseHandlerAddress = ThreadPauseHandlerAddress, // Stop handlers. .ThreadStopHandlerAddressSpillSRA = ThreadStopHandlerAddressSpillSRA, .ThreadStopHandlerAddress = ThreadStopHandlerAddress, // SRA information. 
.SRAGPRCount = GPRCount, .SRAFPRCount = FPRCount, .SRAGPRMapping = GetSRAGPRMapping(), .SRAFPRMapping = GetSRAFPRMapping(), }; } fextl::unique_ptr Dispatcher::Create(FEXCore::Context::ContextImpl* CTX) { return fextl::make_unique(CTX); } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/Interpreter/InterpreterOps.h" #include #include #include #include #include namespace FEXCore { struct GuestSigAction; struct SignalDelegatorConfig; } // namespace FEXCore namespace FEXCore::Core { struct CpuStateFrame; struct InternalThreadState; } // namespace FEXCore::Core namespace FEXCore::Context { class ContextImpl; } namespace FEXCore::CPU { #define STATE_PTR(STATE_TYPE, FIELD) STATE.R(), offsetof(FEXCore::Core::STATE_TYPE, FIELD) #define STATE_PTR_IDX(STATE_TYPE, FIELD, INDEX) STATE.R(), ARRAY_OFFSETOF(FEXCore::Core::STATE_TYPE, FIELD, INDEX) #define FALLBACK_HANDLER_OFFSET(INDEX, FIELD) \ STATE.R(), \ (ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, Pointers.FallbackHandlerPointers, INDEX) + offsetof(FEXCore::Core::FallbackABIInfo, FIELD)) class Dispatcher final : public Arm64Emitter { public: static fextl::unique_ptr Create(FEXCore::Context::ContextImpl* CTX); Dispatcher(FEXCore::Context::ContextImpl* ctx); ~Dispatcher(); void InitThreadPointers(FEXCore::Core::InternalThreadState* Thread); #ifdef VIXL_SIMULATOR void ExecuteDispatch(FEXCore::Core::CpuStateFrame* Frame); void ExecuteJITCallback(FEXCore::Core::CpuStateFrame* Frame, uint64_t RIP); #else void ExecuteDispatch(FEXCore::Core::CpuStateFrame* Frame) { DispatchPtr(Frame, false); } void ExecuteJITCallback(FEXCore::Core::CpuStateFrame* Frame, uint64_t RIP) { CallbackPtr(Frame, RIP); } #endif uint64_t GetExitFunctionLinkerAddress() const { return 
ExitFunctionLinkerAddress; } SignalDelegatorConfig MakeSignalDelegatorConfig() const; protected: FEXCore::Context::ContextImpl* CTX; using AsmDispatch = void (*)(FEXCore::Core::CpuStateFrame* Frame, bool SingleInst); using JITCallback = void (*)(FEXCore::Core::CpuStateFrame* Frame, uint64_t RIP); AsmDispatch DispatchPtr; JITCallback CallbackPtr; private: /** * @name Dispatch Helper functions * @{ */ uint64_t ThreadStopHandlerAddress {}; uint64_t ThreadStopHandlerAddressSpillSRA {}; uint64_t AbsoluteLoopTopAddress {}; uint64_t AbsoluteLoopTopAddressFillSRA {}; uint64_t AbsoluteLoopTopAddressEnterEC {}; uint64_t AbsoluteLoopTopAddressEnterECFillSRA {}; uint64_t ThreadPauseHandlerAddress {}; uint64_t ThreadPauseHandlerAddressSpillSRA {}; uint64_t ExitFunctionLinkerAddress {}; uint64_t SignalHandlerReturnAddress {}; uint64_t SignalHandlerReturnAddressRT {}; uint64_t GuestSignal_SIGILL {}; uint64_t GuestSignal_SIGTRAP {}; uint64_t GuestSignal_SIGSEGV {}; uint64_t PauseReturnInstruction {}; std::array ABIPointers {}; /** @} */ uint64_t Start {}; uint64_t End {}; // Long division helpers uint64_t LUDIVHandlerAddress {}; uint64_t LDIVHandlerAddress {}; // F64 trig shared handlers uint64_t F64SinHandlerAddress {}; uint64_t F64CosHandlerAddress {}; uint64_t F64TanHandlerAddress {}; void EmitDispatcher(); uint64_t GenerateABICall(FallbackABI ABI); // Inline softfloat conversion emitters - avoid FPCR save/restore overhead // These emit ARM64 code that performs the conversion using only integer ops void EmitI16ToExtF80(); void EmitI32ToExtF80(); void EmitF32ToExtF80(); void EmitF64ToExtF80(); void EmitF64Sin(); void EmitF64Cos(); void EmitF64Tan(); FEX_CONFIG_OPT(DisableL2Cache, DISABLEL2CACHE); }; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Frontend.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-meta-blocks desc: Extracts instruction & block 
// Translates a 3-bit ModRM/SIB/opcode register field plus its REX extension
// bit into an X86State register identifier.
//
// REX           - extension bit for the field; it becomes bit 3 of the lookup offset.
// bits          - the raw 3-bit register field.
// HighBits      - the operand is 8-bit; together with !HasREX this routes the
//                 lookup through GPR8BitHighIndexes.
// HasREX        - a REX prefix was decoded for this instruction.
// HasXMM        - the field names an XMM register (REG_XMM_0 + offset).
// HasMM         - the field names an MMX register (REG_MM_0 + bits).
// InvalidOffset - combined offset that means "no register here"; when the
//                 computed offset equals it, REG_INVALID is returned. The
//                 default of 16 lies outside the 0..15 range produced by a
//                 0/1 REX bit and a 3-bit field.
static uint32_t MapModRMToReg(uint8_t REX, uint8_t bits, bool HighBits, bool HasREX, bool HasXMM, bool HasMM, uint8_t InvalidOffset = 16) {
  // NOTE(review): the template arguments of std::array appear to be missing in
  // this copy of the file - restore them from upstream.
  using GPRArray = std::array;

  // Lookup used for 8-bit operands without a REX prefix. Entries 4..7 point
  // back at RAX/RCX/RDX/RBX: without REX those encodings select the high byte
  // of the low registers, and the "high byte" part is tracked separately by
  // the callers through the operand's HighBits field.
  static constexpr GPRArray GPR8BitHighIndexes = {
    // Classical ordering?
    FEXCore::X86State::REG_RAX, FEXCore::X86State::REG_RCX, FEXCore::X86State::REG_RDX, FEXCore::X86State::REG_RBX,
    FEXCore::X86State::REG_RAX, FEXCore::X86State::REG_RCX, FEXCore::X86State::REG_RDX, FEXCore::X86State::REG_RBX,
    FEXCore::X86State::REG_R8,  FEXCore::X86State::REG_R9,  FEXCore::X86State::REG_R10, FEXCore::X86State::REG_R11,
    FEXCore::X86State::REG_R12, FEXCore::X86State::REG_R13, FEXCore::X86State::REG_R14, FEXCore::X86State::REG_R15,
  };

  // REX supplies bit 3, the field supplies bits 0..2.
  uint8_t Offset = (REX << 3) | bits;

  if (Offset == InvalidOffset) {
    return FEXCore::X86State::REG_INVALID;
  }

  if (HasXMM) {
    return FEXCore::X86State::REG_XMM_0 + Offset;
  } else if (HasMM) {
    return FEXCore::X86State::REG_MM_0 + bits; // Ignore REX extension for MMX registers
  } else if (!(HighBits && !HasREX)) {
    // Any GPR case that is not "8-bit operand without REX" maps linearly.
    return FEXCore::X86State::REG_RAX + Offset;
  }

  // 8-bit operand without a REX prefix.
  return GPR8BitHighIndexes[Offset];
}
CTX->SyscallHandler->GetOSABI() : FEXCore::HLE::SyscallOSABI::OS_UNKNOWN} , PoolObject {CTX->FrontendAllocator, sizeof(FEXCore::X86Tables::DecodedInst) * DefaultDecodedBufferSize} { FEX_CONFIG_OPT(ReducedPrecision, X87REDUCEDPRECISION); if (ReducedPrecision) { X87Table = &FEXCore::X86Tables::X87F64Ops; } else { X87Table = &FEXCore::X86Tables::X87F80Ops; } if (CTX->HostFeatures.SupportsAVX && CTX->HostFeatures.SupportsSVE256) { VEXTable = &FEXCore::X86Tables::VEXTableOps; VEXTableGroup = &FEXCore::X86Tables::VEXTableGroupOps; } else if (CTX->HostFeatures.SupportsAVX) { VEXTable = &FEXCore::X86Tables::VEXTableOps_AVX128; VEXTableGroup = &FEXCore::X86Tables::VEXTableGroupOps_AVX128; } } bool Decoder::CheckRangeExecutable(uint64_t Address, uint64_t Size) { while (Address < ExecutableRangeBase || Address + Size > ExecutableRangeEnd) { auto RangeInfo = CTX->SyscallHandler->QueryGuestExecutableRange(Thread, Address); ExecutableRangeBase = RangeInfo.Base; ExecutableRangeEnd = RangeInfo.Base + RangeInfo.Size; ExecutableRangeWritable = RangeInfo.Writable; if (RangeInfo.Size == 0) { return false; } uint64_t RangeRemainingSize = ExecutableRangeEnd - Address; if (Size > RangeRemainingSize) { Size -= RangeRemainingSize; Address += RangeRemainingSize; } } return true; } uint8_t Decoder::ReadByte() { LOGMAN_THROW_A_FMT(InstructionSize < MAX_INST_SIZE, "Max instruction size exceeded!"); std::optional Byte = PeekByte(0); if (!Byte) { HitNonExecutableRange = true; // Pretend we read 0, the main decode loop will see HitNonExecutableRange and rollback the instruction. 
// Reads a `Size`-byte immediate/displacement from the instruction stream at
// the current decode position and advances past it.
//
// Size must be in 1..sizeof(uint64_t) (asserted).
//
// Returns a pair of {value, is-relocation}. The second element is true only
// when a relocation of matching kind and width is recorded for these bytes;
// in that case the value has EntryPoint subtracted from it.
//
// Side effects visible here:
//  - HitNonExecutableRange is set (and 0 is read) when the bytes are not in
//    an executable range.
//  - HitBadRelocation is set (and 0 is returned) when a relocation exists for
//    the bytes but its kind/width does not match the read.
//
// NOTE(review): template arguments (on std::pair, reinterpret_cast and
// static_cast) appear to have been lost in this copy of the file - restore
// them from upstream before building.
std::pair Decoder::ReadData(uint8_t Size) {
  LOGMAN_THROW_A_FMT(Size != 0 && Size <= sizeof(uint64_t), "Unknown data size to read");

  uint64_t Res = 0;
  // Address of the first byte to read; also used below for the relocation lookup.
  uint64_t Address = reinterpret_cast(InstStream + InstructionSize);
  if (CheckRangeExecutable(Address, Size)) {
    // Copy only `Size` bytes; the remaining bytes of Res stay zero.
    std::memcpy(&Res, &InstStream[InstructionSize], Size);
  } else {
    HitNonExecutableRange = true;
    // See PeekByte, this specific case may cause some executable memory to read as 0 but it doesn't matter as the entire instruction will be rolled back anyway.
    Res = 0;
  }

#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
  // Assertion builds consume the bytes one at a time through ReadByte().
  for (size_t i = 0; i < Size; ++i) {
    ReadByte();
  }
#else
  // Presumably just advances the decode position by Size - SkipBytes is not
  // defined in this part of the file.
  SkipBytes(Size);
#endif

  if (Relocations) {
    // Relocations are keyed by offset from SectionMinAddress.
    uint32_t SectionOffset = static_cast(Address - SectionMinAddress);
    if (auto It = Relocations->find(SectionOffset); It != Relocations->end()) {
      if (It->second == GuestRelocationType::Rel32 && Size == 4) {
        return {static_cast(static_cast(Res) - static_cast(EntryPoint)), true};
      } else if (It->second == GuestRelocationType::Rel64 && Size == 8) {
        return {static_cast(Res) - static_cast(EntryPoint), true};
      } else {
        // A relocation covers these bytes but not with a kind/width we can apply.
        HitBadRelocation = true;
        Res = 0;
      }
    }
  }
  return {Res, false};
}
// All encodings gain 16bit displacement // 0b110 = [BP] + disp16 uint32_t Literal {}; uint8_t DisplacementSize {}; if ((ModRM.mod == 0 && ModRM.rm == 0b110) || ModRM.mod == 0b10) { DisplacementSize = 2; } else if (ModRM.mod == 0b01) { DisplacementSize = 1; } if (DisplacementSize) { bool IsRelocation = false; std::tie(Literal, IsRelocation) = ReadData(DisplacementSize); LOGMAN_THROW_A_FMT(!IsRelocation, "1/2 byte relocations unsupported"); if (DisplacementSize == 1) { Literal = static_cast(Literal); } } Operand->Type = DecodedOperand::OpType::SIB; Operand->Data.SIB.Scale = 1; Operand->Data.SIB.Offset = Literal; // Only called when ModRM.mod != 0b11 struct Encodings { uint8_t Base; uint8_t Index; }; constexpr static std::array Lookup = {{ // Mod = 0b00 {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RDI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RDI}, {FEXCore::X86State::REG_RSI, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_RDI, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_INVALID, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_INVALID}, // Mod = 0b01 {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RDI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RDI}, {FEXCore::X86State::REG_RSI, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_RDI, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_INVALID}, {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_INVALID}, // Mod = 0b10 {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RDI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RSI}, {FEXCore::X86State::REG_RBP, FEXCore::X86State::REG_RDI}, 
// Decodes the memory form of a ModRM byte (plus optional SIB byte and
// displacement) using 32/64-bit addressing rules and fills in `Operand`.
//
// Resulting operand types:
//  - rm == 0b100 (mod != 0b11): SIB operand (or SIBRelocation when the
//    displacement carries a relocation).
//  - mod == 0, rm == 0b101:     RIPRelative (or RIPRelativeRelocation) with a
//                               4-byte literal.
//  - mod == 0, other rm:        GPRDirect on the rm register.
//  - mod == 1 or 2:             GPRIndirect (or GPRIndirectRelocation) with a
//                               1- or 4-byte displacement.
//
// May consume the SIB byte (via ReadByte) and displacement bytes (via
// ReadData) from the instruction stream.
//
// NOTE(review): the target types of the `static_cast(Literal)` expressions
// appear to have been lost in this copy of the file - restore from upstream.
void Decoder::DecodeModRM_64(X86Tables::DecodedOperand* Operand, X86Tables::ModRMDecoded ModRM) {
  // Displacement size in bytes for the SIB path; the non-SIB paths below
  // compute their own size and do not read this variable.
  uint8_t Displacement {};

  // Do we have an offset?
  if (ModRM.mod == 0b01) {
    Displacement = 1;
  } else if (ModRM.mod == 0b10) {
    Displacement = 4;
  } else if (ModRM.mod == 0 && ModRM.rm == 0b101) {
    Displacement = 4;
  }

  // Calculate SIB
  bool HasSIB = ((ModRM.mod != 0b11) && (ModRM.rm == 0b100));

  if (HasSIB) {
    FEXCore::X86Tables::SIBDecoded SIB;
    if (DecodeInst->Flags & DecodeFlags::FLAG_DECODED_SIB) {
      // SIB byte was already consumed earlier; reuse the stored value.
      SIB.Hex = DecodeInst->SIB;
    } else {
      // Haven't yet grabbed SIB, pull it now
      DecodeInst->SIB = ReadByte();
      SIB.Hex = DecodeInst->SIB;
      DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_SIB;
    }

    // If the SIB base is 0b101, aka BP or R13 then we have a 32bit displacement
    if (ModRM.mod == 0b00 && ModRM.rm == 0b100 && SIB.base == 0b101) {
      Displacement = 4;
    }

    // SIB
    Operand->Type = DecodedOperand::OpType::SIB;
    Operand->Data.SIB.Scale = 1 << SIB.scale;

    // The invalid encoding types are described at Table 1-12. "promoted nsigned is always non-zero"
    {
      // If we have a VSIB byte (as opposed to SIB), then the index register is a vector.
      // DecodeInst->TableInfo may be null in the case of 3DNow! ModRM decoding.
      const bool IsIndexVector = DecodeInst->TableInfo && (DecodeInst->TableInfo->Flags & InstFlags::FLAGS_VEX_VSIB) != 0;
      uint8_t InvalidSIBIndex = 0b100; ///< SIB Index where there is no register encoding.
      if (IsIndexVector) {
        DecodeInst->Flags |= X86Tables::DecodeFlags::FLAG_VSIB_BYTE;
        InvalidSIBIndex = ~0; ///< No Invalid SIB Index with Index Vectors.
      }

      const uint8_t IndexREX = (DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_X) != 0 ? 1 : 0;
      const uint8_t BaseREX = (DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_B) != 0 ? 1 : 0;

      // Index/base come back as REG_INVALID when the combined offset matches
      // the "invalid" value passed as the last argument. For the base that is
      // 0b101 only when mod == 0; otherwise 16 is passed.
      Operand->Data.SIB.Index = MapModRMToReg(IndexREX, SIB.index, false, false, IsIndexVector, false, InvalidSIBIndex);
      Operand->Data.SIB.Base = MapModRMToReg(BaseREX, SIB.base, false, false, false, false, ModRM.mod == 0 ? 0b101 : 16);
    }

    LOGMAN_THROW_A_FMT(Displacement <= 4, "Number of bytes should be <= 4 for literal src");

    if (Displacement) {
      auto [Literal, IsRelocation] = ReadData(Displacement);
      if (IsRelocation) {
        Operand->Type = DecodedOperand::OpType::SIBRelocation;
      }
      if (Displacement == 1) {
        // Presumably sign-extends the 8-bit displacement - cast target type
        // is missing in this copy, confirm against upstream.
        Literal = static_cast(Literal);
      }
      Operand->Data.SIB.Offset = Literal;
    }
  } else if (ModRM.mod == 0) {
    // Explained in Table 1-14. "Operand Addressing Using ModRM and SIB Bytes"
    if (ModRM.rm == 0b101) {
      // 32bit Displacement
      auto [Literal, IsRelocation] = ReadData(4);
      Operand->Type = IsRelocation ? DecodedOperand::OpType::RIPRelativeRelocation : DecodedOperand::OpType::RIPRelative;
      Operand->Data.RIPLiteral.Value = Literal;
    } else {
      // Register-direct addressing
      Operand->Type = DecodedOperand::OpType::GPRDirect;
      Operand->Data.GPR.GPR = MapModRMToReg(DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_B ? 1 : 0, ModRM.rm, false, false, false, false);
    }
  } else {
    // mod == 0b01 -> 1-byte displacement, anything else reaching here -> 4 bytes.
    uint8_t DisplacementSize = ModRM.mod == 1 ? 1 : 4;
    auto [Literal, IsRelocation] = ReadData(DisplacementSize);
    if (DisplacementSize == 1) {
      // Presumably sign-extends the 8-bit displacement - cast target type is
      // missing in this copy, confirm against upstream.
      Literal = static_cast(Literal);
    }

    Operand->Type = IsRelocation ? DecodedOperand::OpType::GPRIndirectRelocation : DecodedOperand::OpType::GPRIndirect;
    Operand->Data.GPRIndirect.GPR = MapModRMToReg(DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_B ? 1 : 0, ModRM.rm, false, false, false, false);
    Operand->Data.GPRIndirect.Displacement = Literal;
  }
}
HAS_XMM_SUBFLAG(Info->Flags, InstFlags::FLAGS_SF_MMX_DST); // Is ModRM present via explicit instruction encoded or REX? const bool HasMODRM = !!(Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_MODRM); const bool HasREX = !!(DecodeInst->Flags & DecodeFlags::FLAG_REX_PREFIX); const bool Has16BitAddressing = !BlockInfo.Is64BitMode && DecodeInst->Flags & DecodeFlags::FLAG_ADDRESS_SIZE; if (Options.w && (Info->Flags & InstFlags::FLAGS_REX_W_0)) { return false; } else if (!Options.w && (Info->Flags & InstFlags::FLAGS_REX_W_1)) { return false; } if (Options.L && (Info->Flags & InstFlags::FLAGS_VEX_L_0)) { return false; } else if (!Options.L && (Info->Flags & InstFlags::FLAGS_VEX_L_1)) { return false; } const bool UseVEXL = Options.L && !(Info->Flags & InstFlags::FLAGS_VEX_L_IGNORE); // This is used for ModRM register modification // For both modrm.reg and modrm.rm(when mod == 0b11) when value is >= 0b100 // then it changes from expected registers to the high 8bits of the lower registers // Bit annoying to support // In the case of no modrm (REX in byte situation) then it is unaffected bool Is8BitSrc {}; bool Is8BitDest {}; // If we require ModRM and haven't decoded it yet, do it now // Some instructions have to read modrm upfront, others do it later if (HasMODRM && !(DecodeInst->Flags & DecodeFlags::FLAG_DECODED_MODRM)) { DecodeInst->ModRM = ReadByte(); DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; } // New instruction size decoding { // Decode destinations first const auto DstSizeFlag = FEXCore::X86Tables::InstFlags::GetSizeDstFlags(Info->Flags); const auto SrcSizeFlag = FEXCore::X86Tables::InstFlags::GetSizeSrcFlags(Info->Flags); if (DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_8BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_8BIT); DestSize = 1; Is8BitDest = true; } else if (DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_16BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_16BIT); DestSize = 2; } 
else if (DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_128BIT) { if (UseVEXL) { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_256BIT); DestSize = 32; } else { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_128BIT); DestSize = 16; } } else if (DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_256BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_256BIT); DestSize = 32; } else if (HasNarrowingDisplacement && (DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_DEF || DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BITDEF)) { // See table 1-2. Operand-Size Overrides for this decoding // If the default operating mode is 32bit and we have the operand size flag then the operating size drops to 16bit DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_16BIT); DestSize = 2; } else if ((HasXMMDst || HasMMDst || BlockInfo.Is64BitMode) && (HasWideningDisplacement || DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BIT || DstSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BITDEF)) { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_64BIT); DestSize = 8; } else { DecodeInst->Flags |= DecodeFlags::GenSizeDstSize(DecodeFlags::SIZE_32BIT); DestSize = 4; } // Decode sources if (SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_8BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_8BIT); Is8BitSrc = true; } else if (SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_16BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_16BIT); } else if (SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_128BIT) { if (UseVEXL) { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_256BIT); } else { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_128BIT); } } else if (SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_256BIT) { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_256BIT); } else if 
(HasNarrowingDisplacement && (SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_DEF || SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BITDEF)) { // See table 1-2. Operand-Size Overrides for this decoding // If the default operating mode is 32bit and we have the operand size flag then the operating size drops to 16bit DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_16BIT); } else if ((HasXMMSrc || HasMMSrc || BlockInfo.Is64BitMode) && (HasWideningDisplacement || SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BIT || SrcSizeFlag == FEXCore::X86Tables::InstFlags::SIZE_64BITDEF)) { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_64BIT); } else { DecodeInst->Flags |= DecodeFlags::GenSizeSrcSize(DecodeFlags::SIZE_32BIT); } } auto* CurrentDest = &DecodeInst->Dest; if (HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_DST_RAX) || HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_DST_RDX)) { // Some instructions hardcode their destination as RAX CurrentDest->Type = DecodedOperand::OpType::GPR; CurrentDest->Data.GPR.HighBits = false; CurrentDest->Data.GPR.GPR = HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_DST_RAX) ? FEXCore::X86State::REG_RAX : FEXCore::X86State::REG_RDX; CurrentDest = &DecodeInst->Src[0]; } else if (HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_REX_IN_BYTE)) { LOGMAN_THROW_A_FMT(!HasMODRM, "This instruction shouldn't have ModRM!"); // If the REX is in the byte that means the lower nibble of the OP contains the destination GPR // This also means that the destination is always a GPR on these ones // ADDITIONALLY: // If there is a REX prefix then that allows extended GPR usage CurrentDest->Type = DecodedOperand::OpType::GPR; DecodeInst->Dest.Data.GPR.HighBits = (Is8BitDest && !HasREX && (Op & 0b111) >= 0b100); CurrentDest->Data.GPR.GPR = MapModRMToReg(DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_B ? 
1 : 0, Op & 0b111, Is8BitDest, HasREX, false, false); if (CurrentDest->Data.GPR.GPR == FEXCore::X86State::REG_INVALID) { return false; } } uint8_t Bytes = Info->MoreBytes; if ((Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_DISPLACE_SIZE_MUL_2) && HasWideningDisplacement) { Bytes <<= 1; } if ((Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_DISPLACE_SIZE_DIV_2) && HasNarrowingDisplacement) { Bytes >>= 1; } if ((Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_MEM_OFFSET) && (DecodeInst->Flags & DecodeFlags::FLAG_ADDRESS_SIZE)) { // If we have a memory offset and have the address size override then divide it just like narrowing displacement Bytes >>= 1; } auto ModRMOperand = [&](FEXCore::X86Tables::DecodedOperand& GPR, FEXCore::X86Tables::DecodedOperand& NonGPR, bool HasXMMGPR, bool HasXMMNonGPR, bool HasMMGPR, bool HasMMNonGPR, bool GPR8Bit, bool NonGPR8Bit) { FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; if (ModRM.reg != 0b000 && (Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SF_MOD_ZERO_REG)) { return false; } if (ModRM.mod == 0b11 && (Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SF_MOD_MEM_ONLY)) { return false; } if (ModRM.mod != 0b11 && (Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SF_MOD_REG_ONLY)) { return false; } // Decode the GPR source first GPR.Type = DecodedOperand::OpType::GPR; GPR.Data.GPR.HighBits = (GPR8Bit && ModRM.reg >= 0b100 && !HasREX); GPR.Data.GPR.GPR = MapModRMToReg(DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_R ? 1 : 0, ModRM.reg, GPR8Bit, HasREX, HasXMMGPR, HasMMGPR); if (GPR.Data.GPR.GPR == FEXCore::X86State::REG_INVALID) { return false; } // ModRM.mod == 0b11 == Register // ModRM.Mod != 0b11 == Register-direct addressing if (ModRM.mod == 0b11) { NonGPR.Type = DecodedOperand::OpType::GPR; NonGPR.Data.GPR.HighBits = (NonGPR8Bit && ModRM.rm >= 0b100 && !HasREX); NonGPR.Data.GPR.GPR = MapModRMToReg(DecodeInst->Flags & DecodeFlags::FLAG_REX_XGPR_B ? 
1 : 0, ModRM.rm, NonGPR8Bit, HasREX, HasXMMNonGPR, HasMMNonGPR); if (NonGPR.Data.GPR.GPR == FEXCore::X86State::REG_INVALID) { return false; } } else { // Only decode if we haven't pre-decoded if (NonGPR.IsNone()) { auto Disp = DecodeModRMs_Disp[Has16BitAddressing]; (this->*Disp)(&NonGPR, ModRM); } } return true; }; size_t CurrentSrc = 0; const auto VEXOperand = Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_VEX_SRC_MASK; if (VEXOperand == FEXCore::X86Tables::InstFlags::FLAGS_VEX_NO_OPERAND && Options.vvvv) { return false; } if (VEXOperand == FEXCore::X86Tables::InstFlags::FLAGS_VEX_1ST_SRC) { DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::GPR; DecodeInst->Src[CurrentSrc].Data.GPR.HighBits = false; // If we have XMM flags at all, then SRC 1 cannot be a GPR. The only case where // this is possible is with BMI1 and BMI2 instructions (which are all GPR-based // and don't use XMM flags) DecodeInst->Src[CurrentSrc].Data.GPR.GPR = MapVEXToReg(Options.vvvv, HasXMMFlags); ++CurrentSrc; } if (Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_MODRM) { if (Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SF_MOD_DST) { if (!ModRMOperand(DecodeInst->Src[CurrentSrc], DecodeInst->Dest, HasXMMSrc, HasXMMDst, HasMMSrc, HasMMDst, Is8BitSrc, Is8BitDest)) { return false; } } else { if (!ModRMOperand(DecodeInst->Dest, DecodeInst->Src[CurrentSrc], HasXMMDst, HasXMMSrc, HasMMDst, HasMMSrc, Is8BitDest, Is8BitSrc)) { return false; } } ++CurrentSrc; } if (VEXOperand == FEXCore::X86Tables::InstFlags::FLAGS_VEX_2ND_SRC) { DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::GPR; DecodeInst->Src[CurrentSrc].Data.GPR.HighBits = false; DecodeInst->Src[CurrentSrc].Data.GPR.GPR = MapVEXToReg(Options.vvvv, HasXMMSrc); ++CurrentSrc; } if (HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_SRC_RAX)) { DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::GPR; DecodeInst->Src[CurrentSrc].Data.GPR.HighBits = false; 
DecodeInst->Src[CurrentSrc].Data.GPR.GPR = FEXCore::X86State::REG_RAX; ++CurrentSrc; } else if (HAS_NON_XMM_SUBFLAG(Info->Flags, FEXCore::X86Tables::InstFlags::FLAGS_SF_SRC_RCX)) { DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::GPR; DecodeInst->Src[CurrentSrc].Data.GPR.HighBits = false; DecodeInst->Src[CurrentSrc].Data.GPR.GPR = FEXCore::X86State::REG_RCX; ++CurrentSrc; } if (VEXOperand == FEXCore::X86Tables::InstFlags::FLAGS_VEX_DST) { CurrentDest->Type = DecodedOperand::OpType::GPR; CurrentDest->Data.GPR.HighBits = false; CurrentDest->Data.GPR.GPR = MapVEXToReg(Options.vvvv, HasXMMDst); } if (Bytes != 0) { LOGMAN_THROW_A_FMT(Bytes <= 8, "Number of bytes should be <= 8 for literal src"); auto [Literal, IsRelocation] = ReadData(Bytes); if (IsRelocation) { DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::LiteralRelocation; DecodeInst->Src[CurrentSrc].Data.LiteralRelocation.EntrypointOffset = Literal; } else { DecodeInst->Src[CurrentSrc].Data.Literal.Size = Bytes; if ((Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SRC_SEXT) || (DecodeFlags::GetSizeDstFlags(DecodeInst->Flags) == DecodeFlags::SIZE_64BIT && Info->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SRC_SEXT64BIT)) { if (Bytes == 1) { Literal = static_cast(Literal); } else if (Bytes == 2) { Literal = static_cast(Literal); } else { Literal = static_cast(Literal); } DecodeInst->Src[CurrentSrc].Data.Literal.Size = DestSize; } DecodeInst->Src[CurrentSrc].Type = DecodedOperand::OpType::Literal; DecodeInst->Src[CurrentSrc].Data.Literal.Value = Literal; } Bytes = 0; } LOGMAN_THROW_A_FMT(Bytes == 0, "Inst at 0x{:x}: 0x{:04x} '{}' Had an instruction of size {} with {} remaining", DecodeInst->PC, DecodeInst->OP, DecodeInst->TableInfo->Name ?: "UND", InstructionSize, Bytes); DecodeInst->InstSize = InstructionSize; return true; } bool Decoder::NormalOpHeader(const FEXCore::X86Tables::X86InstInfo* Info, uint16_t Op) { DecodeInst->OPRaw = DecodeInst->OP = Op; DecodeInst->TableInfo = Info; if 
(Info->Type == FEXCore::X86Tables::TYPE_UNKNOWN) { return false; } if (Info->Type == FEXCore::X86Tables::TYPE_INVALID) { return false; } LOGMAN_THROW_A_FMT(Info->Type != FEXCore::X86Tables::TYPE_REX_PREFIX, "REX PREFIX should have been decoded before this!"); // A normal instruction is the most likely. if (Info->Type == FEXCore::X86Tables::TYPE_INST) [[likely]] { return NormalOp(Info, Op); } else if (Info->Type == FEXCore::X86Tables::TYPE_ARCH_DISPATCHER) [[unlikely]] { // Dispatcher Op. return NormalOp(&Info->OpcodeDispatcher.Indirect[BlockInfo.Is64BitMode ? 1 : 0], Op); } else if (Info->Type >= FEXCore::X86Tables::TYPE_GROUP_1 && Info->Type <= FEXCore::X86Tables::TYPE_GROUP_11) { uint8_t ModRMByte = ReadByte(); DecodeInst->ModRM = ModRMByte; DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_1) << 6) | (prefix) << 3 | (Reg)) Op = OPD(Info->Type, Info->MoreBytes, ModRM.reg); return NormalOp(&PrimaryInstGroupOps[Op], Op); #undef OPD } else if (Info->Type >= FEXCore::X86Tables::TYPE_GROUP_6 && Info->Type <= FEXCore::X86Tables::TYPE_GROUP_P) { #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_6) << 5) | (prefix) << 3 | (Reg)) constexpr uint16_t PF_NONE = 0; constexpr uint16_t PF_F3 = 1; constexpr uint16_t PF_66 = 2; constexpr uint16_t PF_F2 = 3; uint16_t PrefixType = PF_NONE; if (LastEscapePrefix == 0xF3) { PrefixType = PF_F3; } else if (LastEscapePrefix == 0xF2) { PrefixType = PF_F2; } else if (LastEscapePrefix == 0x66) { PrefixType = PF_66; } // We have ModRM uint8_t ModRMByte = ReadByte(); DecodeInst->ModRM = ModRMByte; DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; uint16_t LocalOp = OPD(Info->Type, PrefixType, ModRM.reg); const FEXCore::X86Tables::X86InstInfo* LocalInfo = &SecondInstGroupOps[LocalOp]; #undef OPD if 
(LocalInfo->Type == FEXCore::X86Tables::TYPE_SECOND_GROUP_MODRM && ModRM.mod == 0b11) { // Everything in this group is privileged instructions aside from XGETBV constexpr std::array RegToField = { 255, 0, 1, 2, 255, 255, 255, 3, }; uint8_t Field = RegToField[ModRM.reg]; if (Field == 255) { return false; } LocalOp = (Field << 3) | ModRM.rm; return NormalOp(&SecondModRMTableOps[LocalOp], LocalOp); } else { return NormalOp(&SecondInstGroupOps[LocalOp], LocalOp); } } else if (Info->Type == FEXCore::X86Tables::TYPE_X87_TABLE_PREFIX) { // We have ModRM uint8_t ModRMByte = ReadByte(); DecodeInst->ModRM = ModRMByte; DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; uint16_t X87Op = ((Op - 0xD8) << 8) | ModRMByte; return NormalOp(&(*X87Table)[X87Op], X87Op); } else if (Info->Type == FEXCore::X86Tables::TYPE_VEX_TABLE_PREFIX) { if (!VEXTable) { // AVX not enabled. return false; } uint16_t map_select = 1; uint16_t pp = 0; const uint8_t Byte1 = ReadByte(); DecodedHeader options {}; if ((Byte1 & 0b10000000) == 0) { if (!BlockInfo.Is64BitMode) { return false; } DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_R; } if (Op == 0xC5) { // Two byte VEX pp = Byte1 & 0b11; const uint8_t vvvv = ((Byte1 & 0b01111000) >> 3); if (!BlockInfo.Is64BitMode && vvvv <= 0b0111) { // Invalid on 32-bit, can't use the high registers. return false; } options.vvvv = 15 - vvvv; options.L = (Byte1 & 0b100) != 0; } else { // 0xC4 = Three byte VEX const uint8_t Byte2 = ReadByte(); pp = Byte2 & 0b11; map_select = Byte1 & 0b11111; const uint8_t vvvv = ((Byte2 & 0b01111000) >> 3); if (!BlockInfo.Is64BitMode && vvvv <= 0b0111) { // Invalid on 32-bit, can't use the high registers. 
return false; } options.vvvv = 15 - vvvv; options.w = (Byte2 & 0b10000000) != 0; options.L = (Byte2 & 0b100) != 0; if ((Byte1 & 0b01000000) == 0) { if (!BlockInfo.Is64BitMode) { return false; } DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_X; } if (BlockInfo.Is64BitMode && (Byte1 & 0b00100000) == 0) { DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_B; } if (options.w) { DecodeInst->Flags |= DecodeFlags::FLAG_OPTION_AVX_W; } if (!(map_select >= 1 && map_select <= 3)) { return false; } } uint16_t VEXOp = ReadByte(); #define OPD(map_select, pp, opcode) (((map_select - 1) << 10) | (pp << 8) | (opcode)) Op = OPD(map_select, pp, VEXOp); #undef OPD const FEXCore::X86Tables::X86InstInfo* LocalInfo = &(*VEXTable)[Op]; if (LocalInfo->Type >= FEXCore::X86Tables::TYPE_VEX_GROUP_12 && LocalInfo->Type <= FEXCore::X86Tables::TYPE_VEX_GROUP_17) { // We have ModRM uint8_t ModRMByte = ReadByte(); DecodeInst->ModRM = ModRMByte; DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; #define OPD(group, pp, opcode) (((group - TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode)) Op = OPD(LocalInfo->Type, pp, ModRM.reg); #undef OPD return NormalOp(&(*VEXTableGroup)[Op], Op, options); } else { return NormalOp(LocalInfo, Op, options); } } else if (Info->Type == FEXCore::X86Tables::TYPE_GROUP_EVEX) { FEXCORE_TELEMETRY_SET(TYPE_USES_EVEX_OPS, 1); // EVEX unsupported return false; } LOGMAN_MSG_A_FMT("Invalid instruction decoding type"); FEX_UNREACHABLE; } bool Decoder::DecodeInstructionImpl(uint64_t PC) { InstructionSize = 0; LastEscapePrefix = 0; Instruction.fill(0); DecodeInst = &DecodedBuffer[DecodedSize]; memset(DecodeInst, 0, sizeof(DecodedInst)); DecodeInst->PC = PC; for (;;) { if (InstructionSize >= MAX_INST_SIZE) { return false; } uint8_t Op = ReadByte(); switch (Op) { case 0x0F: { // Escape Op uint8_t EscapeOp = ReadByte(); switch (EscapeOp) { case 0x0F: [[unlikely]] { // 3DNow! DecodeREXIfValid(-2); // 3DNow! 
Instruction Encoding: 0F 0F [ModRM] [SIB] [Displacement] [Opcode] // Decode ModRM uint8_t ModRMByte = ReadByte(); DecodeInst->ModRM = ModRMByte; DecodeInst->Flags |= DecodeFlags::FLAG_DECODED_MODRM; FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; const bool Has16BitAddressing = !BlockInfo.Is64BitMode && DecodeInst->Flags & DecodeFlags::FLAG_ADDRESS_SIZE; // All 3DNow! instructions have the second argument as the rm handler // We need to decode it upfront to get the displacement out of the way if (ModRM.mod != 0b11) { auto Disp = DecodeModRMs_Disp[Has16BitAddressing]; (this->*Disp)(&DecodeInst->Src[0], ModRM); } // Take a peek at the op just past the displacement uint8_t LocalOp = ReadByte(); return NormalOpHeader(&FEXCore::X86Tables::DDDNowOps[LocalOp], LocalOp); break; } case 0x38: { // F38 Table! DecodeREXIfValid(-2); constexpr uint16_t PF_38_NONE = 0; constexpr uint16_t PF_38_66 = (1U << 0); constexpr uint16_t PF_38_F2 = (1U << 1); constexpr uint16_t PF_38_F3 = (1U << 2); uint16_t Prefix = PF_38_NONE; if (DecodeInst->Flags & DecodeFlags::FLAG_OPERAND_SIZE) { Prefix |= PF_38_66; } if (DecodeInst->Flags & DecodeFlags::FLAG_REPNE_PREFIX) { Prefix |= PF_38_F2; } if (DecodeInst->Flags & DecodeFlags::FLAG_REP_PREFIX) { Prefix |= PF_38_F3; } uint16_t LocalOp = (Prefix << 8) | ReadByte(); bool NoOverlay66 = (FEXCore::X86Tables::H0F38TableOps[LocalOp].Flags & InstFlags::FLAGS_NO_OVERLAY66) != 0; if (LastEscapePrefix == 0x66 && NoOverlay66) { // Operand Size // Remove prefix so it doesn't effect calculations. // This is only an escape prefix rather than modifier now DecodeInst->Flags &= ~DecodeFlags::FLAG_OPERAND_SIZE; DecodeFlags::PopOpAddrIf(&DecodeInst->Flags, DecodeFlags::FLAG_OPERAND_SIZE_LAST); } return NormalOpHeader(&FEXCore::X86Tables::H0F38TableOps[LocalOp], LocalOp); break; } case 0x3A: { // F3A Table! 
DecodeREXIfValid(-2); constexpr uint16_t PF_3A_NONE = 0; constexpr uint16_t PF_3A_66 = (1 << 0); constexpr uint16_t PF_3A_REX = (1 << 1); uint16_t Prefix = PF_3A_NONE; if (LastEscapePrefix == 0x66) { // Operand Size Prefix = PF_3A_66; } if (DecodeInst->Flags & DecodeFlags::FLAG_REX_WIDENING) { Prefix |= PF_3A_REX; } uint16_t LocalOp = (Prefix << 8) | ReadByte(); return NormalOpHeader(&FEXCore::X86Tables::H0F3ATableOps[LocalOp], LocalOp); break; } default: [[likely]] { // Two byte table! // x86-64 abuses three legacy prefixes to extend the table encodings // 0x66 - Operand Size prefix // 0xF2 - REPNE prefix // 0xF3 - REP prefix // If any of these three prefixes are used then it falls down the subtable // Additionally: If you hit repeat of differnt prefixes then only the LAST one before this one works for subtable selection bool NoOverlay = (FEXCore::X86Tables::SecondBaseOps[EscapeOp].Flags & InstFlags::FLAGS_NO_OVERLAY) != 0; bool NoOverlay66 = (FEXCore::X86Tables::SecondBaseOps[EscapeOp].Flags & InstFlags::FLAGS_NO_OVERLAY66) != 0; DecodeREXIfValid(-2); if (NoOverlay) { // This section of the table ignores prefix extention return NormalOpHeader(&FEXCore::X86Tables::SecondBaseOps[EscapeOp], EscapeOp); } else if (LastEscapePrefix == 0xF3) { // REP // Remove prefix so it doesn't effect calculations. // This is only an escape prefix rather tan modifier now DecodeInst->Flags &= ~DecodeFlags::FLAG_REP_PREFIX; return NormalOpHeader(&FEXCore::X86Tables::RepModOps[EscapeOp], EscapeOp); } else if (LastEscapePrefix == 0xF2) { // REPNE // Remove prefix so it doesn't effect calculations. // This is only an escape prefix rather tan modifier now DecodeInst->Flags &= ~DecodeFlags::FLAG_REPNE_PREFIX; return NormalOpHeader(&FEXCore::X86Tables::RepNEModOps[EscapeOp], EscapeOp); } else if (LastEscapePrefix == 0x66 && !NoOverlay66) { // Operand Size // Remove prefix so it doesn't effect calculations. 
// This is only an escape prefix rather tan modifier now DecodeInst->Flags &= ~DecodeFlags::FLAG_OPERAND_SIZE; DecodeFlags::PopOpAddrIf(&DecodeInst->Flags, DecodeFlags::FLAG_OPERAND_SIZE_LAST); return NormalOpHeader(&FEXCore::X86Tables::OpSizeModOps[EscapeOp], EscapeOp); } else { return NormalOpHeader(&FEXCore::X86Tables::SecondBaseOps[EscapeOp], EscapeOp); } break; } } break; } case 0x66: // Operand Size prefix DecodeInst->Flags |= DecodeFlags::FLAG_OPERAND_SIZE; LastEscapePrefix = Op; DecodeFlags::PushOpAddr(&DecodeInst->Flags, DecodeFlags::FLAG_OPERAND_SIZE_LAST); break; case 0x67: // Address Size override prefix DecodeInst->Flags |= DecodeFlags::FLAG_ADDRESS_SIZE; break; case 0x26: // ES legacy prefix if (!BlockInfo.Is64BitMode) { DecodeInst->Flags = (DecodeInst->Flags & ~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_ES_PREFIX; } break; case 0x2E: // CS legacy prefix if (!BlockInfo.Is64BitMode) { DecodeInst->Flags = (DecodeInst->Flags & ~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_CS_PREFIX; } break; case 0x36: // SS legacy prefix if (!BlockInfo.Is64BitMode) { DecodeInst->Flags = (DecodeInst->Flags & ~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_SS_PREFIX; } break; case 0x3E: // DS legacy prefix if (!BlockInfo.Is64BitMode) { DecodeInst->Flags = (DecodeInst->Flags & ~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_DS_PREFIX; } break; case 0xF0: // LOCK prefix DecodeInst->Flags |= DecodeFlags::FLAG_LOCK; break; case 0xF2: // REPNE prefix DecodeInst->Flags |= DecodeFlags::FLAG_REPNE_PREFIX; LastEscapePrefix = Op; break; case 0xF3: // REP prefix DecodeInst->Flags |= DecodeFlags::FLAG_REP_PREFIX; LastEscapePrefix = Op; break; case 0x64: // FS prefix DecodeInst->Flags = (DecodeInst->Flags & ~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_FS_PREFIX; break; case 0x65: // GS prefix DecodeInst->Flags = (DecodeInst->Flags & 
~FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS) | DecodeFlags::FLAG_GS_PREFIX; break; default: [[likely]] { // Default base table const X86InstInfo* Info = &FEXCore::X86Tables::BaseOps[Op]; if (Info->Type == FEXCore::X86Tables::TYPE_ARCH_DISPATCHER) { Info = &Info->OpcodeDispatcher.Indirect[BlockInfo.Is64BitMode ? 1 : 0]; } if (Info->Type == FEXCore::X86Tables::TYPE_REX_PREFIX) { DecodeInst->REXIndex = InstructionSize; } else { DecodeREXIfValid(); return NormalOpHeader(Info, Op); } break; } } } if (DecodeInst->Dest.IsGPR()) { return false; } return true; } void Decoder::DecodeREXIfValid(int8_t ExpectedOffset) { LOGMAN_THROW_A_FMT(ExpectedOffset < 0, "Expecting an negative offset for the REX offset!"); const int8_t REXIndex = InstructionSize + ExpectedOffset; if (DecodeInst->REXIndex != 0 && DecodeInst->REXIndex == REXIndex) { const uint8_t Op = Instruction[REXIndex - 1]; DecodeInst->Flags |= DecodeFlags::FLAG_REX_PREFIX; // Widening displacement if (Op & 0b1000) { DecodeInst->Flags |= DecodeFlags::FLAG_REX_WIDENING; DecodeFlags::PushOpAddr(&DecodeInst->Flags, DecodeFlags::FLAG_WIDENING_SIZE_LAST); } // XGPR_B bit set if (Op & 0b0001) { DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_B; } // XGPR_X bit set if (Op & 0b0010) { DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_X; } // XGPR_R bit set if (Op & 0b0100) { DecodeInst->Flags |= DecodeFlags::FLAG_REX_XGPR_R; } } } Decoder::DecodedBlockStatus Decoder::DecodeInstruction(uint64_t PC) { // Will be set if DecodeInstructionImpl tries to read non-executable memory HitNonExecutableRange = false; HitBadRelocation = false; bool ErrorDuringDecoding = !DecodeInstructionImpl(PC); if (ErrorDuringDecoding || HitNonExecutableRange || HitBadRelocation) [[unlikely]] { // Put an invalid instruction in the stream so the core can raise SIGILL if hit // Error while decoding instruction. We don't know the table or instruction size DecodeInst->TableInfo = nullptr; auto Result = ErrorDuringDecoding ? 
DecodedBlockStatus::INVALID_INST : DecodeInst->InstSize ? DecodedBlockStatus::PARTIAL_DECODE_INST : HitNonExecutableRange ? DecodedBlockStatus::NOEXEC_INST : DecodedBlockStatus::BAD_RELOCATION; DecodeInst->InstSize = 0; return Result; } else if (!DecodeInst->TableInfo || (DecodeInst->TableInfo->Type == TYPE_INST && !DecodeInst->TableInfo->OpcodeDispatcher.OpDispatch)) { // If there wasn't an error during decoding but we have no dispatcher for the instruction then claim invalid instruction. return DecodedBlockStatus::INVALID_INST; } if (CTX->AreMonoHacksActive()) { // Unity uses a standard SPSC ringbuffer with cached read/write pointers and thread waiting flags at the following // offsets, which are consistent between 32-bit and 64-bit Unity versions from 2015 onwards. auto IsKnownAtomicDisplacement = [](uint64_t Displacement) { return Displacement == 0x80 || Displacement == 0x84 || Displacement == 0xC0 || Displacement == 0xC4; }; if (DecodeInst->OP == 0x8b && DecodeInst->Src[0].IsGPRIndirect() && IsKnownAtomicDisplacement(DecodeInst->Src[0].Data.GPRIndirect.Displacement)) { DecodeInst->Flags |= X86Tables::DecodeFlags::FLAG_FORCE_TSO; } if (DecodeInst->OP == 0x89 && DecodeInst->Dest.IsGPRIndirect() && IsKnownAtomicDisplacement(DecodeInst->Dest.Data.GPRIndirect.Displacement)) { DecodeInst->Flags |= X86Tables::DecodeFlags::FLAG_FORCE_TSO; } } return DecodedBlockStatus::SUCCESS; } void Decoder::BranchTargetInMultiblockRange() { if (!CTX->Config.Multiblock) { return; } // If the RIP setting is conditional AND within our symbol range then it can be considered for multiblock uint64_t TargetRIP = 0; const auto GPRSize = GetGPROpSize(); bool Conditional = true; const auto InstEnd = DecodeInst->PC + DecodeInst->InstSize; if (DecodeInst->TableInfo->Flags & FEXCore::X86Tables::InstFlags::FLAGS_CALL) { if (ExecutableRangeWritable && CTX->AreMonoHacksActive()) { // Mono generated code often contains noreturn calls with garbage following them, and calls are always backpatched // 
after CIL compilation leading to n recompiles for a multiblock with n calls. Choose to minimize stutters over // raw performance and disable tracking past calls for mono generated code. return; } AddBranchTarget(InstEnd); BlockInfo.EntryPoints.emplace(InstEnd); return; } // Calls are handled above switch (DecodeInst->OP) { case 0x70 ... 0x7F: // Conditional JUMP case 0x80 ... 0x8F: { // More conditional // Source is a literal // auto RIPOffset = LoadSource(Op, Op->Src[0], Op->Flags); // auto RIPTargetConst = Constant(Op->PC + Op->InstSize); // Target offset is PC + InstSize + Literal TargetRIP = InstEnd + DecodeInst->Src[0].Literal(); break; } case 0xE9: case 0xEB: // Both are unconditional JMP instructions TargetRIP = InstEnd + DecodeInst->Src[0].Literal(); Conditional = false; break; case 0xC2: // RET imm case 0xC3: // RET default: return; break; } if (GPRSize == IR::OpSize::i32Bit) { // If we are running a 32bit guest then wrap around addresses that go above 32bit TargetRIP &= 0xFFFFFFFFU; } if (Conditional) { // If we are conditional then a target can be the instruction past the conditional instruction AddBranchTarget(InstEnd); } // If the target RIP is x86 code within the symbol ranges then we are golden // Forbid distant branches to have the cost code better match the guest code layout, avoiding massive (range-wise) code // blocks in highly fragmented guest code. Such branches are often not-taken branches to garbage in obfuscated code. 
constexpr uint64_t MAX_FORWARD_BRANCH_DIST = FEXCore::Utils::FEX_PAGE_SIZE * 4; bool ValidMultiblockMember = TargetRIP >= EntryPoint && TargetRIP < std::min(InstEnd + MAX_FORWARD_BRANCH_DIST, SectionMaxAddress); #ifdef ARCHITECTURE_arm64ec ValidMultiblockMember = ValidMultiblockMember && !RtlIsEcCode(TargetRIP); #endif if (ValidMultiblockMember) { // Update our conditional branch ranges before we return if (Conditional) { MaxCondBranchForward = std::max(MaxCondBranchForward, TargetRIP); MaxCondBranchBackwards = std::min(MaxCondBranchBackwards, TargetRIP); } AddBranchTarget(TargetRIP); } else { if (ExternalBranches) { ExternalBranches->insert(TargetRIP); } } } bool Decoder::IsBranchMonoTailcall(uint64_t NumInstructions) const { // While the mono call backpatching block can easily be detected due it being the only one to contain SMC-faulting // atomics, that can't be said for the tailcall jump backpatcher which has changed several times across versions and // can be partially inlined. To work around this, instead detect the tailcall site itself and force full non-signal-based // SMC detection for that single block. if (!ExecutableRangeWritable) { // We only care about jitted code return false; } // See mini-{amd64,x86}.c in the mono codebase, specifically where METHOD_JUMP patches are emitted. 
if (GetGPROpSize() == IR::OpSize::i32Bit) { // Matches: // LEAVE // / NOP / MOV EAX, EAX / LEA EBP, [EBP+0] // JMP imm32 if (DecodeInst->OP != 0xE9 || NumInstructions < 2) { return false; } auto PrevInst = std::prev(DecodeInst); if (PrevInst->OP == 0xC9) { return true; } if (NumInstructions < 3 || std::prev(PrevInst)->OP != 0xC9) { return false; } return PrevInst->OP == 0x90 || (PrevInst->OP == 0x8B && PrevInst->ModRM == 0xC0) || (PrevInst->OP == 0x8D && PrevInst->ModRM == 0x6D && PrevInst->Src[1].IsLiteral() && PrevInst->Src[1].Literal() == 0); } else { FEXCore::X86Tables::ModRMDecoded ModRM; ModRM.Hex = DecodeInst->ModRM; if (DecodeInst->OPRaw == 0xFF && ModRM.reg == 4 && DecodeInst->Src[0].IsGPR()) { if (DecodeInst->Src[0].Data.GPR.GPR == FEXCore::X86State::REG_RAX) { // Found in versions of mono from 2024 onwards - matches: // REX.W JMP rax return (DecodeInst->Flags & (DecodeFlags::FLAG_REX_PREFIX | DecodeFlags::FLAG_REX_WIDENING | DecodeFlags::FLAG_REX_XGPR_B | DecodeFlags::FLAG_REX_XGPR_X | DecodeFlags::FLAG_REX_XGPR_R)) == (DecodeFlags::FLAG_REX_PREFIX | DecodeFlags::FLAG_REX_WIDENING); } else if (NumInstructions > 1 && DecodeInst->Src[0].Data.GPR.GPR == FEXCore::X86State::REG_R11) { // Found in older versions of mono - match: // MOV r11, imm64 // JMP r11 auto PrevInst = std::prev(DecodeInst); return PrevInst->OP == 0xBB && PrevInst->Dest.IsGPR() && PrevInst->Dest.Data.GPR.GPR == FEXCore::X86State::REG_R11; } } } return false; } bool Decoder::InstCanContinue() const { if (DecodeInst->PC + DecodeInst->InstSize == NextBlockStartAddress) { return false; } if (!(DecodeInst->TableInfo->Flags & (FEXCore::X86Tables::InstFlags::FLAGS_BLOCK_END | FEXCore::X86Tables::InstFlags::FLAGS_SETS_RIP))) { return true; } uint64_t TargetRIP = 0; const auto GPRSize = GetGPROpSize(); if (DecodeInst->OP == 0xE8) { // Call - immediate target const uint64_t NextRIP = DecodeInst->PC + DecodeInst->InstSize; TargetRIP = DecodeInst->PC + DecodeInst->InstSize + 
DecodeInst->Src[0].Literal(); if (GPRSize == IR::OpSize::i32Bit) { // If we are running a 32bit guest then wrap around addresses that go above 32bit TargetRIP &= 0xFFFFFFFFU; } if (TargetRIP == NextRIP) { // Optimize the case that the instruction is jumping just after itself. // This is a GOT calculation which we can optimize out. // Optimization occurs inside of the OpDispatcher implementation return true; } } return false; } void Decoder::AddBranchTarget(uint64_t Target) { if (VisitedBlocks.contains(Target)) { return; } auto BlockSuccIt = std::lower_bound(BlockInfo.Blocks.begin(), BlockInfo.Blocks.end(), Target, [](const auto& a, uint64_t Address) { return a.Entry < Address; }); LOGMAN_THROW_A_FMT(BlockSuccIt == BlockInfo.Blocks.end() || BlockSuccIt->Entry != Target, "unexpected"); if (BlockSuccIt != BlockInfo.Blocks.begin()) { auto BlockIt = std::prev(BlockSuccIt); if (BlockIt->Entry + BlockIt->Size > Target) { uint64_t SplitIdx = 0; uint64_t SplitAddr = BlockIt->Entry; // Find the instruction boundary of the split for (; SplitIdx < BlockIt->NumInstructions && SplitAddr < Target; SplitIdx++) { SplitAddr += BlockIt->DecodedInstructions[SplitIdx].InstSize; } uint64_t SplitOffset = SplitAddr - BlockIt->Entry; LOGMAN_THROW_A_FMT(SplitIdx != 0, "unexpected"); if (SplitAddr == Target) { // Split at the boundary DecodedBlocks SplitBlock { .Entry = SplitAddr, .Size = BlockIt->Size - SplitOffset, .NumInstructions = BlockIt->NumInstructions - SplitIdx, .DecodedInstructions = BlockIt->DecodedInstructions + SplitIdx, .BlockStatus = BlockIt->BlockStatus, }; BlockIt->Size = SplitOffset; BlockIt->NumInstructions = SplitIdx; BlockInfo.Blocks.insert(BlockSuccIt, SplitBlock); } // else misaligned, leave as a branch out of the block // If we split a block then the target has already been visited as part of that, if it was // misaligned the jump will just leave the multiblock, mark it as visited to avoid running // this code path again and just bail out early. 
// Decodes the guest code reachable from PC and stores the result in BlockInfo.
//
// Decoding is driven by a worklist (BlocksToDecode) that initially holds only PC. Each iteration takes the
// first address in the worklist, decodes instructions linearly until the block has to end, and then merges the
// branch targets gathered in CurrentBlockTargets back into the worklist.
//
// Parameters:
//   Thread      - thread whose CS segment selects the operating mode; also passed to the section lookup.
//   _InstStream - host pointer to the code bytes at PC; other addresses are derived from it relative to PC.
//   PC          - guest address of the entry point.
//   MaxInst     - instruction budget; 0 selects CTX->Config.MaxInstPerBlock.
//
// If HitBadRelocation is set after decoding an instruction, BlockInfo is reduced to the block currently being
// decoded, with a zero instruction count and empty entry-point/code-page sets, and the function returns early.
void Decoder::DecodeInstructionsAtEntry(FEXCore::Core::InternalThreadState* Thread, const uint8_t* _InstStream, uint64_t PC, uint64_t MaxInst) {
  FEXCORE_PROFILE_SCOPED("DecodeInstructions");
  BlockInfo.TotalInstructionCount = 0;
  BlockInfo.Blocks.clear();
  VisitedBlocks.clear();
  // Reset internal state management
  DecodedSize = 0;
  MaxCondBranchForward = 0;
  MaxCondBranchBackwards = ~0ULL;
  DecodedBuffer = PoolObject.ReownOrClaimBuffer();

  // Decode operating mode from thread's CS segment.
  const auto CSSegment = Core::CPUState::GetSegmentFromIndex(Thread->CurrentFrame->State, Thread->CurrentFrame->State.cs_idx);
  BlockInfo.Is64BitMode = CSSegment->L == 1;
  LOGMAN_THROW_A_FMT(BlockInfo.Is64BitMode == CTX->Config.Is64BitMode, "Expected operating mode to not change at runtime!");

  EntryPoint = PC;
  BlockInfo.EntryPoints = {PC};
  InstStream = _InstStream;

  // Running count of instructions kept across all blocks; published to BlockInfo at the end.
  uint64_t TotalInstructions {};

  // Default to an unbounded section with no relocation table.
  SectionMinAddress = 0;
  SectionMaxAddress = ~0ULL;
  Relocations = nullptr;
  if (CTX->GetCodeCache().IsGeneratingCache || EnableCodeCacheValidation) {
    // If generating cache, attempt to load section bounds and relocations
    if (auto SectionInfo = CTX->SyscallHandler->LookupExecutableFileSection(Thread, EntryPoint)) {
      SectionMinAddress = SectionInfo->FileStartVA;
      SectionMaxAddress = SectionInfo->EndVA;
      Relocations = &SectionInfo->FileInfo.Relocations;
    }
  }

  DecodedMinAddress = EntryPoint;
  DecodedMaxAddress = EntryPoint;

  // Entry is a jump target
  BlocksToDecode = {PC};

  uint64_t CurrentCodePage = PC & FEXCore::Utils::FEX_PAGE_MASK;
  BlockInfo.CodePages = {CurrentCodePage};

  if (MaxInst == 0) {
    MaxInst = CTX->Config.MaxInstPerBlock;
  }

  bool EntryBlock {true};
  bool FinalInstruction {false};

  while (!FinalInstruction && !BlocksToDecode.empty()) {
    // Pop the next pending block address and remember that it has been handled.
    auto BlockDecodeIt = BlocksToDecode.begin();
    uint64_t RIPToDecode = *BlockDecodeIt;
    BlocksToDecode.erase(BlockDecodeIt);
    VisitedBlocks.emplace(RIPToDecode);

    // First already-decoded block whose entry is not below RIPToDecode; the new block is inserted in front of it.
    auto BlockSuccIt = std::lower_bound(BlockInfo.Blocks.begin(), BlockInfo.Blocks.end(), RIPToDecode,
                                        [](const auto& a, uint64_t Address) { return a.Entry < Address; });
    LOGMAN_THROW_A_FMT(BlockSuccIt == BlockInfo.Blocks.end() || BlockSuccIt->Entry != RIPToDecode, "unexpected");

    // The block being decoded must stop before the nearest following block start, whether that start is still
    // pending in the worklist or already decoded.
    NextBlockStartAddress = ~0ULL;
    if (!BlocksToDecode.empty()) {
      // We just erased the lowest, the front is then the second lowest
      NextBlockStartAddress = *BlocksToDecode.begin();
    }
    if (BlockSuccIt != BlockInfo.Blocks.end() && BlockSuccIt->Entry < NextBlockStartAddress) {
      NextBlockStartAddress = BlockSuccIt->Entry;
    }
    LOGMAN_THROW_A_FMT(NextBlockStartAddress > RIPToDecode, "unexpected");

    // Insert the block now so it can be looked up and split if necessary on a backward edge
    auto BlockIt = BlockInfo.Blocks.emplace(BlockSuccIt);
    BlockIt->Entry = RIPToDecode;
    BlockIt->Size = 0;
    BlockIt->IsEntryPoint = EntryBlock;

    uint64_t PCOffset = 0;
    // Remember where this block starts in DecodedBuffer so its instructions can be discarded again.
    uint64_t BlockStartOffset = DecodedSize;
    bool EraseBlock = true; // Unset once the block contains an instruction

    BlockIt->DecodedInstructions = &DecodedBuffer[BlockStartOffset];
    BlockIt->NumInstructions = 0;

    // Do a bit of pointer math to figure out where we are in code
    InstStream = AdjustAddrForSpecialRegion(_InstStream, EntryPoint, RIPToDecode);

    while (1) {
      InstructionSize = 0;

      // MAX_INST_SIZE assumes worst case
      auto OpAddress = RIPToDecode + PCOffset;
      auto OpMaxAddress = OpAddress + MAX_INST_SIZE;

      auto OpMinPage = OpAddress & FEXCore::Utils::FEX_PAGE_MASK;
      auto OpMaxPage = OpMaxAddress & FEXCore::Utils::FEX_PAGE_MASK;

      if (!EntryBlock && OpMinPage == OpMaxPage && PeekByte(0).value_or(0) == 0 && PeekByte(1).value_or(0) == 0) [[unlikely]] {
        // End the multiblock early if we hit 2 consecutive null bytes (add [rax], al) in the same page with the
        // assumption we are most likely trying to explore garbage code.
        break;
      }

      // Record every page the worst-case instruction span may touch.
      if (OpMinPage != CurrentCodePage) {
        CurrentCodePage = OpMinPage;
        BlockInfo.CodePages.insert(CurrentCodePage);
      }

      if (OpMaxPage != CurrentCodePage) {
        CurrentCodePage = OpMaxPage;
        BlockInfo.CodePages.insert(CurrentCodePage);
      }

      BlockIt->BlockStatus = DecodeInstruction(OpAddress);
      if (HitBadRelocation) {
        // Give up on the whole decode: keep only the current block and drop the derived bookkeeping.
        BlockInfo.TotalInstructionCount = 0;
        BlockInfo.Blocks = {*BlockIt};
        BlockInfo.EntryPoints.clear();
        BlockInfo.CodePages.clear();
        return;
      }

      uint64_t OpEndAddress = OpAddress + DecodeInst->InstSize;
      DecodedMinAddress = std::min(DecodedMinAddress, OpAddress);
      DecodedMaxAddress = std::max(DecodedMaxAddress, OpEndAddress);
      if (OpEndAddress > NextBlockStartAddress) {
        // This instruction would overlap with another so skip adding it to the multiblock
        break;
      }

      EraseBlock = false; // Block contains at least one valid instruction, so unset erase
      ++TotalInstructions;
      ++DecodedSize;
      ++BlockIt->NumInstructions;
      BlockIt->Size += DecodeInst->InstSize;

      // Can not continue this block at all on invalid instruction
      if (BlockIt->BlockStatus != DecodedBlockStatus::SUCCESS) [[unlikely]] {
        if (!EntryBlock && BlockIt->BlockStatus != DecodedBlockStatus::BAD_RELOCATION) {
          // In multiblock configurations, we can early terminate any non-entrypoint blocks with the expectation that this won't get hit.
          // Improves compile-times.
          // Just need to undo additions that this block decoding has caused.
          TotalInstructions -= BlockIt->NumInstructions;
          DecodedSize = BlockStartOffset;
          InstStream -= PCOffset;
          EraseBlock = true;
        } else {
          // The failing instruction stays in the block; only report which kind of failure ended it.
          LogMan::Msg::EFmt("{} instruction in entry block: {:X}",
                            BlockIt->BlockStatus == DecodedBlockStatus::INVALID_INST   ? "Invalid" :
                            BlockIt->BlockStatus == DecodedBlockStatus::NOEXEC_INST    ? "NoExec" :
                            BlockIt->BlockStatus == DecodedBlockStatus::BAD_RELOCATION ? "BadRelocation" :
                                                                                         "PartialDecode",
                            OpAddress);
        }
        break;
      }

      // Check if we need to end the entire multiblock
      FinalInstruction = DecodedSize >= MaxInst || DecodedSize >= DefaultDecodedBufferSize || TotalInstructions >= MaxInst;

      if (FinalInstruction) {
        break;
      }

      if (!InstCanContinue()) {
        if (DecodeInst->TableInfo->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SETS_RIP) {
          // If we have multiblock enabled
          // If the branch target is within our multiblock range then we can keep going on
          // We don't want to short circuit this since we want to calculate our ranges still
          // NOTE: This will invalidate BlockIt, this is fine as we immediately break from the loop and EraseBlock cannot be true
          BlockIt->ForceFullSMCDetection = CTX->AreMonoHacksActive() && IsBranchMonoTailcall(BlockIt->NumInstructions);
          BranchTargetInMultiblockRange();
        }
        break;
      }

      // Advance to the next instruction of this block.
      PCOffset += DecodeInst->InstSize;
      InstStream += DecodeInst->InstSize;
    }

    // NOTE: BlockIt is only valid here in the EraseBlock case
    if (EraseBlock) {
      BlockInfo.Blocks.erase(BlockIt);
    } else {
      // Only blocks that are kept contribute their branch targets to the worklist.
      BlocksToDecode.merge(CurrentBlockTargets);
    }
    CurrentBlockTargets.clear();

    EntryBlock = false;
  }

  BlockInfo.TotalInstructionCount = TotalInstructions;

  // Recompute the entry-point flag for every block from the final EntryPoints set.
  for (auto& Block : BlockInfo.Blocks) {
    Block.IsEntryPoint = BlockInfo.EntryPoints.contains(Block.Entry);
  }
}
// Aggregated result of one decode run, exposed through GetDecodedBlockInfo().
struct DecodedBlockInformation final {
  // Number of instructions kept across all blocks. Value-initialized like the
  // other scalar members so it is never read in an indeterminate state.
  uint64_t TotalInstructionCount {};
  // True when the code was decoded in 64-bit mode.
  bool Is64BitMode {};
  // Decoded blocks, kept sorted by their Entry address.
  fextl::vector<DecodedBlocks> Blocks;
  // Guest addresses that are entry points into the decoded blocks.
  fextl::set<uint64_t> EntryPoints;
  fextl::set<uint64_t> CodePages; // Start addresses of all pages touching the block
};
bool L; // VEX.L bit (if set then 256 bit operation, if unset then scalar or 128-bit operation) }; FEXCore::Core::InternalThreadState* Thread; FEXCore::Context::ContextImpl* CTX; const FEXCore::HLE::SyscallOSABI OSABI {}; FEX_CONFIG_OPT(EnableCodeCacheValidation, ENABLECODECACHEVALIDATION); bool DecodeInstructionImpl(uint64_t PC); DecodedBlockStatus DecodeInstruction(uint64_t PC); void BranchTargetInMultiblockRange(); bool IsBranchMonoTailcall(uint64_t NumInstructions) const; bool InstCanContinue() const; void AddBranchTarget(uint64_t Target); bool CheckRangeExecutable(uint64_t Address, uint64_t Size); uint8_t ReadByte(); std::optional PeekByte(uint8_t Offset); std::pair ReadData(uint8_t Size); void SkipBytes(uint8_t Size) { InstructionSize += Size; } bool NormalOp(const FEXCore::X86Tables::X86InstInfo* Info, uint16_t Op, DecodedHeader Options = {}); bool NormalOpHeader(const FEXCore::X86Tables::X86InstInfo* Info, uint16_t Op); void DecodeREXIfValid(int8_t ExpectedOffset = -1); static constexpr size_t DefaultDecodedBufferSize = 0x10000; FEXCore::X86Tables::DecodedInst* DecodedBuffer {}; Utils::PoolBufferWithTimedRetirement PoolObject; size_t DecodedSize {}; uint64_t ExecutableRangeBase {}; uint64_t ExecutableRangeEnd {}; bool ExecutableRangeWritable {}; bool HitNonExecutableRange {}; bool HitBadRelocation {}; const uint8_t* InstStream {}; IR::OpSize GetGPROpSize() const { return BlockInfo.Is64BitMode ? 
IR::OpSize::i64Bit : IR::OpSize::i32Bit; } static constexpr size_t MAX_INST_SIZE = 15; uint8_t InstructionSize {}; std::array Instruction; uint8_t LastEscapePrefix {}; FEXCore::X86Tables::DecodedInst* DecodeInst; // This is for multiblock data tracking uint64_t EntryPoint {}; uint64_t MaxCondBranchForward {}; uint64_t MaxCondBranchBackwards {~0ULL}; uint64_t SectionMaxAddress {~0ULL}; uint64_t SectionMinAddress {}; uint64_t NextBlockStartAddress {~0ULL}; DecodedBlockInformation BlockInfo; fextl::set CurrentBlockTargets; fextl::set BlocksToDecode; fextl::set VisitedBlocks; fextl::set* ExternalBranches {nullptr}; const fextl::robin_map* Relocations {nullptr}; // ModRM rm decoding using DecodeModRMPtr = void (FEXCore::Frontend::Decoder::*)(X86Tables::DecodedOperand* Operand, X86Tables::ModRMDecoded ModRM); void DecodeModRM_16(X86Tables::DecodedOperand* Operand, X86Tables::ModRMDecoded ModRM); void DecodeModRM_64(X86Tables::DecodedOperand* Operand, X86Tables::ModRMDecoded ModRM); static constexpr std::array DecodeModRMs_Disp { &FEXCore::Frontend::Decoder::DecodeModRM_64, &FEXCore::Frontend::Decoder::DecodeModRM_16, }; const std::array* X87Table; const std::array* VEXTable {}; const std::array* VEXTableGroup {}; const uint8_t* AdjustAddrForSpecialRegion(const uint8_t* _InstStream, uint64_t EntryPoint, uint64_t RIP); }; } // namespace FEXCore::Frontend ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/Fallbacks/F80Fallbacks.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Common/SoftFloat.h" #include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h" #include "Interface/IR/IR.h" #include #include namespace FEXCore::CPU { FEXCORE_PRESERVE_ALL_ATTR static softfloat_state SoftFloatStateFromFCW(uint16_t FCW, bool Force80BitPrecision = false) { softfloat_state State {}; State.detectTininess = softfloat_tininess_afterRounding; State.exceptionFlags = 0; 
State.roundingPrecision = 80; if (!Force80BitPrecision) { auto PC = (FCW >> 8) & 3; switch (PC) { case 0: State.roundingPrecision = 32; break; case 2: State.roundingPrecision = 64; break; case 3: State.roundingPrecision = 80; break; case 1: LOGMAN_MSG_A_FMT("Invalid x87 precision mode, {}", PC); } } auto RC = (FCW >> 10) & 3; switch (RC) { case 0: State.roundingMode = softfloat_round_near_even; break; case 1: State.roundingMode = softfloat_round_min; break; case 2: State.roundingMode = softfloat_round_max; break; case 3: State.roundingMode = softfloat_round_minMag; break; } return State; } FEXCORE_PRESERVE_ALL_ATTR static void HandleX87Exception(const softfloat_state& State, FEXCore::Core::CpuStateFrame* Frame) { // Check for Invalid Operation exception (bit 0 of X87 status word) if (State.exceptionFlags & softfloat_flag_invalid) { Frame->State.flags[FEXCore::X86State::X87FLAG_IE_LOC] = 1; } } // Wrapper for SoftFloat state to handle X87 exceptions class ScopedSoftFloatState { public: FEXCORE_PRESERVE_ALL_ATTR ScopedSoftFloatState(uint16_t FCW, FEXCore::Core::CpuStateFrame* Frame, bool Force80BitPrecision = false) : State(SoftFloatStateFromFCW(FCW, Force80BitPrecision)) , Frame(Frame) {} FEXCORE_PRESERVE_ALL_ATTR ~ScopedSoftFloatState() { HandleX87Exception(State, Frame); } // Disable copy and move to ensure RAII semantics ScopedSoftFloatState(const ScopedSoftFloatState&) = delete; ScopedSoftFloatState& operator=(const ScopedSoftFloatState&) = delete; ScopedSoftFloatState(ScopedSoftFloatState&&) = delete; ScopedSoftFloatState& operator=(ScopedSoftFloatState&&) = delete; softfloat_state State; private: FEXCore::Core::CpuStateFrame* Frame; }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle4(uint16_t FCW, float src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat(&State.State, src); } 
// Compares two 80-bit operands and returns the outcome as a bit mask.
//
// The result has bit IR::FCMP_FLAG_LT set when Src1 < Src2, bit IR::FCMP_FLAG_EQ
// set when they compare equal, and bit IR::FCMP_FLAG_UNORDERED set when FCMP
// reports a NaN operand. FCW configures the SoftFloat state used for the compare.
FEXCORE_PRESERVE_ALL_ATTR static uint64_t handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, FEXCore::Core::CpuStateFrame* Frame) {
  FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1);
  ScopedSoftFloatState State {FCW, Frame};

  // Initialized so the flag tests below are well-defined even if FCMP leaves an output untouched.
  bool eq {};
  bool lt {};
  bool nan {};
  X80SoftFloat::FCMP(&State.State, Src1, Src2, &eq, &lt, &nan);

  uint64_t ResultFlags = 0;
  if (lt) {
    ResultFlags |= (1 << IR::FCMP_FLAG_LT);
  }
  if (nan) {
    ResultFlags |= (1 << IR::FCMP_FLAG_UNORDERED);
  }
  if (eq) {
    ResultFlags |= (1 << IR::FCMP_FLAG_EQ);
  }
  return ResultFlags;
}
AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat(src).ToI32(&State.State); } FEXCORE_PRESERVE_ALL_ATTR static int64_t handle8(uint16_t FCW, VectorRegType src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat(src).ToI64(&State.State); } FEXCORE_PRESERVE_ALL_ATTR static int16_t handle2t(uint16_t FCW, VectorRegType src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; auto rv = extF80_to_i32(&State.State, X80SoftFloat(src), softfloat_round_minMag, false); if (rv > INT16_MAX || rv < INT16_MIN) { ///< Indefinite value for 16-bit conversions. return INT16_MIN; } else { return rv; } } FEXCORE_PRESERVE_ALL_ATTR static int32_t handle4t(uint16_t FCW, VectorRegType src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return extF80_to_i32(&State.State, X80SoftFloat(src), softfloat_round_minMag, false); } FEXCORE_PRESERVE_ALL_ATTR static int64_t handle8t(uint16_t FCW, VectorRegType src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return extF80_to_i64(&State.State, X80SoftFloat(src), softfloat_round_minMag, false); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle2(uint16_t FCW, int16_t src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return X80SoftFloat(src); } FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle4(uint16_t FCW, int32_t src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, 
AccumulatedFloatFallbackCount, 1); return X80SoftFloat(src); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::FRNDINT(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::F2XM1(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::FTAN(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat::FSQRT(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::FSIN(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, 
AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::FCOS(&State.State, Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegPairType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return FEXCore::MakeVectorRegPair(X80SoftFloat::FSIN(&State.State, Src1), X80SoftFloat::FCOS(&State.State, Src1)); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return X80SoftFloat::FXTRACT_EXP(Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return X80SoftFloat::FXTRACT_SIG(Src1); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat::FADD(&State.State, Src1, Src2); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame}; return X80SoftFloat::FSUB(&State.State, Src1, Src2); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, 
// Fallback that divides two 80-bit values via X80SoftFloat::FDIV, using the
// precision and rounding selected by FCW.
FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, FEXCore::Core::CpuStateFrame* Frame) {
  FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1);

  ScopedSoftFloatState SoftState {FCW, Frame};
  softfloat_state* const RawState = &SoftState.State;
  return X80SoftFloat::FDIV(RawState, Src1, Src2);
}
return X80SoftFloat::FREM(&State.State, Src1, Src2); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src1, VectorRegType Src2, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); ScopedSoftFloatState State {FCW, Frame, true}; return X80SoftFloat::FSCALE(&State.State, Src1, Src2); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static double handle(double src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return sin(src); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static double handle(double src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return cos(src); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorScalarF64Pair handle(double src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); double sin, cos; #ifdef _WIN32 sin = ::sin(src); cos = ::cos(src); #else sincos(src, &sin, &cos); #endif return VectorScalarF64Pair {sin, cos}; } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static double handle(double src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return tan(src); } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static double handle(double src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return exp2(src) - 1.0; } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static double handle(double src1, double src2, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); return atan2(src1, 
// Fallback that scales src1 by two raised to the truncated value of src2.
FEXCORE_PRESERVE_ALL_ATTR static double handle(double src1, double src2, FEXCore::Core::CpuStateFrame* Frame) {
  FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1);

  // A zero input is passed through untouched so that its sign (+0.0 or -0.0) is preserved.
  const bool IsZero = src1 == 0.0;
  if (!IsZero) {
    const double Exponent = trunc(src2);
    return src1 * exp2(Exponent);
  }
  return src1;
}
Just leave break; } // Extract the lower 100 values uint8_t Digit = Tmp % 100; // Now divide it for the next iteration Tmp /= 100; uint8_t UpperNibble = Digit / 10; uint8_t LowerNibble = Digit % 10; // Now store the BCD BCD[i] = (UpperNibble << 4) | LowerNibble; } // Set negative flag once converted to x87 BCD[9] = Negative ? 0x80 : 0; return Rv; } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static VectorRegType handle(uint16_t FCW, VectorRegType Src, FEXCore::Core::CpuStateFrame* Frame) { FEXCORE_PROFILE_INSTANT_INCREMENT(Frame->Thread, AccumulatedFloatFallbackCount, 1); uint8_t* Src1 = reinterpret_cast(&Src); uint64_t BCD {}; // We walk through each uint8_t and pull out the BCD encoding // Each 4bit split is a digit // Only 0-9 is supported, A-F results in undefined data // | 4 bit | 4 bit | // | 10s place | 1s place | // EG 0x48 = 48 // EG 0x4847 = 4847 // This gives us an 18digit value encoded in BCD // The last byte lets us know if it negative or not for (size_t i = 0; i < 9; ++i) { uint8_t Digit = Src1[8 - i]; // First shift our last value over BCD *= 100; // Add the tens place digit BCD += (Digit >> 4) * 10; // Add the ones place digit BCD += Digit & 0xF; } // Set negative flag once converted to x87 bool Negative = Src1[9] & 0x80; X80SoftFloat Tmp; Tmp = BCD; Tmp.Top.Sign = Negative; return Tmp; } }; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include namespace FEXCore::IR { enum IROps : uint16_t; } namespace FEXCore::CPU { // Base template for fallback handling. // // Registering and hooking up fallback is currently like so: // // 1. Go to InterpreterFallbacks.cpp and create a template specialization of // the GetFallbackInfo member function. 
//
//    This member function should reasonably define what the fallback you're
//    going to create will take as parameters and return as a result. For example:
//
//    template<>
//    FallbackInfo GetFallbackInfo(X80SoftFloat(*fn)(double), Core::FallbackHandlerIndex Index) {
//      return {FABI_F80_F64, (void*)fn, Index};
//    }
//
//    Defines info about a fallback that takes a double as an argument and
//    returns an X80SoftFloat instance.
//
//    You will also want to define a new FallbackHandlerIndex enum member and use it
//    to set up the new info handler into the Info array in FillFallbackIndexPointers.
//
// 1.1. (potentially optional). Define a new ABI element in the FallbackABI enum.
//      This ABI enum value will be used to tell the JITs how to handle the fallback
//      properly. These enum values specify the return type followed by its argument types.
//
//      So, FABI_I64_F80_F80, for example, indicates that the function will behave
//      as if it were defined as:
//
//      uint64_t fn(X80SoftFloat, X80SoftFloat)
//
// 1.2. (potentially optional). If you needed to define a new enum ABI type like in 1.1, then
//      you need to add the handling for it in the JITs, which can be found in the respective
//      JIT's JIT.cpp file in a function called Op_Unhandled.
//
//      You need to add a new case to the ABI switch statement using the new ABI type
//      and do the necessary moving of data from register-allocated JIT parameters
//      into that platform's registers that respects the calling convention. After this is
//      done, most of the necessary background boilerplate is finished.
//
// 2. Now, make a specialization of this class with a member function named 'handle()'
//    that takes the same parameters as the ones described in the fallback info function
//    specialization.
//
//    For example, if you have the fallback info from the example in step 1, it would be:
//
//    template <>
//    struct OpHandlers {
//      static X80SoftFloat handle(double src) {
//        return ...;
//      }
//    };
//
// 3.
Fill out the behavior of the OpHandler specialization to perform what you would like // the fallback to do. // // 4. Add an implementation of the IR op to the Interpreter that passes through to the // OpHandler implementation. // // 5. Done. // template struct OpHandlers {}; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp ================================================ // SPDX-License-Identifier: MIT #include #include "Interface/Core/Interpreter/InterpreterOps.h" #include "Interface/Core/Interpreter/Fallbacks/F80Fallbacks.h" #include "Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h" #include #include namespace FEXCore::CPU { template static FallbackInfo GetFallbackInfo(R (*fn)(Args...), FEXCore::Core::FallbackHandlerIndex HandlerIndex) { return {FABI_UNKNOWN, HandlerIndex}; } void InterpreterOps::FillFallbackIndexPointers(Core::FallbackABIInfo* Info, uint64_t* ABIHandlers) { Info[Core::OPINDEX_F80CVTTO_4] = {ABIHandlers[FABI_F80_I16_F32_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle4)}; Info[Core::OPINDEX_F80CVTTO_8] = {ABIHandlers[FABI_F80_I16_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle8)}; Info[Core::OPINDEX_F80CVT_4] = {ABIHandlers[FABI_F32_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle4)}; Info[Core::OPINDEX_F80CVT_8] = {ABIHandlers[FABI_F64_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle8)}; Info[Core::OPINDEX_F80CVTINT_2] = {ABIHandlers[FABI_I16_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle2)}; Info[Core::OPINDEX_F80CVTINT_4] = {ABIHandlers[FABI_I32_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle4)}; Info[Core::OPINDEX_F80CVTINT_8] = {ABIHandlers[FABI_I64_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle8)}; Info[Core::OPINDEX_F80CVTINT_TRUNC2] = {ABIHandlers[FABI_I16_I16_F80_PTR], 
reinterpret_cast(&FEXCore::CPU::OpHandlers::handle2t)}; Info[Core::OPINDEX_F80CVTINT_TRUNC4] = {ABIHandlers[FABI_I32_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle4t)}; Info[Core::OPINDEX_F80CVTINT_TRUNC8] = {ABIHandlers[FABI_I64_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle8t)}; Info[Core::OPINDEX_F80CMP] = {ABIHandlers[FABI_I64_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80CVTTOINT_2] = {ABIHandlers[FABI_F80_I16_I16_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle2)}; Info[Core::OPINDEX_F80CVTTOINT_4] = {ABIHandlers[FABI_F80_I16_I32_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle4)}; // Unary Info[Core::OPINDEX_F80ROUND] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80F2XM1] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80TAN] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80SQRT] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80SIN] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80COS] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80SINCOS] = {ABIHandlers[FABI_F80x2_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80XTRACT_EXP] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80XTRACT_SIG] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80BCDSTORE] = {ABIHandlers[FABI_F80_I16_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80BCDLOAD] = {ABIHandlers[FABI_F80_I16_F80_PTR], 
reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; // Binary Info[Core::OPINDEX_F80ADD] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80SUB] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80MUL] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80DIV] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80FYL2X] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80ATAN] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80FPREM1] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80FPREM] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F80SCALE] = {ABIHandlers[FABI_F80_I16_F80_F80_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; // Double Precision Unary Info[Core::OPINDEX_F64SIN] = {ABIHandlers[FABI_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64COS] = {ABIHandlers[FABI_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64SINCOS] = {ABIHandlers[FABI_F64x2_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64TAN] = {ABIHandlers[FABI_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64F2XM1] = {ABIHandlers[FABI_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; // Double Precision Binary Info[Core::OPINDEX_F64ATAN] = {ABIHandlers[FABI_F64_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64FPREM] = 
{ABIHandlers[FABI_F64_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64FPREM1] = {ABIHandlers[FABI_F64_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64FYL2X] = {ABIHandlers[FABI_F64_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_F64SCALE] = {ABIHandlers[FABI_F64_F64_F64_PTR], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; // SSE4.2 string instructions Info[Core::OPINDEX_VPCMPESTRX] = {ABIHandlers[FABI_I32_I64_I64_V128_V128_I16], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; Info[Core::OPINDEX_VPCMPISTRX] = {ABIHandlers[FABI_I32_V128_V128_I16], reinterpret_cast(&FEXCore::CPU::OpHandlers::handle)}; } bool InterpreterOps::GetFallbackHandler(const IR::IROp_Header* IROp, FallbackInfo* Info) { const auto OpSize = IROp->Size; switch (IROp->Op) { case IR::OP_F80CVTTO: { auto Op = IROp->C(); switch (Op->SrcSize) { case IR::OpSize::i32Bit: { *Info = {FABI_F80_I16_F32_PTR, Core::OPINDEX_F80CVTTO_4}; return true; } case IR::OpSize::i64Bit: { *Info = {FABI_F80_I16_F64_PTR, Core::OPINDEX_F80CVTTO_8}; return true; } default: LogMan::Msg::DFmt("Unhandled size: {}", OpSize); } break; } case IR::OP_F80CVT: { switch (OpSize) { case IR::OpSize::i32Bit: { *Info = {FABI_F32_I16_F80_PTR, Core::OPINDEX_F80CVT_4}; return true; } case IR::OpSize::i64Bit: { *Info = {FABI_F64_I16_F80_PTR, Core::OPINDEX_F80CVT_8}; return true; } default: LogMan::Msg::DFmt("Unhandled size: {}", OpSize); } break; } case IR::OP_F80CVTINT: { auto Op = IROp->C(); switch (OpSize) { case IR::OpSize::i16Bit: { if (Op->Truncate) { *Info = {FABI_I16_I16_F80_PTR, Core::OPINDEX_F80CVTINT_TRUNC2}; } else { *Info = {FABI_I16_I16_F80_PTR, Core::OPINDEX_F80CVTINT_2}; } return true; } case IR::OpSize::i32Bit: { if (Op->Truncate) { *Info = {FABI_I32_I16_F80_PTR, Core::OPINDEX_F80CVTINT_TRUNC4}; } else { *Info = {FABI_I32_I16_F80_PTR, Core::OPINDEX_F80CVTINT_4}; } return true; } case 
IR::OpSize::i64Bit: { if (Op->Truncate) { *Info = {FABI_I64_I16_F80_PTR, Core::OPINDEX_F80CVTINT_TRUNC8}; } else { *Info = {FABI_I64_I16_F80_PTR, Core::OPINDEX_F80CVTINT_8}; } return true; } default: LogMan::Msg::DFmt("Unhandled size: {}", OpSize); } break; } case IR::OP_F80CMP: { *Info = {FABI_I64_I16_F80_F80_PTR, (Core::FallbackHandlerIndex)(Core::OPINDEX_F80CMP)}; return true; } case IR::OP_F80CVTTOINT: { auto Op = IROp->C(); switch (Op->SrcSize) { case IR::OpSize::i16Bit: { *Info = {FABI_F80_I16_I16_PTR, Core::OPINDEX_F80CVTTOINT_2}; return true; } case IR::OpSize::i32Bit: { *Info = {FABI_F80_I16_I32_PTR, Core::OPINDEX_F80CVTTOINT_4}; return true; } default: LogMan::Msg::DFmt("Unhandled size: {}", OpSize); } break; } #define COMMON_UNARY_X87_OP(OP) \ case IR::OP_F80##OP: { \ *Info = {FABI_F80_I16_F80_PTR, Core::OPINDEX_F80##OP}; \ return true; \ } #define COMMON_UNARYPAIR_X87_OP(OP) \ case IR::OP_F80##OP: { \ *Info = {FABI_F80x2_I16_F80_PTR, Core::OPINDEX_F80##OP}; \ return true; \ } #define COMMON_BINARY_X87_OP(OP) \ case IR::OP_F80##OP: { \ *Info = {FABI_F80_I16_F80_F80_PTR, Core::OPINDEX_F80##OP}; \ return true; \ } #define COMMON_F64_OP(OP) \ case IR::OP_F64##OP: { \ *Info = GetFallbackInfo(&FEXCore::CPU::OpHandlers::handle, Core::OPINDEX_F64##OP); \ return true; \ } #define COMMON_UNARY_F64_OP(OP) \ case IR::OP_F64##OP: { \ *Info = {FABI_F64_F64_PTR, Core::OPINDEX_F64##OP}; \ return true; \ } #define COMMON_UNARYPAIR_F64_OP(OP) \ case IR::OP_F64##OP: { \ *Info = {FABI_F64x2_F64_PTR, Core::OPINDEX_F64##OP}; \ return true; \ } #define COMMON_BINARY_F64_OP(OP) \ case IR::OP_F64##OP: { \ *Info = {FABI_F64_F64_F64_PTR, Core::OPINDEX_F64##OP}; \ return true; \ } // Unary COMMON_UNARY_X87_OP(ROUND) COMMON_UNARY_X87_OP(F2XM1) COMMON_UNARY_X87_OP(TAN) COMMON_UNARY_X87_OP(SQRT) COMMON_UNARY_X87_OP(SIN) COMMON_UNARY_X87_OP(COS) COMMON_UNARYPAIR_X87_OP(SINCOS) COMMON_UNARY_X87_OP(XTRACT_EXP) COMMON_UNARY_X87_OP(XTRACT_SIG) COMMON_UNARY_X87_OP(BCDSTORE) 
COMMON_UNARY_X87_OP(BCDLOAD) // Binary COMMON_BINARY_X87_OP(ADD) COMMON_BINARY_X87_OP(SUB) COMMON_BINARY_X87_OP(MUL) COMMON_BINARY_X87_OP(DIV) COMMON_BINARY_X87_OP(FYL2X) COMMON_BINARY_X87_OP(ATAN) COMMON_BINARY_X87_OP(FPREM1) COMMON_BINARY_X87_OP(FPREM) COMMON_BINARY_X87_OP(SCALE) // Double Precision Unary COMMON_UNARY_F64_OP(F2XM1) COMMON_UNARY_F64_OP(TAN) COMMON_UNARY_F64_OP(SIN) COMMON_UNARY_F64_OP(COS) COMMON_UNARYPAIR_F64_OP(SINCOS) // Double Precision Binary COMMON_BINARY_F64_OP(FYL2X) COMMON_BINARY_F64_OP(ATAN) COMMON_BINARY_F64_OP(FPREM1) COMMON_BINARY_F64_OP(FPREM) COMMON_BINARY_F64_OP(SCALE) // SSE4.2 Fallbacks case IR::OP_VPCMPESTRX: *Info = {FABI_I32_I64_I64_V128_V128_I16, Core::OPINDEX_VPCMPESTRX}; return true; case IR::OP_VPCMPISTRX: *Info = {FABI_I32_V128_V128_I16, Core::OPINDEX_VPCMPISTRX}; return true; default: break; } return false; } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp ================================================ // SPDX-License-Identifier: MIT #include "Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h" #include "Interface/IR/IR.h" #ifdef ARCHITECTURE_arm64 #include #endif #include namespace FEXCore::CPU { #ifdef ARCHITECTURE_arm64 FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) { const auto is_using_words = (control & 1) != 0; if (is_using_words) { uint16x8_t a = vreinterpretq_u16_u8(data); uint16x8_t VIndexes {}; const uint16x8_t VIndex16 = vdupq_n_u16(8); uint16_t Indexes[8] = { 0, 1, 2, 3, 4, 5, 6, 7, }; memcpy(&VIndexes, Indexes, sizeof(VIndexes)); auto MaskResult = vceqzq_u16(a); auto SelectResult = vbslq_u16(MaskResult, VIndexes, VIndex16); return vminvq_u16(SelectResult); } else { uint8x16_t VIndexes {}; const uint8x16_t VIndex16 = vdupq_n_u8(16); uint8_t Indexes[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; memcpy(&VIndexes, 
Indexes, sizeof(VIndexes)); auto MaskResult = vceqzq_u8(data); auto SelectResult = vbslq_u8(MaskResult, VIndexes, VIndex16); return vminvq_u8(SelectResult); } } #else FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) { const auto* data_u8 = reinterpret_cast(&data); const auto is_using_words = (control & 1) != 0; int32_t length = 0; if (is_using_words) { const auto get_word = [data_u8](int32_t index) { const auto* src = data_u8 + (index * sizeof(uint16_t)); uint16_t element {}; std::memcpy(&element, src, sizeof(uint16_t)); return element; }; while (length < 8 && get_word(length) != 0) { length++; } } else { while (length < 16 && data_u8[length] != 0) { length++; } } return length; } #endif // Essentially the same in terms of behavior with VPCMPESTRX instructions, // with the only difference being that the length of the string is encoded // as part of the data vectors passed in. // // i.e. Length is determined by the presence of a NUL (all-zero) character // within the data. // // If no NUL character exists, then the length of the strings are assumed // to be the max length possible for the given character size specified // in the control flags (16 characters for 8-bit, and 8 characters for 16-bit). 
// FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers::handle(FEXCore::VectorRegType lhs, FEXCore::VectorRegType rhs, uint16_t control) { // Subtract by 1 in order to make validity limits 0-based const auto valid_lhs = GetImplicitLength(lhs, control) - 1; const auto valid_rhs = GetImplicitLength(rhs, control) - 1; __uint128_t lhs_i; memcpy(&lhs_i, &lhs, sizeof(lhs_i)); __uint128_t rhs_i; memcpy(&rhs_i, &rhs, sizeof(rhs_i)); return OpHandlers::MainBody(lhs_i, valid_lhs, rhs_i, valid_rhs, control); } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h" #include "Interface/IR/IR.h" #include "Common/VectorRegType.h" namespace FEXCore::CPU { template<> struct OpHandlers { enum class AggregationOp { EqualAny = 0b00, Ranges = 0b01, EqualEach = 0b10, EqualOrdered = 0b11, }; enum class SourceData { U8, U16, S8, S16, }; enum class Polarity { Positive, Negative, PositiveMasked, NegativeMasked, }; FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(uint64_t RAX, uint64_t RDX, VectorRegType lhs_v, VectorRegType rhs_v, uint16_t control) { __uint128_t lhs; memcpy(&lhs, &lhs_v, sizeof(lhs)); __uint128_t rhs; memcpy(&rhs, &rhs_v, sizeof(rhs)); // Subtract by 1 in order to make validity limits 0-based const auto valid_lhs = GetExplicitLength(RAX, control) - 1; const auto valid_rhs = GetExplicitLength(RDX, control) - 1; return MainBody(lhs, valid_lhs, rhs, valid_rhs, control); } // Main PCMPXSTRX algorithm body. Allows for reuse with both implicit and explicit length variants. 
// Decodes an explicit string length from a register value.
//
// reg     - raw register contents holding the (signed) length
// control - bit 8 set: treat reg as a signed 64-bit value,
//           bit 8 clear: treat the low 32 bits of reg as a signed value;
//           bit 0 set: elements are words (max 8), clear: bytes (max 16)
//
// Returns the absolute value of the length, saturated to the maximum element
// count whenever the signed value lies outside [-max, max].
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) {
  const bool is_64bit_length = ((control >> 8) & 1) != 0;

  // The 32-bit form has to be sign-extended before it is range-checked.
  const int64_t length = is_64bit_length ? static_cast<int64_t>(reg) : static_cast<int64_t>(static_cast<int32_t>(reg));

  const int64_t max_elements = (control & 1) != 0 ? 8 : 16;

  const bool in_range = length >= -max_elements && length <= max_elements;
  if (!in_range) {
    return static_cast<int32_t>(max_elements);
  }

  return static_cast<int32_t>(length < 0 ? -length : length);
}
// Applies the polarity stage to an aggregation result.
//
// value       - intermediate result produced by the aggregation stage
// control     - bits [5:4] select the polarity
// upper_limit - index of the last element for the active element width
// valid_rhs   - index of the last valid rhs element (-1 when rhs is empty)
//
// Negative flips every bit up to and including upper_limit. NegativeMasked
// flips only the bits at or below valid_rhs. Both positive modes return the
// value unchanged.
FEXCORE_PRESERVE_ALL_ATTR static uint32_t HandlePolarity(uint32_t value, uint16_t control, int upper_limit, int valid_rhs) {
  const auto polarity = static_cast<Polarity>((control >> 4) & 0b11);

  if (polarity == Polarity::Negative) {
    const uint32_t all_elements = (2U << upper_limit) - 1;
    return value ^ all_elements;
  }

  if (polarity == Polarity::NegativeMasked) {
    const uint32_t valid_elements = (1U << (valid_rhs + 1)) - 1;
    return value ^ valid_elements;
  }

  // Positive and PositiveMasked are both documented as "IntRes2 = IntRes1".
  return value;
}
Assume operating on two character vectors as unsigned words // // 0 1 2 3 4 5 6 7 // LHS -> [a, b, c, d, e, f, g, n] // RHS -> [z, k, v, c, d, o, p, n] // // With both explicit lengths for each string being 8 (the max length for words), // this would result in an intermediate result like: // // 0b1001'1000 // │ │ │ // 'n' match ───┘ │ │ // │ │ // 'd' match ──────┘ │ // │ // 'c' match ────────┘ // FEXCORE_PRESERVE_ALL_ATTR static uint32_t HandleEqualAny(const __uint128_t& lhs, int32_t valid_lhs, const __uint128_t& rhs, int32_t valid_rhs, uint16_t control) { uint32_t result = 0; for (int j = valid_rhs; j >= 0; j--) { result <<= 1; const int rhs_value = GetElement(rhs, j, control); for (int i = valid_lhs; i >= 0; i--) { const int lhs_value = GetElement(lhs, i, control); result |= static_cast(rhs_value == lhs_value); } } return result; } // Determines if a character falls within a limited range // // Scans through rhs using a range denoted by two elements // in lhs and determines if the respective character in rhs // falls within its range. // // i.e. // lhs_upper_bound >= rhs_value && lhs_lower_bound <= rhs_value // // e.g. 
Assume operating on two character vectors as unsigned words // // 0 1 2 3 4 5 6 7 // LHS -> [a, z, A, Z, 0, 0, 0, 0] // RHS -> [z, k, ., C, M, ;, \, '] // // With LHS's length being 4 and RHS's lenth being 8, // this would result in an intermediate result like: // // 0b0001'1011 // │ │ ││ // 'z' >= 'M' && 'a' <= 'M' ─────┘ │ ││ // │ ││ // 'z' >= 'C' && 'a' <= 'C' ───────┘ ││ // ││ // 'Z' >= 'k' && 'A' <= 'k' ─────────┘│ // │ // 'Z' >= 'z' && 'A' <= 'z' ──────────┘ // FEXCORE_PRESERVE_ALL_ATTR static uint32_t HandleRanges(const __uint128_t& lhs, int32_t valid_lhs, const __uint128_t& rhs, int32_t valid_rhs, uint16_t control) { uint32_t result = 0; for (int j = valid_rhs; j >= 0; j--) { result <<= 1; const int element = GetElement(rhs, j, control); for (int i = (valid_lhs - 1) | 1; i >= 0; i -= 2) { const int upper_bound = GetElement(lhs, i - 0, control); const int lower_bound = GetElement(lhs, i - 1, control); const bool ge = upper_bound >= element; const bool le = lower_bound <= element; result |= static_cast(ge && le); } } return result; } // Determines if each character is equal to one another (string compare) // // Essentially the PCMPXSTRX variant of memcmp/strcmp. Sets the bit of the // resulting mask if both elements are equal to one another. Otherwise // sets it to false. // // e.g. 
Assume operating on two character vectors as unsigned words // // 0 1 2 3 4 5 6 7 // LHS -> [a, b, c, d, e, f, g, n] // RHS -> [a, b, c, d, e, f, e, x] // // With both explicit lengths for each string being 8 (the max length for words), // this would result in an intermediate result like: // // 0b0011'1111 // ││ ││││ // 'f' == 'f' ────┘│ ││││ // │ ││││ // 'e' == 'e' ─────┘ ││││ // ││││ // 'd' == 'd' ───────┘│││ // │││ // 'c' == 'c' ────────┘││ // ││ // 'b' == 'b' ─────────┘│ // │ // 'a' == 'a' ──────────┘ // FEXCORE_PRESERVE_ALL_ATTR static uint32_t HandleEqualEach(const __uint128_t& lhs, int32_t valid_lhs, const __uint128_t& rhs, int32_t valid_rhs, uint16_t control) { const auto upper_limit = (16 >> (control & 1)) - 1; const auto max_valid = std::max(valid_lhs, valid_rhs); const auto min_valid = std::min(valid_lhs, valid_rhs); // All values past the end of string must be forced to true. // (See 4.1.6 Valid/Invalid Override of Comparisons in the Intel Software Development Manual) // So we can calculate this part of the mask ahead of time and set all those to-be bits to true // and then progressively shift them into place over the course of execution. uint32_t result = (1U << (upper_limit - max_valid)) - 1; result <<= (max_valid - min_valid); for (int i = min_valid; i >= 0; i--) { const int lhs_element = GetElement(lhs, i, control); const int rhs_element = GetElement(rhs, i, control); result <<= 1; result |= static_cast(lhs_element == rhs_element); } return result; } // Determines if a substring exists within an overall string // // Somewhat equivalent to the behavior of strstr. // // Sets the corresponding index in the result where a substring is found. // // e.g. 
Assume operating on two character vectors as unsigned words // // 0 1 2 3 4 5 6 7 // LHS -> [b, a, x, z, y, v, o, m] // RHS -> [b, a, d, b, a, n, k, s] // // With the length of LHS being 2 and the length of RHS being 8, we have a composition like: // // Substring to look for // ┌──┴──┐ // LHS -> [b, a, x, z, y, v, o, m] // RHS -> [b, a, d, b, a, n, k, s] // └───────────┬────────────┘ // Entire string to search // // And we end up with a result like: // // 0b0000'1001 // │ │ // At index 3 ───────┘ │ // │ // At index 0 ──────────┘ // FEXCORE_PRESERVE_ALL_ATTR static uint32_t HandleEqualOrdered(const __uint128_t& lhs, int32_t valid_lhs, const __uint128_t& rhs, int32_t valid_rhs, uint16_t control) { const auto upper_limit = (16 >> (control & 1)) - 1; // Edge case! // If we have *no* valid characters in our inner string, then // we need to return the intermediate result as // 0xFF (if operating on words) or 0xFFFF (if operating on bytes) if (valid_lhs == -1) { return (2U << upper_limit) - 1; } uint32_t result = 0; const int initial = valid_rhs == upper_limit ? 
valid_rhs : valid_rhs - valid_lhs; for (int j = initial; j >= 0; j--) { result <<= 1; uint32_t value = 1; const int start = std::min(valid_rhs - j, valid_lhs); for (int i = start; i >= 0; i--) { const int lhs_value = GetElement(lhs, i + 0, control); const int rhs_value = GetElement(rhs, i + j, control); value &= static_cast(lhs_value == rhs_value); } result |= value; } return result; } }; template<> struct OpHandlers { FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(VectorRegType lhs, VectorRegType rhs, uint16_t control); }; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include namespace FEXCore::IR { class IRListView; struct IROp_Header; } // namespace FEXCore::IR namespace FEXCore::CPU { enum FallbackABI { FABI_F80_I16_F32_PTR, FABI_F80_I16_F64_PTR, FABI_F80_I16_I16_PTR, FABI_F80_I16_I32_PTR, FABI_F32_I16_F80_PTR, FABI_F64_I16_F80_PTR, FABI_F64_F64_PTR, FABI_F64_F64_F64_PTR, FABI_I16_I16_F80_PTR, FABI_I32_I16_F80_PTR, FABI_I64_I16_F80_PTR, FABI_I64_I16_F80_F80_PTR, FABI_F80_I16_F80_PTR, FABI_F80_I16_F80_F80_PTR, FABI_F80x2_I16_F80_PTR, FABI_F64x2_F64_PTR, FABI_I32_I64_I64_V128_V128_I16, FABI_I32_V128_V128_I16, FABI_UNKNOWN, }; struct FallbackInfo { FallbackABI ABI; FEXCore::Core::FallbackHandlerIndex HandlerIndex; }; class InterpreterOps { public: static void FillFallbackIndexPointers(Core::FallbackABIInfo* Info, uint64_t* ABIHandlers); static bool GetFallbackHandler(const IR::IROp_Header* IROp, FallbackInfo* Info); }; } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/ALUOps.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 $end_info$ */ #include "CodeEmitter/Emitter.h" #include "FEXCore/IR/IR.h" #include 
"Interface/Context/Context.h" #include "Interface/Core/JIT/JITClass.h" #include "Interface/IR/Passes/RegisterAllocationPass.h" namespace FEXCore::CPU { #define GRD(Node) (IROp->Size <= 4 ? GetDst(Node) : GetDst(Node)) #define GRS(Node) (IROp->Size <= 4 ? GetReg(Node) : GetReg(Node)) #define DEF_BINOP_WITH_CONSTANT(FEXOp, VarOp, ConstOp) \ DEF_OP(FEXOp) { \ auto Op = IROp->C(); \ \ uint64_t Const; \ if (IsInlineConstant(Op->Src2, &Const)) { \ ConstOp(ConvertSize(IROp), GetReg(Node), GetReg(Op->Src1), Const); \ } else { \ VarOp(ConvertSize(IROp), GetReg(Node), GetZeroableReg(Op->Src1), GetReg(Op->Src2)); \ } \ } DEF_BINOP_WITH_CONSTANT(Add, add, add) DEF_BINOP_WITH_CONSTANT(Sub, sub, sub) DEF_BINOP_WITH_CONSTANT(AddWithFlags, adds, adds) DEF_BINOP_WITH_CONSTANT(SubWithFlags, subs, subs) DEF_BINOP_WITH_CONSTANT(Or, orr, orr) DEF_BINOP_WITH_CONSTANT(And, and_, and_) DEF_BINOP_WITH_CONSTANT(Andn, bic, bic) DEF_BINOP_WITH_CONSTANT(Xor, eor, eor) DEF_BINOP_WITH_CONSTANT(Lshl, lslv, lsl) DEF_BINOP_WITH_CONSTANT(Lshr, lsrv, lsr) DEF_BINOP_WITH_CONSTANT(Ror, rorv, ror) DEF_OP(Constant) { auto Op = IROp->C(); auto Dst = GetReg(Node); const auto PadType = [Pad = Op->Pad]() { switch (Pad) { case IR::ConstPad::NoPad: return CPU::Arm64Emitter::PadType::NOPAD; case IR::ConstPad::DoPad: return CPU::Arm64Emitter::PadType::DOPAD; default: return CPU::Arm64Emitter::PadType::AUTOPAD; } }(); LoadConstant(ARMEmitter::Size::i64Bit, Dst, Op->Constant, PadType, Op->MaxBytes); } DEF_OP(EntrypointOffset) { auto Op = IROp->C(); auto Constant = Entry + Op->Offset; uint64_t Mask = ~0ULL; const auto OpSize = IROp->Size; if (OpSize == IR::OpSize::i32Bit) { Mask = 0xFFFF'FFFFULL; } InsertGuestRIPMove(GetReg(Node), Constant & Mask); } DEF_OP(InlineConstant) { // nop } DEF_OP(InlineEntrypointOffset) { // nop } DEF_OP(CycleCounter) { auto Op = IROp->C(); if (CTX->HostFeatures.SupportsECV && Op->SelfSynchronizingLoads) { // CNTVCTSS_EL0 is "self-synchronizing", which means loads can't speculate past 
// Emits a flags-only add of Src1 and Src2: every path ends in a `cmn`, and no
// destination register is requested for the node.
DEF_OP(AddNZCV) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);

  auto Src1 = GetReg(Op->Src1);

  uint64_t Const;
  if (IsInlineConstant(Op->Src2, &Const)) {
    // The inline-constant form is only expected for 32-bit and larger ops;
    // smaller sizes would need the shifted form below.
    LOGMAN_THROW_A_FMT(IROp->Size >= IR::OpSize::i32Bit, "Constant not allowed here");
    cmn(EmitSize, Src1, Const);
  } else if (IROp->Size < IR::OpSize::i32Bit) {
    // Sub-32-bit op: both operands are moved up by (32 - width) bits, Src1 via
    // an explicit lsl into TMP1 and Src2 via the shifted-register operand.
    // NOTE(review): presumably so that NZCV reflects the narrow width, as the
    // "Shift to fix flags for <32-bit ops" comment in SubNZCV suggests.
    unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);

    lsl(ARMEmitter::Size::i32Bit, TMP1, Src1, Shift);
    cmn(EmitSize, TMP1, GetReg(Op->Src2), ARMEmitter::ShiftType::LSL, Shift);
  } else {
    // 32/64-bit register-register form needs no adjustment.
    cmn(EmitSize, Src1, GetReg(Op->Src2));
  }
}
// Emits a flag-setting test of (Src1 & Src2) without producing a result
// register. 32/64-bit sizes use `tst` directly; smaller sizes use the
// and + shifted `cmn` sequence described below.
DEF_OP(TestNZ) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);

  uint64_t Const;
  auto Src1 = GetReg(Op->Src1);

  // Shift the sign bit into place, clearing out the garbage in upper bits.
  // Adding zero does an effective test, setting NZ according to the result and
  // zeroing CV.
  if (IROp->Size < IR::OpSize::i32Bit) {
    // Cheaper to and+cmn than to lsl+lsl+tst, so do the and ourselves if
    // needed.
    // When both sources are the same node the AND is skipped and Src1 is
    // tested as-is.
    if (Op->Src1 != Op->Src2) {
      if (IsInlineConstant(Op->Src2, &Const)) {
        and_(EmitSize, TMP1, Src1, Const);
      } else {
        auto Src2 = GetReg(Op->Src2);
        and_(EmitSize, TMP1, Src1, Src2);
      }

      Src1 = TMP1;
    }

    // zr + (Src1 << Shift): the narrow value's top bit lands in bit 31.
    unsigned Shift = 32 - IR::OpSizeAsBits(IROp->Size);
    cmn(EmitSize, ARMEmitter::Reg::zr, Src1, ARMEmitter::ShiftType::LSL, Shift);
  } else {
    if (IsInlineConstant(Op->Src2, &Const)) {
      tst(EmitSize, Src1, Const);
    } else {
      const auto Src2 = GetReg(Op->Src2);
      tst(EmitSize, Src1, Src2);
    }
  }
}
LOGMAN_THROW_A_FMT(!(Const & ~Mask), "constant is already masked"); tst(EmitSize, Src1, Const); } else { const auto Src2 = GetReg(Op->Src2); if (Src1 == Src2) { tst(EmitSize, Src1 /* Src2 */, Mask); } else { and_(EmitSize, TMP1, Src1, Src2); tst(EmitSize, TMP1, Mask); } } } DEF_OP(SubShift) { auto Op = IROp->C(); sub(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2), ConvertIRShiftType(Op->Shift), Op->ShiftAmount); } DEF_OP(SubNZCV) { auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto EmitSize = ConvertSize(IROp); uint64_t Const; if (IsInlineConstant(Op->Src2, &Const)) { LOGMAN_THROW_A_FMT(OpSize >= IR::OpSize::i32Bit, "Constant not allowed here"); cmp(EmitSize, GetReg(Op->Src1), Const); } else { unsigned Shift = OpSize < IR::OpSize::i32Bit ? (32 - IR::OpSizeAsBits(OpSize)) : 0; ARMEmitter::Register ShiftedSrc1 = GetZeroableReg(Op->Src1); // Shift to fix flags for <32-bit ops. // Any shift of zero is still zero so optimize out silly zero shifts. if (OpSize < IR::OpSize::i32Bit && ShiftedSrc1 != ARMEmitter::Reg::zr) { lsl(ARMEmitter::Size::i32Bit, TMP1, ShiftedSrc1, Shift); ShiftedSrc1 = TMP1; } if (OpSize < IR::OpSize::i32Bit) { cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2), ARMEmitter::ShiftType::LSL, Shift); } else { cmp(EmitSize, ShiftedSrc1, GetReg(Op->Src2)); } } } DEF_OP(CmpPairZ) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); // Save NZCV mrs(TMP1, ARMEmitter::SystemRegister::NZCV); // Compare, setting Z and clobbering NzCV cmp(EmitSize, GetReg(Op->Src1Lo), GetReg(Op->Src2Lo)); ccmp(EmitSize, GetReg(Op->Src1Hi), GetReg(Op->Src2Hi), ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ); // Restore NzCV if (CTX->HostFeatures.SupportsFlagM) { rmif(TMP1, 0, 0xb /* NzCV */); } else { cset(ARMEmitter::Size::i32Bit, TMP2, ARMEmitter::Condition::CC_EQ); bfi(ARMEmitter::Size::i32Bit, TMP1, TMP2, 30 /* lsb: Z */, 1); msr(ARMEmitter::SystemRegister::NZCV, TMP1); } } DEF_OP(CarryInvert) { 
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
  cfinv();
}

// Emits `rmif` on the 64-bit view of Src with the IR-provided rotate and
// mask; requires FlagM support.
DEF_OP(RmifNZCV) {
  auto Op = IROp->C();
  LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
  rmif(GetZeroableReg(Op->Src).X(), Op->Rotate, Op->Mask);
}

// Emits `setf8` or `setf16` on the 32-bit view of Src, chosen by operation
// size; requires FlagM support and an 8-bit or 16-bit size.
DEF_OP(SetSmallNZV) {
  auto Op = IROp->C();
  LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
  const auto OpSize = IROp->Size;
  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i8Bit || OpSize == IR::OpSize::i16Bit, "Unsupported {} size: {}", __func__, OpSize);
  if (OpSize == IR::OpSize::i8Bit) {
    setf8(GetReg(Op->Src).W());
  } else {
    setf16(GetReg(Op->Src).W());
  }
}

// Emits `axflag` when FlagM2 is available, otherwise a two-instruction
// emulation that takes its input from the op's first argument.
DEF_OP(AXFlag) {
  if (CTX->HostFeatures.SupportsFlagM2) {
    axflag();
  } else {
    // AXFLAG is defined in the Arm spec as
    //
    //   gt: nzCv -> nzCv
    //   lt: Nzcv -> nzcv  <==>  1 + 0
    //   eq: nZCv -> nZCv  <==>  1 + (~0)
    //   un: nzCV -> nZcv  <==>  0 + 0
    //
    // For the latter 3 cases, we therefore get the right NZCV by adding V_inv
    // to (eq ? ~0 : 0). The remaining case is forced with ccmn.
    auto V_inv = GetReg(IROp->Args[0]);
    csetm(ARMEmitter::Size::i64Bit, TMP1, ARMEmitter::Condition::CC_EQ);
    ccmn(ARMEmitter::Size::i64Bit, V_inv, TMP1, ARMEmitter::StatusFlags {0x2} /* nzCv */, ARMEmitter::Condition::CC_LE);
  }
}

// Folds the low 8 bits of Raw down to a single parity bit in bit 0 of the
// node's register, optionally inverted and optionally masked to one bit.
DEF_OP(Parity) {
  auto Op = IROp->C();
  auto Raw = GetReg(Op->Raw);
  auto Dest = GetReg(Node);

  // Cascade to calculate parity of bottom 8-bits to bottom bit.
  eor(ARMEmitter::Size::i32Bit, TMP1, Raw, Raw, ARMEmitter::ShiftType::LSR, 4);
  eor(ARMEmitter::Size::i32Bit, TMP1, TMP1, TMP1, ARMEmitter::ShiftType::LSR, 2);

  if (Op->Invert) {
    eon(ARMEmitter::Size::i32Bit, Dest, TMP1, TMP1, ARMEmitter::ShiftType::LSR, 1);
  } else {
    eor(ARMEmitter::Size::i32Bit, Dest, TMP1, TMP1, ARMEmitter::ShiftType::LSR, 1);
  }

  // The above sequence leaves garbage in the upper bits.
if (Op->Mask) {
    and_(ARMEmitter::Size::i32Bit, Dest, Dest, 1);
  }
}

// Emits `ccmn` of Src1 against Src2 (register or inline constant) under the
// mapped IR condition, with Op->FalseNZCV as the alternative flag value.
DEF_OP(CondAddNZCV) {
  auto Op = IROp->C();

  ARMEmitter::StatusFlags Flags = (ARMEmitter::StatusFlags)Op->FalseNZCV;
  uint64_t Const = 0;
  auto Src1 = GetZeroableReg(Op->Src1);

  if (IsInlineConstant(Op->Src2, &Const)) {
    ccmn(ConvertSize48(IROp), Src1, Const, Flags, MapCC(Op->Cond));
  } else {
    ccmn(ConvertSize48(IROp), Src1, GetReg(Op->Src2), Flags, MapCC(Op->Cond));
  }
}

// Emits `ccmp` of Src1 against Src2 (register or inline constant) under the
// mapped IR condition, with Op->FalseNZCV as the alternative flag value.
DEF_OP(CondSubNZCV) {
  auto Op = IROp->C();

  ARMEmitter::StatusFlags Flags = (ARMEmitter::StatusFlags)Op->FalseNZCV;
  uint64_t Const = 0;
  auto Src1 = GetZeroableReg(Op->Src1);

  if (IsInlineConstant(Op->Src2, &Const)) {
    ccmp(ConvertSize48(IROp), Src1, Const, Flags, MapCC(Op->Cond));
  } else {
    ccmp(ConvertSize48(IROp), Src1, GetReg(Op->Src2), Flags, MapCC(Op->Cond));
  }
}

// Emits `neg` when the condition is AL (always), otherwise `cneg` under the
// mapped IR condition.
DEF_OP(Neg) {
  auto Op = IROp->C();

  if (Op->Cond == IR::CondClass::AL) {
    neg(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src));
  } else {
    cneg(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src), MapCC(Op->Cond));
  }
}

// Emits `mul` of Src1 and Src2 into the node's register.
DEF_OP(Mul) {
  auto Op = IROp->C();
  mul(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2));
}

// Emits the same `mul` as Mul.
DEF_OP(UMul) {
  auto Op = IROp->C();
  mul(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2));
}

// Emits `umull`: 32-bit views of both sources, 64-bit view of the destination.
DEF_OP(UMull) {
  auto Op = IROp->C();
  umull(GetReg(Node).X(), GetReg(Op->Src1).W(), GetReg(Op->Src2).W());
}

// Emits `smull`: 32-bit views of both sources, 64-bit view of the destination.
DEF_OP(SMull) {
  auto Op = IROp->C();
  smull(GetReg(Node).X(), GetReg(Op->Src1).W(), GetReg(Op->Src2).W());
}

// Signed multiply returning the upper half of the product.
// Supports 32-bit and 64-bit operation sizes only.
DEF_OP(MulH) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);
  const auto Src2 = GetReg(Op->Src2);

  if (OpSize == IR::OpSize::i32Bit) {
    // Sign-extend both operands to 64 bits, form the full 64-bit product and
    // extract bits [63:32]. The multiply and the extract must both be 64-bit:
    // a 32-bit multiply throws away the high half, and a bitfield starting
    // at bit 32 does not exist in a 32-bit register. Mirrors UMulH.
    sxtw(TMP1, Src1.W());
    sxtw(TMP2, Src2.W());
    mul(ARMEmitter::Size::i64Bit, Dst, TMP1, TMP2);
    ubfx(ARMEmitter::Size::i64Bit, Dst, Dst, 32, 32);
  } else {
    smulh(Dst.X(), Src1.X(), Src2.X());
  }
}

// Unsigned multiply returning the upper half of the product.
DEF_OP(UMulH)
{
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);
  const auto Src2 = GetReg(Op->Src2);

  if (OpSize == IR::OpSize::i32Bit) {
    // Zero-extend both operands, take the full 64-bit product and extract
    // bits [63:32].
    uxtw(ARMEmitter::Size::i64Bit, TMP1, Src1);
    uxtw(ARMEmitter::Size::i64Bit, TMP2, Src2);
    mul(ARMEmitter::Size::i64Bit, Dst, TMP1, TMP2);
    ubfx(ARMEmitter::Size::i64Bit, Dst, Dst, 32, 32);
  } else {
    umulh(Dst.X(), Src1.X(), Src2.X());
  }
}

// Emits `orr` of Src1 with Src2 shifted left by Op->BitShift. An inline
// constant Src2 is shifted here, before being handed to the emitter.
DEF_OP(Orlshl) {
  auto Op = IROp->C();
  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);

  uint64_t Const;
  if (IsInlineConstant(Op->Src2, &Const)) {
    orr(ConvertSize(IROp), Dst, Src1, Const << Op->BitShift);
  } else {
    const auto Src2 = GetReg(Op->Src2);
    orr(ConvertSize(IROp), Dst, Src1, Src2, ARMEmitter::ShiftType::LSL, Op->BitShift);
  }
}

// Emits `orr` of Src1 with Src2 shifted right (logical) by Op->BitShift. An
// inline constant Src2 is shifted here, before being handed to the emitter.
DEF_OP(Orlshr) {
  auto Op = IROp->C();
  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);

  uint64_t Const;
  if (IsInlineConstant(Op->Src2, &Const)) {
    orr(ConvertSize(IROp), Dst, Src1, Const >> Op->BitShift);
  } else {
    const auto Src2 = GetReg(Op->Src2);
    orr(ConvertSize(IROp), Dst, Src1, Src2, ARMEmitter::ShiftType::LSR, Op->BitShift);
  }
}

// Emits `orn` of Src1 with Src2 rotated right by Op->BitShift.
DEF_OP(Ornror) {
  auto Op = IROp->C();
  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);
  const auto Src2 = GetReg(Op->Src2);

  orn(ConvertSize(IROp), Dst, Src1, Src2, ARMEmitter::ShiftType::ROR, Op->BitShift);
}

// Bitwise AND of Src1 and Src2 (register or inline constant) into the node's
// register, also updating flags.
DEF_OP(AndWithFlags) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto EmitSize = ConvertSize(IROp);

  uint64_t Const;
  const auto Dst = GetReg(Node);
  auto Src1 = GetReg(Op->Src1);

  // See TestNZ
  if (OpSize < IR::OpSize::i32Bit) {
    if (IsInlineConstant(Op->Src2, &Const)) {
      and_(EmitSize, Dst, Src1, Const);
    } else {
      auto Src2 = GetReg(Op->Src2);

      if (Src1 != Src2) {
        and_(EmitSize, Dst, Src1, Src2);
      } else if (Dst != Src1) {
        // Both sources are the same register, so the AND is just a copy.
        mov(ARMEmitter::Size::i64Bit, Dst, Src1);
      }
    }

    unsigned Shift = 32 -
IR::OpSizeAsBits(OpSize);
    cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
  } else {
    if (IsInlineConstant(Op->Src2, &Const)) {
      ands(EmitSize, Dst, Src1, Const);
    } else {
      const auto Src2 = GetReg(Op->Src2);
      ands(EmitSize, Dst, Src1, Src2);
    }
  }
}

// Emits `and` into the node's register from Src1 and Src2, forwarding the
// IR shift type and shift amount to the emitter.
DEF_OP(AndShift) {
  auto Op = IROp->C();
  and_(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

// Emits `eor` into the node's register from Src1 and Src2, forwarding the
// IR shift type and shift amount to the emitter.
DEF_OP(XorShift) {
  auto Op = IROp->C();
  eor(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

// Emits `eon` into the node's register from Src1 and Src2, forwarding the
// IR shift type and shift amount to the emitter.
DEF_OP(XornShift) {
  auto Op = IROp->C();
  eon(ConvertSize48(IROp), GetReg(Node), GetReg(Op->Src1), GetReg(Op->Src2), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

// Arithmetic shift right of Src1 by Src2 (register or inline constant).
// Sub-32-bit sizes sign-extend the source into TMP1 first and zero-extend
// the result back down to the operation size afterwards.
DEF_OP(Ashr) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);

  uint64_t Const;
  if (IsInlineConstant(Op->Src2, &Const)) {
    if (OpSize >= IR::OpSize::i32Bit) {
      asr(EmitSize, Dst, Src1, (unsigned int)Const);
    } else {
      sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
      asr(EmitSize, Dst, TMP1, (unsigned int)Const);
      ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
    }
  } else {
    const auto Src2 = GetReg(Op->Src2);
    if (OpSize >= IR::OpSize::i32Bit) {
      asrv(EmitSize, Dst, Src1, Src2);
    } else {
      sbfx(EmitSize, TMP1, Src1, 0, IR::OpSizeAsBits(OpSize));
      asrv(EmitSize, Dst, TMP1, Src2);
      ubfx(EmitSize, Dst, Dst, 0, IR::OpSizeAsBits(OpSize));
    }
  }
}

// Updates flags (and the PF value held in the node's register) after a
// variable shift. Op->Result is the already-shifted value, Src1 the unshifted
// input and Src2 the shift count. Everything is skipped when the masked
// count is zero.
DEF_OP(ShiftFlags) {
  auto Op = IROp->C();
  const auto OpSize = Op->Size;
  const auto EmitSize = OpSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  const auto PFOutput = GetReg(Node);
  const auto PFInput = GetReg(Op->PFInput);
  const auto Dst = GetReg(Op->Result);
  const auto Src1 = GetReg(Op->Src1);
  const auto Src2 = GetReg(Op->Src2);

  // If the output register aliases any input still needed below, build the
  // new PF value in TMP4 and copy it out at the very end.
  bool PFBlocked = (PFOutput == Dst) || (PFOutput == Src1) || (PFOutput == Src2);
  const auto PFTemp = PFBlocked ? TMP4 : PFOutput;

  // Set the output outside the branch to avoid needing an extra leg of the
  // branch. We specifically do not hardcode the PF register anywhere (relying
  // on a tied SRA register instead) to avoid fighting with RA.
  if (PFTemp != PFInput) {
    mov(ARMEmitter::Size::i64Bit, PFTemp, PFInput);
  }

  // We need to mask the source before comparing it. We don't just skip flag
  // updates for Src2=0 but anything that masks to zero.
  and_(ARMEmitter::Size::i32Bit, TMP1, Src2, OpSize == IR::OpSize::i64Bit ? 0x3f : 0x1f);

  ARMEmitter::ForwardLabel Done;
  (void)cbz(EmitSize, TMP1, &Done);
  {
    // PF/SF/ZF/OF
    if (OpSize >= IR::OpSize::i32Bit) {
      ands(EmitSize, PFTemp, Dst, Dst);
    } else {
      unsigned Shift = 32 - (IR::OpSizeToSize(OpSize) * 8);
      cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
      mov(ARMEmitter::Size::i64Bit, PFTemp, Dst);
    }

    // CFWord/CFBit name the register and bit position holding the new CF.
    auto CFWord = TMP1;
    unsigned CFBit = 0;

    // Extract the last bit shifted in to CF
    if (Op->Shift == IR::ShiftType::LSL) {
      if (OpSize >= IR::OpSize::i32Bit) {
        neg(EmitSize, CFWord, Src2);
        lsrv(EmitSize, CFWord, Src1, CFWord);
      } else {
        // Sub-32-bit left shifts: the carried-out bit sits just above the
        // operation size in the shifted result itself.
        CFWord = Dst.X();
        CFBit = IR::OpSizeToSize(OpSize) * 8;
      }
    } else {
      sub(ARMEmitter::Size::i64Bit, CFWord, Src2, 1);
      lsrv(EmitSize, CFWord, Src1, CFWord);
    }

    if (Op->InvertCF) {
      mvn(ARMEmitter::Size::i64Bit, TMP1, CFWord);
      CFWord = TMP1;
    }

    bool SetOF = Op->Shift != IR::ShiftType::ASR;
    if (SetOF) {
      // Only defined when Shift is 1 else undefined
      // OF flag is set if a sign change occurred
      eor(EmitSize, TMP3, Src1, Dst);
    }

    if (CTX->HostFeatures.SupportsFlagM) {
      // CFBit is unsigned, so CFBit == 0 wraps to a rotate of 63.
      rmif(CFWord, (CFBit - 1) % 64, (1 << 1) /* C */);
      if (SetOF) {
        rmif(TMP3, IR::OpSizeToSize(OpSize) * 8 - 1, (1 << 0) /* V */);
      }
    } else {
      // Without FlagM: read NZCV, insert C (bit 29) and V (bit 28) by hand,
      // then write it back.
      mrs(TMP2, ARMEmitter::SystemRegister::NZCV);
      if (CFBit != 0) {
        lsr(ARMEmitter::Size::i64Bit, TMP1, CFWord, CFBit);
        CFWord = TMP1;
      }
      bfi(ARMEmitter::Size::i32Bit, TMP2, CFWord, 29 /* C */, 1);

      if (SetOF) {
        lsr(EmitSize, TMP3, TMP3, IR::OpSizeToSize(OpSize) * 8 - 1);
        bfi(ARMEmitter::Size::i32Bit, TMP2, TMP3, 28 /* V */, 1);
      }

      msr(ARMEmitter::SystemRegister::NZCV, TMP2);
    }
  }

  (void)Bind(&Done);

  // TODO: Make RA less dumb so this can't happen (e.g. with late-kill).
  if (PFOutput != PFTemp) {
    mov(ARMEmitter::Size::i64Bit, PFOutput, PFTemp);
  }
}

// Updates C and V after a variable rotate. Op->Result is the already-rotated
// value and Op->Shift the rotate count.
DEF_OP(RotateFlags) {
  auto Op = IROp->C();
  const auto Result = GetReg(Op->Result);
  const auto Shift = GetReg(Op->Shift);
  const bool Left = Op->Left;
  const auto EmitSize = Op->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  // If shift=0, flags are unaffected. Wrap the whole implementation in a cbz.
  ARMEmitter::ForwardLabel Done;
  (void)cbz(EmitSize, Shift, &Done);
  {
    // Extract the last bit shifted in to CF
    const auto BitSize = IR::OpSizeToSize(Op->Size) * 8;
    unsigned CFBit = Left ? 0 : BitSize - 1;

    // For ROR, OF is the XOR of the new CF bit and the most significant bit of the result.
    // For ROL, OF is the LSB and MSB XOR'd together.
    // OF is architecturally only defined for 1-bit rotate.
    eor(ARMEmitter::Size::i64Bit, TMP1, Result, Result, ARMEmitter::ShiftType::LSR, Left ? BitSize - 1 : 1);
    unsigned OFBit = Left ? 0 : BitSize - 2;

    // Invert result so we get inverted carry.
mvn(ARMEmitter::Size::i64Bit, TMP2, Result); if (CTX->HostFeatures.SupportsFlagM) { rmif(TMP2, (CFBit - 1) % 64, 1 << 1 /* nzCv */); rmif(TMP1, OFBit, 1 << 0 /* nzcV */); } else { if (OFBit != 0) { lsr(EmitSize, TMP1, TMP1, OFBit); } if (CFBit != 0) { lsr(EmitSize, TMP2, TMP2, CFBit); } mrs(TMP3, ARMEmitter::SystemRegister::NZCV); bfi(ARMEmitter::Size::i32Bit, TMP3, TMP1, 28 /* V */, 1); bfi(ARMEmitter::Size::i32Bit, TMP3, TMP2, 29 /* C */, 1); msr(ARMEmitter::SystemRegister::NZCV, TMP3); } } (void)Bind(&Done); } DEF_OP(Extr) { auto Op = IROp->C(); const auto Dst = GetReg(Node); const auto Upper = GetReg(Op->Upper); const auto Lower = GetReg(Op->Lower); extr(ConvertSize48(IROp), Dst, Upper, Lower, Op->LSB); } DEF_OP(PDep) { auto Op = IROp->C(); const auto EmitSize = ConvertSize48(IROp); const auto Dest = GetReg(Node); // We can't clobber these const auto OrigInput = GetReg(Op->Input); const auto OrigMask = GetReg(Op->Mask); if (CTX->HostFeatures.SupportsSVEBitPerm) { // SVE added support for PDEP but it needs to be done in a vector register. if (EmitSize == ARMEmitter::Size::i32Bit) { fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), OrigInput.W()); fmov(ARMEmitter::Size::i32Bit, VTMP2.S(), OrigMask.W()); bdep(ARMEmitter::SubRegSize::i32Bit, VTMP1.Z(), VTMP1.Z(), VTMP2.Z()); umov(Dest, VTMP1, 0); } else { fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), OrigInput.X()); fmov(ARMEmitter::Size::i64Bit, VTMP2.D(), OrigMask.X()); bdep(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), VTMP1.Z(), VTMP2.Z()); umov(Dest, VTMP1, 0); } } else { // PDep implementation follows the ideas from // http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set* // bits only, which will be faster than the naive implementation as long as // there are enough holes in the mask. // // The specific arm64 assembly used is based on the sequence that clang // generates for the C code, giving context to the scheduling yielding better // ILP than I would do by hand. 
The registers are allocated by hand however, // to fit within the tight constraints we have here withot spilling. Also, we // use cbz/cbnz for conditional branching to avoid clobbering NZCV. // So we have shadow as temporaries const auto Input = TMP1.R(); const auto Mask = TMP2.R(); // these get used variously as scratch const auto T0 = TMP3.R(); const auto T1 = TMP4.R(); ARMEmitter::BackwardLabel NextBit; ARMEmitter::ForwardLabel Done; // First, copy the input/mask, since we'll be clobbering. Copy as 64-bit to // make this 0-uop on Firestorm. mov(ARMEmitter::Size::i64Bit, Input, OrigInput); mov(ARMEmitter::Size::i64Bit, Mask, OrigMask); // Now, they're copied, so we can start setting Dest (even if it overlaps with // one of them). Handle early exit case mov(EmitSize, Dest, 0); (void)cbz(EmitSize, OrigMask, &Done); // Setup for first iteration neg(EmitSize, T0, Mask); and_(EmitSize, T0, T0, Mask); // Main loop (void)Bind(&NextBit); sbfx(EmitSize, T1, Input, 0, 1); eor(EmitSize, Mask, Mask, T0); and_(EmitSize, T0, T1, T0); neg(EmitSize, T1, Mask); orr(EmitSize, Dest, Dest, T0); lsr(EmitSize, Input, Input, 1); and_(EmitSize, T0, Mask, T1); (void)cbnz(EmitSize, T0, &NextBit); // All done with nothing to do. (void)Bind(&Done); } } DEF_OP(PExt) { auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto OpSizeBitsM1 = IR::OpSizeAsBits(OpSize) - 1; const auto EmitSize = ConvertSize48(IROp); const auto Input = GetReg(Op->Input); const auto Mask = GetReg(Op->Mask); const auto Dest = GetReg(Node); if (CTX->HostFeatures.SupportsSVEBitPerm) { // SVE added support for PEXT but it needs to be done in a vector register. 
if (EmitSize == ARMEmitter::Size::i32Bit) {
      fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Input.W());
      fmov(ARMEmitter::Size::i32Bit, VTMP2.S(), Mask.W());
      bext(ARMEmitter::SubRegSize::i32Bit, VTMP1.Z(), VTMP1.Z(), VTMP2.Z());
      umov(Dest, VTMP1, 0);
    } else {
      fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), Input.X());
      fmov(ARMEmitter::Size::i64Bit, VTMP2.D(), Mask.X());
      bext(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), VTMP1.Z(), VTMP2.Z());
      umov(Dest, VTMP1, 0);
    }
  } else {
    // Software fallback working on copies of the mask and input.
    const auto MaskReg = TMP1;
    const auto BitReg = TMP2;
    const auto ValueReg = TMP3;

    ARMEmitter::ForwardLabel EarlyExit;
    ARMEmitter::BackwardLabel NextBit;
    ARMEmitter::ForwardLabel Done;

    (void)cbz(EmitSize, Mask, &EarlyExit);
    mov(EmitSize, MaskReg, Mask);
    mov(EmitSize, ValueReg, Input);
    mov(EmitSize, Dest, ARMEmitter::Reg::zr);

    // Main loop
    // Each iteration shifts the highest remaining mask bit (and the matching
    // value bit) up to the top bit, shifts that value bit into the bottom of
    // Dest via extr, then clears the consumed mask bit.
    (void)Bind(&NextBit);
    (void)cbz(EmitSize, MaskReg, &Done);
    clz(EmitSize, BitReg, MaskReg);
    lslv(EmitSize, ValueReg, ValueReg, BitReg);
    lslv(EmitSize, MaskReg, MaskReg, BitReg);
    extr(EmitSize, Dest, Dest, ValueReg, OpSizeBitsM1);
    bfc(EmitSize, MaskReg, OpSizeBitsM1, 1);
    (void)b(&NextBit);

    // Early exit
    (void)Bind(&EarlyExit);
    mov(EmitSize, Dest, ARMEmitter::Reg::zr);

    // All done with nothing to do.
    (void)Bind(&Done);
  }
}

// Signed divide producing both quotient and remainder. When Op->Upper is
// valid the dividend is the double-width value Upper:Lower.
DEF_OP(Div) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Quotient = GetReg(Op->OutQuotient);
  const auto Remainder = GetReg(Op->OutRemainder);
  auto Lower = GetReg(Op->Lower);
  auto Divisor = GetReg(Op->Divisor);

  if (Op->Upper.IsInvalid()) {
    // Single-width divide. 8-bit and 16-bit operands are sign-extended into
    // temporaries first.
    const auto EmitSize = ConvertSize(IROp);
    if (OpSize == IR::OpSize::i8Bit) {
      sxtb(EmitSize, TMP1, Lower);
      sxtb(EmitSize, TMP2, Divisor);

      Lower = TMP1;
      Divisor = TMP2;
    } else if (OpSize == IR::OpSize::i16Bit) {
      sxth(EmitSize, TMP1, Lower);
      sxth(EmitSize, TMP2, Divisor);

      Lower = TMP1;
      Divisor = TMP2;
    }

    sdiv(EmitSize, Quotient, Lower, Divisor);
    msub(EmitSize, Remainder, Quotient, Divisor, Lower);
    return;
  }

  const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
  const auto Upper = GetReg(Op->Upper);

  // Each source is OpSize in size
  // So you can have up to a 128bit divide from x86-64
  switch (OpSize) {
  case IR::OpSize::i16Bit: {
    // Pack Upper:Lower into one 32-bit dividend in TMP1.
    uxth(EmitSize, TMP1, Lower);
    bfi(EmitSize, TMP1, Upper, 16, 16);
    sxth(EmitSize, TMP2, Divisor);
    sdiv(EmitSize, Quotient, TMP1, TMP2);
    msub(EmitSize, Remainder, Quotient, TMP2, TMP1);
    break;
  }
  case IR::OpSize::i32Bit: {
    // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
    mov(EmitSize, TMP1, Lower);
    bfi(EmitSize, TMP1, Upper, 32, 32);
    sxtw(TMP2, Divisor.W());
    sdiv(EmitSize, Quotient, TMP1, TMP2);
    msub(EmitSize, Remainder, Quotient, TMP2, TMP1);
    break;
  }
  case IR::OpSize::i64Bit: {
    ARMEmitter::ForwardLabel Only64Bit {};
    ARMEmitter::ForwardLabel LongDIVRet {};

    // Check if the upper bits match the top bit of the lower 64-bits
    // Sign extend the top bit of lower bits
    sbfx(EmitSize, TMP1, Lower, 63, 1);
    eor(EmitSize, TMP1, TMP1, Upper);

    // If the sign bit matches then the result is zero
    (void)cbz(EmitSize, TMP1, &Only64Bit);

    // Long divide
    {
      // Operands go to the handler in TMP1..TMP3; the quotient and remainder
      // are read back from TMP1 and TMP2 after the call.
      mov(EmitSize, TMP1, Upper);
      mov(EmitSize, TMP2, Lower);
      mov(EmitSize, TMP3, Divisor);

      ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.LDIVHandler));

      // The link register is stored and reloaded around the call.
      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);
      blr(TMP4);
      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      // Move results to the destination registers
      mov(EmitSize, Quotient, TMP1);
      mov(EmitSize, Remainder, TMP2);

      // Skip 64-bit path
      (void)b(&LongDIVRet);
    }

    (void)Bind(&Only64Bit);
    // 64-Bit only
    {
      sdiv(EmitSize, Quotient, Lower, Divisor);
      msub(EmitSize, Remainder, Quotient, Divisor, Lower);
    }

    (void)Bind(&LongDIVRet);
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unknown DIV Size: {}", OpSize); break;
  }
}

// Unsigned divide producing both quotient and remainder. When Op->Upper is
// valid the dividend is the double-width value Upper:Lower.
DEF_OP(UDiv) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Quotient = GetReg(Op->OutQuotient);
  const auto Remainder = GetReg(Op->OutRemainder);
  const auto Lower = GetReg(Op->Lower);
  const auto
Divisor = GetReg(Op->Divisor);

  // Each source is OpSize in size
  // So you can have up to a 128bit divide from x86-64
  if (Op->Upper.IsInvalid()) {
    // Single-width divide: no upper half to combine.
    const auto EmitSize = ConvertSize(IROp);
    udiv(EmitSize, Quotient, Lower, Divisor);
    msub(EmitSize, Remainder, Quotient, Divisor, Lower);
    return;
  }

  const auto EmitSize = OpSize >= IR::OpSize::i32Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
  const auto Upper = GetReg(Op->Upper);

  switch (OpSize) {
  case IR::OpSize::i16Bit: {
    // Pack Upper:Lower into one 32-bit dividend in TMP1.
    // NOTE(review): unlike the 32-bit case below, Divisor is used unmasked
    // here - presumably the caller guarantees clean upper bits; confirm.
    uxth(EmitSize, TMP1, Lower);
    bfi(EmitSize, TMP1, Upper, 16, 16);
    udiv(EmitSize, Quotient, TMP1, Divisor);
    msub(EmitSize, Remainder, Quotient, Divisor, TMP1);
    break;
  }
  case IR::OpSize::i32Bit: {
    // We need to mask divisor if we have Upper bits, since the frontend does
    // not on the hope that we can optimize to use the path above.
    mov(ARMEmitter::Size::i32Bit, TMP2, Divisor);

    // TODO: 32-bit operation should be guaranteed not to leave garbage in the upper bits.
    mov(EmitSize, TMP1, Lower);
    bfi(EmitSize, TMP1, Upper, 32, 32);
    udiv(EmitSize, Quotient, TMP1, TMP2);
    msub(EmitSize, Remainder, Quotient, TMP2, TMP1);
    break;
  }
  case IR::OpSize::i64Bit: {
    ARMEmitter::ForwardLabel Only64Bit {};
    ARMEmitter::ForwardLabel LongDIVRet {};

    // Check the upper bits for zero
    // If the upper bits are zero then we can do a 64-bit divide
    (void)cbz(EmitSize, Upper, &Only64Bit);

    // Long divide
    {
      // Operands go to the handler in TMP1..TMP3; the quotient and remainder
      // are read back from TMP1 and TMP2 after the call.
      mov(EmitSize, TMP1, Upper);
      mov(EmitSize, TMP2, Lower);
      mov(EmitSize, TMP3, Divisor);

      ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.LUDIVHandler));

      // The link register is stored and reloaded around the call.
      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);
      blr(TMP4);
      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      // Move results to the destination registers
      mov(EmitSize, Quotient, TMP1);
      mov(EmitSize, Remainder, TMP2);

      // Skip 64-bit path
      (void)b(&LongDIVRet);
    }

    (void)Bind(&Only64Bit);
    // 64-Bit only
    {
      udiv(EmitSize, Quotient, Lower, Divisor);
      msub(EmitSize, Remainder, Quotient, Divisor, Lower);
    }

    (void)Bind(&LongDIVRet);
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unknown LUDIV Size: {}", OpSize); break;
  }
}

// Emits `mvn` (bitwise NOT) of Src into the node's register.
DEF_OP(Not) {
  auto Op = IROp->C();
  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  mvn(ConvertSize48(IROp), Dst, Src);
}

// Population count of Src. Uses the scalar `cnt` when CSSC is supported,
// otherwise round-trips through a vector register.
DEF_OP(Popcount) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  if (CTX->HostFeatures.SupportsCSSC) {
    switch (OpSize) {
    case IR::OpSize::i8Bit:
      // Zero-extend first so bits above the operation size are not counted.
      uxtb(ARMEmitter::Size::i32Bit, Dst, Src);
      cnt(ARMEmitter::Size::i32Bit, Dst, Dst);
      break;
    case IR::OpSize::i16Bit:
      uxth(ARMEmitter::Size::i32Bit, Dst, Src);
      cnt(ARMEmitter::Size::i32Bit, Dst, Dst);
      break;
    case IR::OpSize::i32Bit: cnt(ARMEmitter::Size::i32Bit, Dst, Src); break;
    case IR::OpSize::i64Bit: cnt(ARMEmitter::Size::i64Bit, Dst, Src); break;
    default: LOGMAN_MSG_A_FMT("Unsupported Popcount size: {}", OpSize);
    }
  } else {
    switch (OpSize) {
    case IR::OpSize::i8Bit:
      fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
      // only use lowest byte
      cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      break;
    case IR::OpSize::i16Bit:
      fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
      cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      // only count two lowest bytes
      addp(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D(), VTMP1.D());
      break;
    case IR::OpSize::i32Bit:
      fmov(ARMEmitter::Size::i32Bit, VTMP1.S(), Src);
      cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      // fmov has zero extended, unused bytes are zero
      addv(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      break;
    case IR::OpSize::i64Bit:
      fmov(ARMEmitter::Size::i64Bit, VTMP1.D(), Src);
      cnt(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      // fmov has zero extended, unused bytes are zero
      addv(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D());
      break;
    default: LOGMAN_MSG_A_FMT("Unsupported Popcount size: {}", OpSize);
    }

    // Move element 0 of the vector temporary into the destination GPR.
    umov(Dst, VTMP1, 0);
  }
}

// Index of the least significant set bit of Src.
DEF_OP(FindLSB) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);
// We assume the source is nonzero, so we can just rbit+clz without worrying
  // about upper garbage for smaller types.
  rbit(EmitSize, TMP1, Src);
  clz(EmitSize, Dst, TMP1);
}

// Index of the most significant set bit of Src, computed as
// (bits - 1) - clz. Supports 16-bit, 32-bit and 64-bit sizes.
DEF_OP(FindMSB) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
                     "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  movz(ARMEmitter::Size::i64Bit, TMP1, IR::OpSizeAsBits(OpSize) - 1);

  if (OpSize == IR::OpSize::i16Bit) {
    // Move the 16-bit value to the top of the register before counting.
    lsl(EmitSize, Dst, Src, 16);
    clz(EmitSize, Dst, Dst);
  } else {
    clz(EmitSize, Dst, Src);
  }

  sub(ARMEmitter::Size::i64Bit, Dst, TMP1, Dst);
}

// Count of trailing zero bits of Src, via rbit + clz.
// Supports 16-bit, 32-bit and 64-bit sizes.
DEF_OP(FindTrailingZeroes) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
                     "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  rbit(EmitSize, Dst, Src);

  if (OpSize == IR::OpSize::i16Bit) {
    // This orr does two things. First, if the (masked) source is zero, it
    // reverses to zero in the top so it forces clz to return 16. Second, it
    // ensures garbage in the upper bits of the source don't affect clz, because
    // they'll rbit to garbage in the bottom below the 0x8000 and be ignored by
    // the clz. So we handle Src upper garbage without explicitly masking.
    orr(EmitSize, Dst, Dst, 0x8000);
  }

  clz(EmitSize, Dst, Dst);
}

// Count of leading zero bits of Src.
// Supports 16-bit, 32-bit and 64-bit sizes.
DEF_OP(CountLeadingZeroes) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
                     "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  if (OpSize == IR::OpSize::i16Bit) {
    // Expressing as lsl+orr+clz clears away any garbage in the upper bits
    // (alternatively could do uxth+clz+sub.. equal cost in total).
    lsl(EmitSize, Dst, Src, 16);
    orr(EmitSize, Dst, Dst, 0x8000);
    clz(EmitSize, Dst, Dst);
  } else {
    clz(EmitSize, Dst, Src);
  }
}

// Byte-reverses Src. Supports 16-bit, 32-bit and 64-bit sizes.
DEF_OP(Rev) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit,
                     "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  rev(EmitSize, Dst, Src);
  if (OpSize == IR::OpSize::i16Bit) {
    // The reversed 16-bit value ends up in the upper half; shift it down.
    lsr(EmitSize, Dst, Dst, 16);
  }
}

// Bit-reverses Src. Supports 32-bit and 64-bit sizes.
DEF_OP(Rbit) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i64Bit, "Unsupported {} size: {}", __func__, OpSize);
  const auto EmitSize = ConvertSize48(IROp);

  const auto Dst = GetReg(Node);
  const auto Src = GetReg(Op->Src);

  rbit(EmitSize, Dst, Src);
}

// Bitfield insert: the result is Op->Dest with Op->Width bits taken from Src
// inserted at bit Op->lsb. The three branches differ only in how the node's
// register aliases the two inputs.
DEF_OP(Bfi) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);

  const auto Dst = GetReg(Node);
  const auto SrcDst = GetReg(Op->Dest);
  const auto Src = GetReg(Op->Src);

  if (Dst == SrcDst) {
    // If Dst and SrcDst match then this turns in to a simple BFI instruction.
    bfi(EmitSize, Dst, Src, Op->lsb, Op->Width);
  } else if (Dst != Src) {
    // If the destination isn't the source then we can move the DstSrc and insert directly.
// // The move is 64-bit to allow register renaming, the upper bits don't // matter because of the bfi's EmitSize. mov(ARMEmitter::Size::i64Bit, Dst, SrcDst); bfi(EmitSize, Dst, Src, Op->lsb, Op->Width); } else { // Destination didn't match the dst source register. // TODO: Inefficient until FEX can have RA constraints here. mov(EmitSize, TMP1, SrcDst); bfi(EmitSize, TMP1, Src, Op->lsb, Op->Width); if (IROp->Size >= IR::OpSize::i32Bit) { mov(EmitSize, Dst, TMP1.R()); } else { ubfx(EmitSize, Dst, TMP1, 0, IR::OpSizeAsBits(IROp->Size)); } } } DEF_OP(Bfxil) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto Dst = GetReg(Node); const auto SrcDst = GetReg(Op->Dest); const auto Src = GetReg(Op->Src); if (Dst == SrcDst) { // If Dst and SrcDst match then this turns in to a single instruction. bfxil(EmitSize, Dst, Src, Op->lsb, Op->Width); } else if (Dst != Src) { // If the destination isn't the source then we can move the DstSrc and insert directly. mov(EmitSize, Dst, SrcDst); bfxil(EmitSize, Dst, Src, Op->lsb, Op->Width); } else { // Destination didn't match the dst source register. // TODO: Inefficient until FEX can have RA constraints here. 
mov(EmitSize, TMP1, SrcDst);
    bfxil(EmitSize, TMP1, Src, Op->lsb, Op->Width);
    mov(EmitSize, Dst, TMP1.R());
  }
}

// Emits an unsigned bitfield extract; whole-register extracts from bit 0
// become a plain move.
DEF_OP(Bfe) {
  auto Op = IROp->C();
  LOGMAN_THROW_A_FMT(IROp->Size <= IR::OpSize::i64Bit, "OpSize is too large for BFE: {}", IROp->Size);
  LOGMAN_THROW_A_FMT(Op->Width != 0, "Invalid BFE width of 0");

  const auto EmitSize = ConvertSize(IROp);
  const auto Result = GetReg(Node);
  const auto Input = GetReg(Op->Src);

  const bool IsPlainMove = Op->lsb == 0 && (Op->Width == 32 || Op->Width == 64);

  if (!IsPlainMove) {
    ubfx(EmitSize, Result, Input, Op->lsb, Op->Width);
  } else if (Op->Width == 32) {
    mov(ARMEmitter::Size::i32Bit, Result, Input);
  } else {
    LOGMAN_THROW_A_FMT(IROp->Size == IR::OpSize::i64Bit, "Must be 64-bit wide register");
    mov(ARMEmitter::Size::i64Bit, Result, Input);
  }
}

// Emits a signed bitfield extract.
DEF_OP(Sbfe) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);
  const auto Result = GetReg(Node);
  const auto Input = GetReg(Op->Src);

  sbfx(EmitSize, Result, Input, Op->lsb, Op->Width);
}

// Emits a mask computed from a runtime bit width: all-ones when the width
// register is zero, otherwise the inverse of (all-ones << width).
DEF_OP(MaskGenerateFromBitWidth) {
  auto Op = IROp->C();
  auto WidthReg = GetReg(Op->BitWidth);

  LoadConstant(ARMEmitter::Size::i64Bit, TMP1, -1);
  cmp(ARMEmitter::Size::i64Bit, WidthReg, 0);
  lslv(ARMEmitter::Size::i64Bit, TMP2, TMP1, WidthReg);

  // EQ (width == 0) selects TMP1 unchanged, otherwise ~TMP2.
  csinv(ARMEmitter::Size::i64Bit, GetReg(Node), TMP1, TMP2, ARMEmitter::Condition::CC_EQ);
}

// Emits a compare (integer or scalar float) followed by a conditional select.
// Inline-constant results are only supported as the {1 or all-ones, 0} pair,
// which lowers to cset/csetm.
DEF_OP(Select) {
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto EmitSize = ConvertSize(IROp);
  const bool Is64BitCompare = Op->CompareSize == IR::OpSize::i64Bit;
  const auto CompareEmitSize = Is64BitCompare ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  uint64_t CompareImm;
  auto cc = MapCC(Op->Cond);

  // Step 1: set the flags.
  if (IsGPR(Op->Cmp1)) {
    const auto Lhs = GetReg(Op->Cmp1);

    if (!IsInlineConstant(Op->Cmp2, &CompareImm)) {
      const auto Rhs = GetReg(Op->Cmp2);
      cmp(CompareEmitSize, Lhs, Rhs);
    } else {
      cmp(CompareEmitSize, Lhs, CompareImm);
    }
  } else if (IsFPR(Op->Cmp1)) {
    const auto Lhs = GetVReg(Op->Cmp1);
    const auto Rhs = GetVReg(Op->Cmp2);
    fcmp(Op->CompareSize == IR::OpSize::i64Bit ? ARMEmitter::ScalarRegSize::i64Bit : ARMEmitter::ScalarRegSize::i32Bit, Lhs, Rhs);
  } else {
    LOGMAN_MSG_A_FMT("Select: Expected GPR or FPR");
  }

  // Step 2: materialise the result from the flags.
  uint64_t TrueImm, FalseImm;
  bool TrueIsImm = IsInlineConstant(Op->TrueVal, &TrueImm);
  bool FalseIsImm = IsInlineConstant(Op->FalseVal, &FalseImm);

  uint64_t AllOnes = OpSize == IR::OpSize::i64Bit ? 0xffff'ffff'ffff'ffffull : 0xffff'ffffull;

  ARMEmitter::Register Dst = GetReg(Node);

  if (!TrueIsImm && !FalseIsImm) {
    csel(EmitSize, Dst, GetReg(Op->TrueVal), GetReg(Op->FalseVal), cc);
  } else {
    const bool Supported = TrueIsImm && FalseIsImm && (TrueImm == 1 || TrueImm == AllOnes) && FalseImm == 0;
    if (!Supported) {
      LOGMAN_MSG_A_FMT("Select: Unsupported compare inline parameters");
    }

    if (TrueImm == AllOnes) {
      csetm(EmitSize, Dst, cc);
    } else {
      cset(EmitSize, Dst, cc);
    }
  }
}

DEF_OP(NZCVSelect) {
  auto Op = IROp->C();
  const auto EmitSize = ConvertSize(IROp);

  auto cc = MapCC(Op->Cond);

  uint64_t const_true, const_false;
  bool is_const_true = IsInlineConstant(Op->TrueVal, &const_true);
  bool is_const_false = IsInlineConstant(Op->FalseVal, &const_false);

  uint64_t all_ones = IROp->Size == IR::OpSize::i64Bit ?
0xffff'ffff'ffff'ffffull : 0xffff'ffffull; ARMEmitter::Register Dst = GetReg(Node); if (is_const_true) { if (is_const_false != true || !(const_true == 1 || const_true == all_ones) || const_false != 0) { LOGMAN_MSG_A_FMT("NZCVSelect: Unsupported constant"); } if (const_true == all_ones) { csetm(EmitSize, Dst, cc); } else { cset(EmitSize, Dst, cc); } } else { csel(EmitSize, Dst, GetReg(Op->TrueVal), GetZeroableReg(Op->FalseVal), cc); } } DEF_OP(NZCVSelectV) { auto Op = IROp->C(); auto cc = MapCC(Op->Cond); const auto SubRegSize = ConvertSubRegSizePair248(IROp); fcsel(SubRegSize.Scalar, GetVReg(Node), GetVReg(Op->TrueVal), GetVReg(Op->FalseVal), cc); } DEF_OP(NZCVSelectIncrement) { auto Op = IROp->C(); csinc(ConvertSize(IROp), GetReg(Node), GetReg(Op->TrueVal), GetZeroableReg(Op->FalseVal), MapCC(Op->Cond)); } DEF_OP(VExtractToGPR) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; constexpr auto AVXRegBitSize = Core::CPUState::XMM_AVX_REG_SIZE * 8; constexpr auto SSERegBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8; const auto ElementSizeBits = IR::OpSizeAsBits(Op->Header.ElementSize); const auto Offset = ElementSizeBits * Op->Index; const auto Is256Bit = Offset >= SSERegBitSize; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetReg(Node); const auto Vector = GetVReg(Op->Vector); const auto PerformMove = [&](const ARMEmitter::VRegister reg, int index) { switch (OpSize) { case IR::OpSize::i8Bit: umov(Dst, Vector, index); break; case IR::OpSize::i16Bit: umov(Dst, Vector, index); break; case IR::OpSize::i32Bit: umov(Dst, Vector, index); break; case IR::OpSize::i64Bit: umov(Dst, Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", OpSize); break; } }; if (Offset < SSERegBitSize) { // Desired data lies within the lower 128-bit lane, so we // can treat the operation as a 128-bit operation, even // when acting on larger register sizes. 
PerformMove(Vector, Op->Index); } else { LOGMAN_THROW_A_FMT(Is256Bit, "Can't perform 256-bit extraction with op side: {}", OpSize); LOGMAN_THROW_A_FMT(Offset < AVXRegBitSize, "Trying to extract element outside bounds of register. Offset={}, Index={}", Offset, Op->Index); // We need to use the upper 128-bit lane, so lets move it down. // Inverting our dedicated predicate for 128-bit operations selects // all of the top lanes. We can then compact those into a temporary. const auto CompactPred = ARMEmitter::PReg::p0; not_(CompactPred, PRED_TMP_32B.Zeroing(), PRED_TMP_16B); compact(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), CompactPred, Vector.Z()); // Sanitize the zero-based index to work on the now-moved // upper half of the vector. const auto SanitizedIndex = [OpSize, Op] { switch (OpSize) { case IR::OpSize::i8Bit: return Op->Index - 16; case IR::OpSize::i16Bit: return Op->Index - 8; case IR::OpSize::i32Bit: return Op->Index - 4; case IR::OpSize::i64Bit: return Op->Index - 2; default: LOGMAN_MSG_A_FMT("Unhandled OpSize: {}", OpSize); return 0; } }(); // Move the value from the now-low-lane data. PerformMove(VTMP1, SanitizedIndex); } } DEF_OP(Float_ToGPR_ZS) { auto Op = IROp->C(); ARMEmitter::Register Dst = GetReg(Node); ARMEmitter::VRegister Src = GetVReg(Op->Scalar); if (Op->SrcElementSize == IR::OpSize::i64Bit) { fcvtzs(ConvertSize(IROp), Dst, Src.D()); } else { fcvtzs(ConvertSize(IROp), Dst, Src.S()); } } DEF_OP(Float_ToGPR_S) { auto Op = IROp->C(); ARMEmitter::Register Dst = GetReg(Node); ARMEmitter::VRegister Src = GetVReg(Op->Scalar); if (Op->SrcElementSize == IR::OpSize::i64Bit) { frinti(VTMP1.D(), Src.D()); fcvtzs(ConvertSize(IROp), Dst, VTMP1.D()); } else { frinti(VTMP1.S(), Src.S()); fcvtzs(ConvertSize(IROp), Dst, VTMP1.S()); } } DEF_OP(FCmp) { auto Op = IROp->C(); const auto EmitSubSize = Op->ElementSize == IR::OpSize::i64Bit ? 
ARMEmitter::ScalarRegSize::i64Bit : ARMEmitter::ScalarRegSize::i32Bit; ARMEmitter::VRegister Scalar1 = GetVReg(Op->Scalar1); ARMEmitter::VRegister Scalar2 = GetVReg(Op->Scalar2); fcmp(EmitSubSize, Scalar1, Scalar2); } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/Arm64Relocations.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 desc: relocation logic of the arm64 splatter backend $end_info$ */ #include "Interface/Context/Context.h" #include "Interface/Core/JIT/JITClass.h" #include namespace FEXCore::CPU { uint64_t GetNamedSymbolLiteral(FEXCore::Context::ContextImpl& CTX, FEXCore::CPU::RelocNamedSymbolLiteral::NamedSymbol Op) { switch (Op) { case FEXCore::CPU::RelocNamedSymbolLiteral::NamedSymbol::SYMBOL_LITERAL_EXITFUNCTION_LINKER: return CTX.Dispatcher->GetExitFunctionLinkerAddress(); default: ERROR_AND_DIE_FMT("Unknown named symbol literal: {}", static_cast(Op)); } } void Arm64JITCore::InsertNamedThunkRelocation(ARMEmitter::Register Reg, const IR::SHA256Sum& Sum) { Relocation MoveABI {}; MoveABI.NamedThunkMove.Header = {.Offset = GetCursorOffset(), .Type = FEXCore::CPU::RelocationTypes::RELOC_NAMED_THUNK_MOVE}; MoveABI.NamedThunkMove.Symbol = Sum; MoveABI.NamedThunkMove.RegisterIndex = Reg.Idx(); uint64_t Pointer = reinterpret_cast(EmitterCTX->ThunkHandler->LookupThunk(Sum)); // Pointers are required to fit within 48-bit VA space. // TODO: Force 6-byte `MaxSize`, with zext extension to 64-bit. Current code not smart enough to handle negatives. 
LoadConstant(ARMEmitter::Size::i64Bit, Reg, Pointer, FEXCore::CPU::Arm64Emitter::PadType::AUTOPAD); Relocations.emplace_back(MoveABI); } Arm64JITCore::NamedSymbolLiteralPair Arm64JITCore::InsertNamedSymbolLiteral(FEXCore::CPU::RelocNamedSymbolLiteral::NamedSymbol Op) { uint64_t Pointer = GetNamedSymbolLiteral(*CTX, Op); NamedSymbolLiteralPair Lit { .Lit = Pointer, .MoveABI = { .NamedSymbolLiteral = { .Header = { .Offset = 0, // Set by PlaceNamedSymbolLiteral .Type = FEXCore::CPU::RelocationTypes::RELOC_NAMED_SYMBOL_LITERAL, }, .Symbol = Op, }, }, }; return Lit; } void Arm64JITCore::PlaceNamedSymbolLiteral(NamedSymbolLiteralPair Lit) { switch (Lit.MoveABI.Header.Type) { case RelocationTypes::RELOC_NAMED_SYMBOL_LITERAL: case RelocationTypes::RELOC_GUEST_RIP_LITERAL: { Lit.MoveABI.Header.Offset = GetCursorOffset(); break; } default: ERROR_AND_DIE_FMT("Unknown relocation type for {}", __FUNCTION__); } BindOrRestart(&Lit.Loc); dc64(Lit.Lit); Relocations.emplace_back(Lit.MoveABI); } auto Arm64JITCore::InsertGuestRIPLiteral(uint64_t GuestRIP) -> NamedSymbolLiteralPair { return { .Lit = GuestRIP, .MoveABI = { .GuestRIP = {.Header = { .Offset = 0, // Set by PlaceNamedSymbolLiteral .Type = FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_LITERAL, }, // NOTE: Cache serialization will subtract the guest binary base address later to produce consistency results .GuestRIP = GuestRIP}, }, }; } void Arm64JITCore::InsertGuestRIPMove(ARMEmitter::Register Reg, uint64_t Constant) { Relocation MoveABI {}; MoveABI.GuestRIP.Header = {.Offset = GetCursorOffset(), .Type = FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_MOVE}; // NOTE: Cache serialization will subtract the guest binary base address later to produce consistency results MoveABI.GuestRIP.GuestRIP = Constant; MoveABI.GuestRIP.RegisterIndex = Reg.Idx(); // Pointers are required to fit within 48-bit VA space. // TODO: Force 6-byte `MaxSize`, with sign extension to 64-bit. Current code not smart enough to handle negatives. 
// 48-bit sign extension works because x86-64 guests only receive 47-bit VA space, with 48-bit being reserved for kernel. // Additional quirk, "canonical" 48-bit pointers on x86-64, sign extend the 48-bit as well (Which is why kernel pointers are negative). LoadConstant(ARMEmitter::Size::i64Bit, Reg, Constant, FEXCore::CPU::Arm64Emitter::PadType::AUTOPAD); Relocations.emplace_back(MoveABI); } fextl::vector Arm64JITCore::TakeRelocations(uint64_t GuestBaseAddress) { // Rebase relocations to library base address for (auto& Relocation : Relocations) { switch (Relocation.Header.Type) { case FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_MOVE: case FEXCore::CPU::RelocationTypes::RELOC_GUEST_RIP_LITERAL: { Relocation.GuestRIP.GuestRIP -= GuestBaseAddress; break; } default:; } } return std::move(Relocations); } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 $end_info$ */ #include "Interface/Context/Context.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Core/JIT/JITClass.h" namespace FEXCore::CPU { DEF_OP(CASPair) { auto Op = IROp->C(); LOGMAN_THROW_A_FMT(IROp->ElementSize == IR::OpSize::i32Bit || IROp->ElementSize == IR::OpSize::i64Bit, "Wrong element size"); // Size is the size of each pair element auto Dst0 = GetReg(Op->OutLo); auto Dst1 = GetReg(Op->OutHi); auto Expected0 = GetReg(Op->ExpectedLo); auto Expected1 = GetReg(Op->ExpectedHi); auto Desired0 = GetReg(Op->DesiredLo); auto Desired1 = GetReg(Op->DesiredHi); auto MemSrc = GetReg(Op->Addr); const auto EmitSize = IROp->ElementSize == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; if (CTX->HostFeatures.SupportsAtomics) { // RA has heuristics to try to pair sources, but we need to handle the cases // where they fail. We do so by moving to temporaries. 
Note we use 64-bit // moves here even for 32-bit cmpxchg, for the Firestorm register renamer. if (Desired1.Idx() != (Desired0.Idx() + 1) || Desired0.Idx() & 1) { mov(ARMEmitter::Size::i64Bit, TMP1, Desired0); mov(ARMEmitter::Size::i64Bit, TMP2, Desired1); Desired0 = TMP1; Desired1 = TMP2; } auto CaspalDst0 = Dst0; auto CaspalDst1 = Dst1; if (CaspalDst1.Idx() != (CaspalDst0.Idx() + 1) || CaspalDst0.Idx() & 1) { CaspalDst0 = TMP3; CaspalDst1 = TMP4; } // We can't clobber the source, these moves are inherently required due to // ISA limitations. But by making them 64-bit, Firestorm can rename. mov(ARMEmitter::Size::i64Bit, CaspalDst0, Expected0); mov(ARMEmitter::Size::i64Bit, CaspalDst1, Expected1); caspal(EmitSize, CaspalDst0, CaspalDst1, Desired0, Desired1, MemSrc); if (CaspalDst0 != Dst0) { mov(ARMEmitter::Size::i64Bit, Dst0, CaspalDst0); mov(ARMEmitter::Size::i64Bit, Dst1, CaspalDst1); } } else { // Save NZCV so we don't have to mark this op as clobbering NZCV (the // SupportsAtomics does not clobber atomics and this !SupportsAtomics path // is so slow it's not worth the complexity of splitting the IR op.). We // clobber NZCV inside the hot loop and we can't replace cmp/ccmp/b.ne with // something NZCV-preserving without requiring an extra instruction. mrs(TMP1, ARMEmitter::SystemRegister::NZCV); ARMEmitter::BackwardLabel LoopTop; ARMEmitter::ForwardLabel LoopNotExpected; ARMEmitter::ForwardLabel LoopExpected; (void)Bind(&LoopTop); // This instruction sequence must be synced with HandleCASPAL_Armv8. 
ldaxp(EmitSize, TMP2, TMP3, MemSrc); cmp(EmitSize, TMP2, Expected0); ccmp(EmitSize, TMP3, Expected1, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ); (void)b(ARMEmitter::Condition::CC_NE, &LoopNotExpected); stlxp(EmitSize, TMP2, Desired0, Desired1, MemSrc); (void)cbnz(EmitSize, TMP2, &LoopTop); mov(EmitSize, Dst0, Expected0); mov(EmitSize, Dst1, Expected1); (void)b(&LoopExpected); (void)Bind(&LoopNotExpected); mov(EmitSize, Dst0, TMP2.R()); mov(EmitSize, Dst1, TMP3.R()); // exclusive monitor needs to be cleared here // Might have hit the case where ldaxr was hit but stlxr wasn't clrex(); (void)Bind(&LoopExpected); // Restore msr(ARMEmitter::SystemRegister::NZCV, TMP1); } } DEF_OP(CAS) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); // DataSrc = *Src1 // if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc // This will write to memory! Careful! auto Expected = GetReg(Op->Expected); auto Desired = GetReg(Op->Desired); auto MemSrc = GetReg(Op->Addr); auto Dst = GetReg(Node); if (CTX->HostFeatures.SupportsAtomics) { if (Expected == Dst && Dst != MemSrc && Dst != Desired) { casal(SubEmitSize, Dst, Desired, MemSrc); } else { mov(EmitSize, TMP2, Expected); casal(SubEmitSize, TMP2, Desired, MemSrc); mov(EmitSize, Dst, TMP2.R()); } } else { ARMEmitter::BackwardLabel LoopTop; ARMEmitter::ForwardLabel LoopNotExpected; ARMEmitter::ForwardLabel LoopExpected; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); if (IROp->Size == IR::OpSize::i8Bit) { cmp(EmitSize, TMP2, Expected, ARMEmitter::ExtendedType::UXTB, 0); } else if (IROp->Size == IR::OpSize::i16Bit) { cmp(EmitSize, TMP2, Expected, ARMEmitter::ExtendedType::UXTH, 0); } else { cmp(EmitSize, TMP2, Expected); } (void)b(ARMEmitter::Condition::CC_NE, &LoopNotExpected); stlxr(SubEmitSize, TMP3, Desired, MemSrc); (void)cbnz(EmitSize, TMP3, &LoopTop); mov(EmitSize, Dst, Expected); (void)b(&LoopExpected); (void)Bind(&LoopNotExpected); 
mov(EmitSize, Dst, TMP2.R()); // exclusive monitor needs to be cleared here // Might have hit the case where ldaxr was hit but stlxr wasn't clrex(); (void)Bind(&LoopExpected); } } DEF_OP(AtomicSwap) { auto Op = IROp->C(); const auto OpSize = IROp->Size; LOGMAN_THROW_A_FMT( OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit || OpSize == IR::OpSize::i16Bit || OpSize == IR::OpSize::i8Bit, "Unexpecte" "d CAS " "size"); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = OpSize == IR::OpSize::i64Bit ? ARMEmitter::SubRegSize::i64Bit : OpSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : OpSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : ARMEmitter::SubRegSize::i8Bit; if (CTX->HostFeatures.SupportsAtomics) { ldswpal(SubEmitSize, Src, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); stlxr(SubEmitSize, TMP4, Src, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); ubfm(EmitSize, GetReg(Node), TMP2, 0, IR::OpSizeAsBits(OpSize) - 1); } } DEF_OP(AtomicFetchAdd) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { ldaddal(SubEmitSize, Src, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); add(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchSub) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { neg(EmitSize, TMP2, Src); ldaddal(SubEmitSize, TMP2, GetReg(Node), MemSrc); } else 
{ ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); sub(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchAnd) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { mvn(EmitSize, TMP2, Src); ldclral(SubEmitSize, TMP2, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); and_(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchCLR) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { ldclral(SubEmitSize, Src, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); bic(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchOr) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { ldsetal(SubEmitSize, Src, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); orr(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchXor) { auto Op = IROp->C(); const auto EmitSize = 
ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); auto Src = GetReg(Op->Value); if (CTX->HostFeatures.SupportsAtomics) { ldeoral(SubEmitSize, Src, GetReg(Node), MemSrc); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); eor(EmitSize, TMP3, TMP2, Src); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(AtomicFetchNeg) { auto Op = IROp->C(); const auto EmitSize = ConvertSize(IROp); const auto SubEmitSize = ConvertSubRegSize8(IROp->Size); auto MemSrc = GetReg(Op->Addr); if (CTX->HostFeatures.SupportsAtomics) { // Use a CAS loop to avoid needing to emulate unaligned LLSC atomics ldr(SubEmitSize, TMP2, MemSrc); ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); mov(EmitSize, TMP4, TMP2); neg(EmitSize, TMP3, TMP2); casal(SubEmitSize, TMP2, TMP3, MemSrc); sub(EmitSize, TMP3, TMP2, TMP4); (void)cbnz(EmitSize, TMP3, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(SubEmitSize, TMP2, MemSrc); neg(EmitSize, TMP3, TMP2); stlxr(SubEmitSize, TMP4, TMP3, MemSrc); (void)cbnz(EmitSize, TMP4, &LoopTop); mov(EmitSize, GetReg(Node), TMP2.R()); } } DEF_OP(TelemetrySetValue) { #ifndef FEX_DISABLE_TELEMETRY auto Op = IROp->C(); auto Src = GetReg(Op->Value); ldr(TMP2, STATE_PTR_IDX(CpuStateFrame, Pointers.TelemetryValueAddresses, Op->TelemetryValueIndex)); // Cortex fuses cmp+cset. 
cmp(ARMEmitter::Size::i32Bit, Src, 0); cset(ARMEmitter::Size::i32Bit, TMP1, ARMEmitter::Condition::CC_NE); if (CTX->HostFeatures.SupportsAtomics) { stsetl(ARMEmitter::SubRegSize::i64Bit, TMP1, TMP2); } else { ARMEmitter::BackwardLabel LoopTop; (void)Bind(&LoopTop); ldaxr(ARMEmitter::SubRegSize::i64Bit, TMP3, TMP2); orr(ARMEmitter::Size::i32Bit, TMP3, TMP3, Src); stlxr(ARMEmitter::SubRegSize::i64Bit, TMP3, TMP3, TMP2); (void)cbnz(ARMEmitter::Size::i32Bit, TMP3, &LoopTop); } #endif } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/BranchOps.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 $end_info$ */ #include "Interface/Context/Context.h" #include "FEXCore/IR/IR.h" #include "Interface/Core/LookupCache.h" #include "Interface/Core/JIT/JITClass.h" #include #include #include #include #include namespace FEXCore::CPU { DEF_OP(CallbackReturn) { // spill back to CTX SpillStaticRegs(TMP1); // First we must reset the stack ResetStack(); // We can now lower the ref counter again ldr(ARMEmitter::WReg::w2, STATE, offsetof(FEXCore::Core::CpuStateFrame, SignalHandlerRefCounter)); sub(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r2, 1); str(ARMEmitter::WReg::w2, STATE, offsetof(FEXCore::Core::CpuStateFrame, SignalHandlerRefCounter)); // We need to adjust an additional 8 bytes to get back to the original "misaligned" RSP state ldr(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[X86State::REG_RSP])); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r2, 8); str(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[X86State::REG_RSP])); PopCalleeSavedRegisters(); // Return to the thunk ret(); } DEF_OP(ExitFunction) { auto Op = IROp->C(); ResetStack(); if (CTX->HostFeatures.IsInstCountCI) [[unlikely]] { // Emit function end marker udf(0x420F); } uint64_t 
NewRIP; if (IsInlineConstant(Op->NewRIP, &NewRIP) || IsInlineEntrypointOffset(Op->NewRIP, &NewRIP)) { #ifdef ARCHITECTURE_arm64ec if (NewRIP < EC_CODE_BITMAP_MAX_ADDRESS && RtlIsEcCode(NewRIP)) { str(REG_CALLRET_SP, STATE_PTR(CpuStateFrame, State.callret_sp)); add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, StaticRegisters[X86State::REG_RSP], 0); InsertGuestRIPMove(EC_CALL_CHECKER_PC_REG, NewRIP); ldr(TMP2, STATE_PTR(CpuStateFrame, Pointers.ExitFunctionEC)); br(TMP2); } else { #endif // In order to support direct branches without constantly hitting the L1 cache, we emit a call to a block linker, // this will compile the branch target block when it is hit and replace the branch to the linker at the callsite // with a direct branch to the destination block. Upon invalidation of the target block the backpatch is undone. // // In addition, to avoid needing to lookup in the cache for returns and any indirect branch prediction penalty, // a shadow stack of pairs is maintained, acting as a first level cache for any // return operations. As the guest may not balance calls and returns exactly, an exception handler is expected to // be installed by the frontend, to reset the shadow stack to the middle of its valid bounds on overflow/underflow. // This shadow stack is also cleared on block invalidation operations or codebuffer switches, to ensure all pointed-to // host code is always valid. // This code will be backpatched by Arm64JITCore_ExitFunctionLink, below is an enumeration of all the possible cases. // Jump thunks are emitted in JIT.cpp after compilation of the entire multiblock. // // Call with known return block - unlinked // 00: adr TMP1, 0xC // 04: stp RetReg, TMP1, [SpReg, -0x10]! 
// 08: bl JmpThunk00 // JmpThunk00: // 00: b 0x8 // 04: br TMP1 // 08: ldr TMP1, // 0c: blr TMP1 // 10: HostCode // 18: GuestRIP // 20: CallerOffset // // Call with known return block after backpatching - linked in branch immediate range // 00: adr TMP1, 0xC // 04: stp RetReg, TMP1, [SpReg, -0x10]! // 08: bl HostCode - MODIFIED // // Call with known return block after backpatching - linked out of range // 00: adr TMP1, 0xC // 04: stp RetReg, TMP1, [SpReg, -0x10]! // 08: bl JmpThunk00 // JmpThunk00: // 00: ldr TMP1, 0x10 - MODIFIED 2nd // 04: br TMP1 // 08: ldr TMP1, // 0c: blr TMP1 // 10: HostCode - MODIFIED 1st // 18: GuestRIP // 20: CallerOffset // // Jump - unlinked // 00: b JmpThunk00 // JmpThunk00: // 00: b 0x8 // 04: br TMP1 // 08: ldr TMP1, // 0c: blr TMP1 // 10: HostCode // 18: GuestRIP // 20: CallerOffset // // Jump after backpatching - linked in branch immediate range // 00: b HostCode - MODIFIED // // Jump after backpatching - linked out of range // 00: b JmpThunk00 // JmpThunk00: // 00: ldr TMP1, 0x10 - MODIFIED 2nd // 04: br TMP1 // 08: ldr TMP1, // 0c: blr TMP1 // 10: HostCode - MODIFIED 1st // 18: GuestRIP // 20: CallerOffset ARMEmitter::ForwardLabel l_BranchHost; ARMEmitter::ForwardLabel l_CallReturn; if (Op->Hint == IR::BranchHint::Call) { if (!Op->CallReturnBlock.IsInvalid()) { auto CallReturnAddressReg = GetReg(Op->CallReturnAddress).X(); PendingCallReturnTargetLabel = &CallReturnTargets.try_emplace(Op->CallReturnBlock.ID()).first->second; (void)adr(TMP1, &l_CallReturn); stp(CallReturnAddressReg, TMP1, REG_CALLRET_SP, -0x10); } else { stp(ARMEmitter::XReg::zr, ARMEmitter::XReg::zr, REG_CALLRET_SP, -0x10); } } else if (Op->Hint == IR::BranchHint::CheckTF) { ARMEmitter::ForwardLabel TFUnset; ldrb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC])); (void)cbz(ARMEmitter::Size::i32Bit, TMP1, &TFUnset); InsertGuestRIPMove(TMP1, NewRIP); str(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.rip)); ldr(TMP2, STATE, 
offsetof(FEXCore::Core::CpuStateFrame, Pointers.DispatcherLoopTop)); blr(TMP2); (void)Bind(&TFUnset); } EmitLinkedBranch(NewRIP, Op->Hint == IR::BranchHint::Call); (void)Bind(&l_CallReturn); #ifdef ARCHITECTURE_arm64ec } #endif } else { ARMEmitter::ForwardLabel SkipFullLookup; auto RipReg = GetReg(Op->NewRIP); if (Op->Hint == IR::BranchHint::Return) { // First try to pop from the call-ret stack, otherwise follow the normal path (but ending in a ret) ldp(TMP1, TMP2, REG_CALLRET_SP, 0x10); sub(TMP1, TMP1, RipReg.X()); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &SkipFullLookup); } // L1 Cache ldp(TMP1, TMP2, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.L1Pointer)); // Calculate (tmp1 + ((ripreg & L1_ENTRIES_MASK) << 4)) for the address // L1Mask is pre-shifted. and_(ARMEmitter::Size::i64Bit, TMP2, TMP2, RipReg, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(sizeof(LookupCache::LookupCacheEntry))); add(TMP1, TMP1, TMP2); ldp(TMP2, TMP1, TMP1, 0); // Note: sub+cbnz used over cmp+br to preserve flags. 
sub(TMP1, TMP1, RipReg.X()); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &SkipFullLookup); ldr(TMP2, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.DispatcherLoopTop)); str(RipReg.X(), STATE, offsetof(FEXCore::Core::CpuStateFrame, State.rip)); (void)Bind(&SkipFullLookup); if (Op->Hint == IR::BranchHint::Call) { ARMEmitter::ForwardLabel l_CallReturn; if (!Op->CallReturnBlock.IsInvalid()) { auto CallReturnAddressReg = GetReg(Op->CallReturnAddress).X(); PendingCallReturnTargetLabel = &CallReturnTargets.try_emplace(Op->CallReturnBlock.ID()).first->second; (void)adr(TMP1, &l_CallReturn); stp(CallReturnAddressReg, TMP1, REG_CALLRET_SP, -0x10); } else { stp(ARMEmitter::XReg::zr, ARMEmitter::XReg::zr, REG_CALLRET_SP, -0x10); } blr(TMP2); (void)Bind(&l_CallReturn); } else if (Op->Hint == IR::BranchHint::Return) { ret(TMP2); } else { br(TMP2); } } } DEF_OP(Jump) { const auto Op = IROp->C(); PendingTargetLabel = JumpTarget(Op->TargetBlock); } DEF_OP(CondJump) { auto Op = IROp->C(); auto TrueTargetLabel = JumpTarget(Op->TrueBlock); if (Op->FromNZCV) { b_OrRestart(MapCC(Op->Cond), TrueTargetLabel); } else { uint64_t Const; const bool isConst = IsInlineConstant(Op->Cmp2, &Const); auto Reg = GetReg(Op->Cmp1); const auto Size = Op->CompareSize == IR::OpSize::i32Bit ? 
ARMEmitter::Size::i32Bit : ARMEmitter::Size::i64Bit;

    LOGMAN_THROW_A_FMT(IsGPR(Op->Cmp1), "CondJump: Expected GPR");
    LOGMAN_THROW_A_FMT(isConst, "CondJump: Expected constant source");

    if (Op->Cond == IR::CondClass::EQ) {
      LOGMAN_THROW_A_FMT(Const == 0, "CondJump: Expected 0 source");
      cbz_OrRestart(Size, Reg, TrueTargetLabel);
    } else if (Op->Cond == IR::CondClass::NEQ) {
      LOGMAN_THROW_A_FMT(Const == 0, "CondJump: Expected 0 source");
      cbnz_OrRestart(Size, Reg, TrueTargetLabel);
    } else if (Op->Cond == IR::CondClass::TSTZ) {
      LOGMAN_THROW_A_FMT(Const < 64, "CondJump: Expected valid bit source");
      tbz_OrRestart(Reg, Const, TrueTargetLabel);
    } else if (Op->Cond == IR::CondClass::TSTNZ) {
      LOGMAN_THROW_A_FMT(Const < 64, "CondJump: Expected valid bit source");
      tbnz_OrRestart(Reg, Const, TrueTargetLabel);
    } else {
      LOGMAN_THROW_A_FMT(false, "CondJump expected simple condition");
    }
  }

  PendingTargetLabel = JumpTarget(Op->FalseBlock);
}

// Emits a call into the syscall handler.
//
// Sequence emitted: spill dynamic + static registers, mark the thread as being inside a syscall,
// copy the IR arguments into a temporary array on the host stack, call the handler function with
// (handler object, thread state, pointer to argument array), then undo all of the above.
DEF_OP(Syscall) {
  auto Op = IROp->C();
  // Arguments are passed as follows:
  // X0: SyscallHandler
  // X1: ThreadState
  // X2: Pointer to SyscallArguments
  PushDynamicRegs(TMP1);

  // Every static GPR/FPR is spilled (all mask bits set).
  uint32_t GPRSpillMask = ~0U;
  uint32_t FPRSpillMask = ~0U;
  SpillStaticRegs(TMP1, {
                          .GPRSpillMask = GPRSpillMask,
                          .FPRSpillMask = FPRSpillMask,
                        });

  // Now that we are spilled, store in the state that we are in a syscall
  // Still without overwriting registers that matter
  // 16bit LoadConstant to be a single instruction
  // This gives the signal handler a value to check to see if we are in a syscall at all
  LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, GPRSpillMask & 0xFFFF);
  str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, InSyscallInfo));

  // Reserve a 16-byte aligned stack area holding MAX_ARGS 8-byte argument slots.
  uint64_t SPOffset = AlignUp(FEXCore::HLE::SyscallArguments::MAX_ARGS * 8, 16);
  sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, SPOffset);
  for (uint32_t i = 0; i < FEXCore::HLE::SyscallArguments::MAX_ARGS; ++i) {
    // Slots belonging to invalid (unused) arguments are left unwritten.
    if (Op->Header.Args[i].IsInvalid()) {
      continue;
    }

    str(GetReg(Op->Header.Args[i]).X(), ARMEmitter::Reg::rsp, i * 8);
  }

  ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.SyscallHandlerObj));
  ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.SyscallHandlerFunc));
  mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, STATE.R());

  // SP supporting move
  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::rsp, 0);
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    GenerateIndirectRuntimeCall(ARMEmitter::Reg::r3);
  } else {
    blr(ARMEmitter::Reg::r3);
  }

  // Release the temporary argument array.
  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, SPOffset);

  // Result is now in x0
  // Fix the stack and any values that were stepped on
  FillStaticRegs({
    .OptionalReg = ARMEmitter::Reg::r1,
    .OptionalReg2 = ARMEmitter::Reg::r2,
    .GPRFillMask = GPRSpillMask,
    .FPRFillMask = FPRSpillMask,
  });

  // Now the registers we've spilled are back in their original host registers
  // We can safely claim we are no longer in a syscall
  str(ARMEmitter::XReg::zr, STATE, offsetof(FEXCore::Core::CpuStateFrame, InSyscallInfo));

  PopDynamicRegs();

  const auto OSABI = CTX->SyscallHandler->GetOSABI();
  if (OSABI != FEXCore::HLE::SyscallOSABI::OS_GENERIC) {
    // Move result to its destination register.
    // Only if `NORETURNEDRESULT` wasn't set, otherwise we might overwrite the CPUState refilled with `FillStaticRegs`
    // NOTE(review): the condition above tests the OS ABI, not a `NORETURNEDRESULT` flag; the comment
    // may be left over from an earlier flag-based check - confirm.
    mov(ARMEmitter::Size::i64Bit, GetReg(Node), ARMEmitter::Reg::r0);
  }
}

// Emits a call to a named thunk: state is spilled (without NZCV), the thunk target is placed in x2
// through a named-thunk relocation keyed by Op->ThunkNameHash, and Op->ArgPtr is passed in x0.
DEF_OP(Thunk) {
  auto Op = IROp->C();
  // Arguments are passed as follows:
  // X0: CTX
  // X1: Args (from guest stack)
  // NOTE(review): the code below only sets x0 (to Op->ArgPtr) and x2 (call target); the X0/X1
  // description above does not match what is visible here - confirm against the thunk ABI.

  // spill to ctx before ra64 spill
  SpillStaticRegs(TMP1, {
                          .NZCV = false,
                        });

  PushDynamicRegs(TMP1);

  mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, GetReg(Op->ArgPtr));

  InsertNamedThunkRelocation(ARMEmitter::Reg::r2, Op->ThunkNameHash);
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2);
  } else {
    blr(ARMEmitter::Reg::r2);
  }

  PopDynamicRegs();

  // load from ctx after ra64 refill
  FillStaticRegs({
    .NZCV = false,
  });
}

// Emits an inline comparison of Op->CodeLength bytes at the address in Args[0] against the bytes
// captured in Op->CodeOriginal. The destination register receives 0 when everything matches and
// 1 as soon as any chunk differs.
DEF_OP(ValidateCode) {
  auto Op = IROp->C();
  auto OldCode = Op->CodeOriginal.data();
  auto Base = GetReg(Op->Header.Args[0]).X();
  int len = Op->CodeLength;
  int Offset = 0;
  ARMEmitter::ForwardLabel Fail;
  const auto Dst = GetReg(Node);

  // Emits checks of `Size` bytes for as long as at least `Size` bytes remain. `LoadData` must load
  // the live bytes into TMP1 and the expected bytes into TMP2; a non-zero difference jumps to Fail.
  auto EmitCheck = [&](size_t Size, auto&& LoadData) {
    while (len >= Size) {
      LoadData();
      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, TMP2);
      cbnz_OrRestart(ARMEmitter::Size::i64Bit, TMP1, &Fail);
      len -= Size;
      Offset += Size;
    }
  };

  // Largest chunks first; each smaller size then handles whatever remainder is left.
  EmitCheck(8, [&]() {
    ldr(TMP1, Base, Offset);
    LoadConstant(ARMEmitter::Size::i64Bit, TMP2, *(const uint64_t*)(OldCode + Offset));
  });

  EmitCheck(4, [&]() {
    ldr(TMP1.W(), Base, Offset);
    LoadConstant(ARMEmitter::Size::i32Bit, TMP2, *(const uint32_t*)(OldCode + Offset));
  });

  EmitCheck(2, [&]() {
    ldrh(TMP1.W(), Base, Offset);
    LoadConstant(ARMEmitter::Size::i32Bit, TMP2, *(const uint16_t*)(OldCode + Offset));
  });

  EmitCheck(1, [&]() {
    ldrb(TMP1.W(), Base, Offset);
    LoadConstant(ARMEmitter::Size::i32Bit, TMP2, *(const uint8_t*)(OldCode + Offset));
  });

  ARMEmitter::ForwardLabel End;
  // Match path: result 0, skip over the failure path.
  LoadConstant(ARMEmitter::Size::i32Bit, Dst, 0);
  b_OrRestart(&End);
  // Mismatch path: result 1.
  BindOrRestart(&Fail);
  LoadConstant(ARMEmitter::Size::i32Bit, Dst, 1);
  BindOrRestart(&End);
}
// Emits a call to the function stored in Pointers.ThreadRemoveCodeEntryFromJIT, passing the thread
// state in x0 and the constant `Entry` in x1.
DEF_OP(ThreadRemoveCodeEntry) {
  PushDynamicRegs(TMP4);
  SpillStaticRegs(TMP4);

  // Arguments are passed as follows:
  // X0: Thread
  // X1: RIP

  mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, STATE.R());
  // TODO: Relocations don't seem to be wired up to this...?
  LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, Entry, CPU::Arm64Emitter::PadType::AUTOPAD);

  ldr(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.ThreadRemoveCodeEntryFromJIT));
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2);
  } else {
    blr(ARMEmitter::Reg::r2);
  }

  FillStaticRegs();

  // Fix the stack and any values that were stepped on
  PopDynamicRegs();
}

// Emits a call to the CPUID handler and splits the two 64-bit results (x0, x1) into the four
// 32-bit IR outputs: EAX/EBX come from the low/high half of x0, ECX/EDX from the low/high half of x1.
DEF_OP(CPUID) {
  auto Op = IROp->C();

  isb();
  // Copy the inputs into temporaries before spilling.
  mov(ARMEmitter::Size::i64Bit, TMP2, GetReg(Op->Function));
  mov(ARMEmitter::Size::i64Bit, TMP3, GetReg(Op->Leaf));

  PushDynamicRegs(TMP4);
  SpillStaticRegs(TMP4);

  // x0 = CPUID Handler
  // x1 = CPUID Function
  // x2 = CPUID Leaf
  ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.CPUIDObj));
  ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.CPUIDFunction));

  // presumably TMP_ABIARGS means the temporaries already alias the ABI argument registers, making
  // these moves unnecessary - confirm against the TMP register definitions.
  if (!TMP_ABIARGS) {
    mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, TMP2);
    mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, TMP3);
  }

  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    GenerateIndirectRuntimeCall<__uint128_t, void*, uint64_t, uint64_t>(ARMEmitter::Reg::r3);
  } else {
    blr(ARMEmitter::Reg::r3);
  }

  // Preserve the results in temporaries across the register refill below.
  if (!TMP_ABIARGS) {
    mov(ARMEmitter::Size::i64Bit, TMP1, ARMEmitter::Reg::r0);
    mov(ARMEmitter::Size::i64Bit, TMP2, ARMEmitter::Reg::r1);
  }

  FillStaticRegs();

  PopDynamicRegs();

  // Results are in x0, x1
  // Results want to be 4xi32 scalars
  mov(ARMEmitter::Size::i32Bit, GetReg(Op->OutEAX), TMP1);
  mov(ARMEmitter::Size::i32Bit, GetReg(Op->OutECX), TMP2);
  ubfx(ARMEmitter::Size::i64Bit, GetReg(Op->OutEBX), TMP1, 32, 32);
  ubfx(ARMEmitter::Size::i64Bit, GetReg(Op->OutEDX), TMP2, 32, 32);
}

// Emits a call to the XCR handler (Pointers.XCRFunction) and splits the 64-bit result into the
// EAX (low 32 bits) and EDX (high 32 bits) IR outputs.
DEF_OP(XGetBV) {
  auto Op = IROp->C();

  PushDynamicRegs(TMP4);
  SpillStaticRegs(TMP4);

  mov(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, GetReg(Op->Function));

  // x0 = CPUID Handler
  // x1 = XCR Function
  ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.CPUIDObj));
  ldr(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.XCRFunction));
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2);
  } else {
    blr(ARMEmitter::Reg::r2);
  }

  if (!TMP_ABIARGS) {
    mov(ARMEmitter::Size::i64Bit, TMP1, ARMEmitter::Reg::r0);
  }

  FillStaticRegs();

  PopDynamicRegs();

  // Results are in x0, need to split into i32 parts
  mov(ARMEmitter::Size::i32Bit, GetReg(Op->OutEAX), TMP1);
  ubfx(ARMEmitter::Size::i64Bit, GetReg(Op->OutEDX), TMP1, 32, 32);
}

} // namespace FEXCore::CPU

================================================
FILE: FEXCore/Source/Interface/Core/JIT/ConversionOps.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: backend|arm64
$end_info$
*/

#include "Interface/Core/JIT/JITClass.h"
#include "Interface/Context/Context.h"

namespace FEXCore::CPU {
// Inserts a GPR value into element Op->DestIdx of a copy of Op->DestVector.
// 128-bit and smaller operations use a plain INS; 256-bit operations need the SVE sequence
// described inside the function.
DEF_OP(VInsGPR) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto DestIdx = Op->DestIdx;
  const auto ElementSize = Op->Header.ElementSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto SubEmitSize = ConvertSubRegSize8(IROp);
  const auto ElementsPer128Bit = IR::NumElements(IR::OpSize::i128Bit, ElementSize);

  const auto Dst = GetVReg(Node);
  const auto DestVector = GetVReg(Op->DestVector);
  const auto Src = GetReg(Op->Src);

  if (HostSupportsSVE256 && Is256Bit) {
    // Bit offset of the target element; anything at or past the SSE register width lives in the
    // upper 128-bit lane.
    const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);
    const auto Offset = ElementSizeBits * DestIdx;

    const auto SSEBitSize = Core::CPUState::XMM_SSE_REG_SIZE * 8;
    const auto InUpperLane =
Offset >= SSEBitSize;

    // This is going to be a little gross. Pls forgive me.
    // Since SVE has the whole vector length agnostic programming
    // thing going on, we can't exactly freely insert entries into
    // arbitrary locations in the vector.
    //
    // SVE *does* have INSR, however this only shifts the entire
    // vector to the left by an element size and inserts a value
    // at the beginning of the vector. Not *quite* what we need.
    // (though INSR *is* very useful for other things).
    //
    // The idea is (in the case of the upper lane), move the upper
    // lane down, insert into it and recombine with the lower lane.
    //
    // In the case of the lower lane, insert and then recombine with
    // the upper lane.

    if (InUpperLane) {
      // Move the upper lane down for the insertion.
      const auto CompactPred = ARMEmitter::PReg::p0;
      not_(CompactPred, PRED_TMP_32B.Zeroing(), PRED_TMP_16B);
      compact(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), CompactPred, DestVector.Z());
    }

    // Put data in place for destructive SPLICE below.
    mov(Dst.Z(), DestVector.Z());

    // Inserts the GPR value into the given V register.
    // Also automatically adjusts the index in the case of using the
    // moved upper lane.
    const auto Insert = [&](const ARMEmitter::VRegister& reg, int index) {
      if (InUpperLane) {
        index -= ElementsPer128Bit;
      }
      ins(SubEmitSize, reg, index, Src);
    };

    if (InUpperLane) {
      // Insert into the compacted upper lane, then splice: low 16 bytes from Dst, remainder from VTMP1.
      Insert(VTMP1, DestIdx);
      splice(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), PRED_TMP_16B, Dst.Z(), VTMP1.Z());
    } else {
      // Insert into the lower lane of Dst, then splice the remainder back in from the original vector.
      Insert(Dst, DestIdx);
      splice(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), PRED_TMP_16B, Dst.Z(), DestVector.Z());
    }
  } else {
    // No need to move if Dst and DestVector alias one another.
    if (Dst != DestVector) {
      mov(Dst.Q(), DestVector.Q());
    }
    ins(SubEmitSize, Dst, DestIdx, Src);
  }
}

// Moves a GPR value into the low element of a vector register. 8-bit and 16-bit sources are
// zero-extended through TMP1 first; 32-bit and 64-bit sources are moved directly with FMOV.
DEF_OP(VCastFromGPR) {
  auto Op = IROp->C();

  auto Dst = GetVReg(Node);
  auto Src = GetReg(Op->Src);

  switch (Op->Header.ElementSize) {
  case IR::OpSize::i8Bit:
    uxtb(ARMEmitter::Size::i32Bit, TMP1, Src);
    fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
    break;
  case IR::OpSize::i16Bit:
    uxth(ARMEmitter::Size::i32Bit, TMP1, Src);
    fmov(ARMEmitter::Size::i32Bit, Dst.S(), TMP1);
    break;
  case IR::OpSize::i32Bit: fmov(ARMEmitter::Size::i32Bit, Dst.S(), Src); break;
  case IR::OpSize::i64Bit: fmov(ARMEmitter::Size::i64Bit, Dst.D(), Src); break;
  default: LOGMAN_MSG_A_FMT("Unknown castGPR element size: {}", Op->Header.ElementSize);
  }
}

// Builds a 128-bit vector from two 64-bit GPRs: Op->Lower is written first, then Op->Upper with
// the extra `true` argument (presumably selecting the upper 64-bit half - confirm against the
// emitter's fmov overload).
DEF_OP(VLoadTwoGPRs) {
  const auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto SrcLower = GetReg(Op->Lower);
  const auto SrcUpper = GetReg(Op->Upper);

  fmov(ARMEmitter::Size::i64Bit, Dst.D(), SrcLower);
  fmov(ARMEmitter::Size::i64Bit, Dst.D(), SrcUpper, true);
}

// Broadcasts a GPR value into every element of the destination vector (SVE DUP for 256-bit
// operations, AdvSIMD DUP otherwise).
DEF_OP(VDupFromGPR) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Src = GetReg(Op->Src);

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto SubEmitSize = ConvertSubRegSize8(IROp);

  if (HostSupportsSVE256 && Is256Bit) {
    dup(SubEmitSize, Dst.Z(), Src);
  } else {
    dup(SubEmitSize, Dst.Q(), Src);
  }
}

// Converts a signed integer held in a GPR to a scalar float.
// `Conv` packs (destination float size in bytes << 8) | source integer size in bytes, so each
// case label reads as 0x<dst><src>.
DEF_OP(Float_FromGPR_S) {
  const auto Op = IROp->C();

  const uint16_t ElementSize = IR::OpSizeToSize(Op->Header.ElementSize);
  const uint16_t Conv = (ElementSize << 8) | IR::OpSizeToSize(Op->SrcElementSize);

  auto Dst = GetVReg(Node);
  auto Src = GetReg(Op->Src);

  switch (Conv) {
  case 0x0204: { // Half <- int32_t
    scvtf(ARMEmitter::Size::i32Bit, Dst.H(), Src);
    break;
  }
  case 0x0208: { // Half <- int64_t
    scvtf(ARMEmitter::Size::i64Bit, Dst.H(), Src);
    break;
  }
  case 0x0404: { // Float <- int32_t
scvtf(ARMEmitter::Size::i32Bit, Dst.S(), Src);
    break;
  }
  case 0x0408: { // Float <- int64_t
    scvtf(ARMEmitter::Size::i64Bit, Dst.S(), Src);
    break;
  }
  case 0x0804: { // Double <- int32_t
    scvtf(ARMEmitter::Size::i32Bit, Dst.D(), Src);
    break;
  }
  case 0x0808: { // Double <- int64_t
    scvtf(ARMEmitter::Size::i64Bit, Dst.D(), Src);
    break;
  }
  default:
    LOGMAN_MSG_A_FMT("Unhandled conversion mask: Mask=0x{:04x}, ElementSize={}, SrcElementSize={}", Conv, ElementSize, Op->SrcElementSize);
    break;
  }
}

// Scalar float-to-float precision conversion.
// `Conv` packs (destination size in bytes << 8) | source size in bytes.
DEF_OP(Float_FToF) {
  auto Op = IROp->C();

  const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);

  auto Dst = GetVReg(Node);
  auto Src = GetVReg(Op->Scalar);

  switch (Conv) {
  case 0x0204: { // Half <- Float
    fcvt(Dst.H(), Src.S());
    break;
  }
  case 0x0208: { // Half <- Double
    fcvt(Dst.H(), Src.D());
    break;
  }
  case 0x0402: { // Float <- Half
    fcvt(Dst.S(), Src.H());
    break;
  }
  case 0x0802: { // Double <- Half
    fcvt(Dst.D(), Src.H());
    break;
  }
  case 0x0804: { // Double <- Float
    fcvt(Dst.D(), Src.S());
    break;
  }
  case 0x0408: { // Float <- Double
    fcvt(Dst.S(), Src.D());
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unknown FCVT sizes: 0x{:x}", Conv);
  }
}

// Signed-integer to float conversion with equal element sizes.
// Three forms: predicated SVE for 256-bit, the scalar encoding when the op size equals the
// element size, and the AdvSIMD vector encoding (64-bit or 128-bit register) otherwise.
DEF_OP(Vector_SToF) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubEmitSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B;
    scvtf(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
  } else {
    if (OpSize == ElementSize) {
      // Scalar case: a single element.
      if (ElementSize == IR::OpSize::i64Bit) {
        scvtf(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
      } else if (ElementSize == IR::OpSize::i32Bit) {
        scvtf(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
      } else {
        scvtf(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
      }
    } else {
      if (OpSize == IR::OpSize::i64Bit) {
        scvtf(SubEmitSize, Dst.D(), Vector.D());
      } else {
        scvtf(SubEmitSize, Dst.Q(), Vector.Q());
      }
    }
  }
}

// Float to signed-integer conversion using FCVTZS (round toward zero), with equal element sizes.
// Same three forms as Vector_SToF: SVE, scalar, AdvSIMD vector.
DEF_OP(Vector_FToZS) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubEmitSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B;
    fcvtzs(Dst.Z(), SubEmitSize, Mask.Merging(), Vector.Z(), SubEmitSize);
  } else {
    if (OpSize == ElementSize) {
      // Scalar case: a single element.
      if (ElementSize == IR::OpSize::i64Bit) {
        fcvtzs(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Vector.D());
      } else if (ElementSize == IR::OpSize::i32Bit) {
        fcvtzs(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Vector.S());
      } else {
        fcvtzs(ARMEmitter::ScalarRegSize::i16Bit, Dst.H(), Vector.H());
      }
    } else {
      if (OpSize == IR::OpSize::i64Bit) {
        fcvtzs(SubEmitSize, Dst.D(), Vector.D());
      } else {
        fcvtzs(SubEmitSize, Dst.Q(), Vector.Q());
      }
    }
  }
}

// Float to signed-integer conversion in two steps: FRINTI rounds to an integral value in the
// destination, then FCVTZS converts that integral value to an integer in place.
DEF_OP(Vector_FToS) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto SubEmitSize = ConvertSubRegSize248(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B;
    frinti(SubEmitSize, Dst.Z(), Mask.Merging(), Vector.Z());
    fcvtzs(Dst.Z(), SubEmitSize, Mask.Merging(), Dst.Z(), SubEmitSize);
  } else {
    // NOTE(review): these two declarations shadow the identical outer Dst/Vector and look redundant.
    const auto Dst = GetVReg(Node);
    const auto Vector = GetVReg(Op->Vector);
    if (OpSize == IR::OpSize::i64Bit) {
frinti(SubEmitSize, Dst.D(), Vector.D());
      fcvtzs(SubEmitSize, Dst.D(), Dst.D());
    } else {
      frinti(SubEmitSize, Dst.Q(), Vector.Q());
      fcvtzs(SubEmitSize, Dst.Q(), Dst.Q());
    }
  }
}

// Vector float precision conversion (widening or narrowing by one step).
// `Conv` packs (destination element size in bytes << 8) | source element size in bytes.
DEF_OP(Vector_FToF) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubEmitSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    // Curiously, FCVTLT and FCVTNT have no bottom variants,
    // and also interesting is that FCVTLT will iterate the
    // source vector by accessing each odd element and storing
    // them consecutively in the destination.
    //
    // FCVTNT is somewhat like the opposite. It will read each
    // consecutive element, but store each result into every odd
    // element in the destination vector.
    //
    // We need to undo the behavior of FCVTNT with UZP2. In the case
    // of FCVTLT, we instead need to set the vector up with ZIP1, so
    // that the elements will be processed correctly.

    const auto Mask = PRED_TMP_32B.Merging();

    switch (Conv) {
    case 0x0402: { // Float <- Half
      zip1(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), Vector.Z(), Vector.Z());
      fcvtlt(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Dst.Z());
      break;
    }
    case 0x0804: { // Double <- Float
      zip1(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Vector.Z(), Vector.Z());
      fcvtlt(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Dst.Z());
      break;
    }
    case 0x0204: { // Half <- Float
      fcvtnt(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), Mask, Vector.Z());
      uzp2(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), Dst.Z(), Dst.Z());
      break;
    }
    case 0x0408: { // Float <- Double
      fcvtnt(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Vector.Z());
      uzp2(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Dst.Z(), Dst.Z());
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unknown Vector_FToF Type : 0x{:04x}", Conv); break;
    }
  } else {
    switch (Conv) {
    // Widening conversions share FCVTL, narrowing conversions share FCVTN.
    case 0x0402:   // Float <- Half
    case 0x0804: { // Double <- Float
      fcvtl(SubEmitSize, Dst.D(), Vector.D());
      break;
    }
    case 0x0204:   // Half <- Float
    case 0x0408: { // Float <- Double
      fcvtn(SubEmitSize, Dst.D(), Vector.D());
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unknown Vector_FToF Type : 0x{:04x}", Conv); break;
    }
  }
}

// Emits FCVTL2 (the "2" variant of the widening float conversion) on Op->Vector.
DEF_OP(VFCVTL2) {
  const auto Op = IROp->C();
  const auto SubEmitSize = ConvertSubRegSize248(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  fcvtl2(SubEmitSize, Dst.D(), Vector.D());
}

// Emits FCVTN2 with VectorLower as the register that is written and VectorUpper as the source.
// When Dst does not already alias VectorLower the operation is staged through VTMP1 and the
// result copied to Dst afterwards.
DEF_OP(VFCVTN2) {
  const auto Op = IROp->C();
  const auto SubEmitSize = ConvertSubRegSize248(IROp);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  const auto VectorUpper = GetVReg(Op->VectorUpper);

  auto Lower = VectorLower;
  if (Dst != VectorLower) {
    mov(VTMP1.Q(), VectorLower.Q());
    Lower = VTMP1;
  }

  fcvtn2(SubEmitSize, Lower.Q(), VectorUpper.Q());

  if (Dst != VectorLower) {
    mov(Dst.Q(), Lower.Q());
  }
}

// Rounds each float element to an integral float value (FRINT*) using the mode in Op->Round.
// The emitted instructions produce floating-point results; no integer conversion is done here.
DEF_OP(Vector_FToI) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubEmitSize =
ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    switch (Op->Round) {
    case IR::RoundMode::Nearest: frintn(SubEmitSize, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::NegInfinity: frintm(SubEmitSize, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::PosInfinity: frintp(SubEmitSize, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::TowardsZero: frintz(SubEmitSize, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::Host: frinti(SubEmitSize, Dst.Z(), Mask, Vector.Z()); break;
    }
  } else {
    const auto IsScalar = ElementSize == OpSize;

    if (IsScalar) {
      // Since we have multiple overloads of the same name (e.g.
      // frinti having AdvSIMD, AdvSIMD scalar, and an SVE version),
      // we can't just use a lambda without some seriously ugly casting.
      // This is fairly self-contained otherwise.
      // The macro selects the H/S/D scalar register view matching ElementSize.
#define ROUNDING_FN(name)                         \
  if (ElementSize == IR::OpSize::i16Bit) {        \
    name(Dst.H(), Vector.H());                    \
  } else if (ElementSize == IR::OpSize::i32Bit) { \
    name(Dst.S(), Vector.S());                    \
  } else if (ElementSize == IR::OpSize::i64Bit) { \
    name(Dst.D(), Vector.D());                    \
  } else {                                        \
    FEX_UNREACHABLE;                              \
  }

      switch (Op->Round) {
      case IR::RoundMode::Nearest: ROUNDING_FN(frintn); break;
      case IR::RoundMode::NegInfinity: ROUNDING_FN(frintm); break;
      case IR::RoundMode::PosInfinity: ROUNDING_FN(frintp); break;
      case IR::RoundMode::TowardsZero: ROUNDING_FN(frintz); break;
      case IR::RoundMode::Host: ROUNDING_FN(frinti); break;
      }

#undef ROUNDING_FN
    } else {
      switch (Op->Round) {
      case IR::RoundMode::Nearest: frintn(SubEmitSize, Dst.Q(), Vector.Q()); break;
      case IR::RoundMode::NegInfinity: frintm(SubEmitSize, Dst.Q(), Vector.Q()); break;
      case IR::RoundMode::PosInfinity: frintp(SubEmitSize, Dst.Q(), Vector.Q()); break;
      case IR::RoundMode::TowardsZero: frintz(SubEmitSize, Dst.Q(), Vector.Q()); break;
      case IR::RoundMode::Host: frinti(SubEmitSize, Dst.Q(), Vector.Q()); break;
      }
    }
  }
}

// Rounds float elements with the FRINT32*/FRINT64* instructions.
// Op->IntSize selects the 32 or 64 variant and Op->HostRound selects the X form (otherwise the
// Z form is used). Requires the FRINTTS host feature and is not wired up for 256-bit operations.
DEF_OP(Vector_FToISized) {
  const auto Op = IROp->C();

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubEmitSize = ConvertSubRegSize248(IROp);
  LOGMAN_THROW_A_FMT(IROp->Size != IR::OpSize::i256Bit, "256-bit not wired up, though we could change that");
  LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFRINTTS, "Need FRINTTS for Vector_FToISized");

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (ElementSize == IROp->Size) {
    // See above
    // Scalar case: the macro selects the S or D register view matching ElementSize.
#define ROUNDING_FN(name)                         \
  if (ElementSize == IR::OpSize::i32Bit) {        \
    name(Dst.S(), Vector.S());                    \
  } else if (ElementSize == IR::OpSize::i64Bit) { \
    name(Dst.D(), Vector.D());                    \
  } else {                                        \
    FEX_UNREACHABLE;                              \
  }

    if (Op->IntSize == IR::OpSize::i64Bit) {
      if (Op->HostRound) {
        ROUNDING_FN(frint64x);
      } else {
        ROUNDING_FN(frint64z);
      }
    } else {
      if (Op->HostRound) {
        ROUNDING_FN(frint32x);
      } else {
        ROUNDING_FN(frint32z);
      }
    }

#undef ROUNDING_FN
  } else {
    if
(Op->IntSize == IR::OpSize::i64Bit) {
      if (Op->HostRound) {
        frint64x(SubEmitSize, Dst.Q(), Vector.Q());
      } else {
        frint64z(SubEmitSize, Dst.Q(), Vector.Q());
      }
    } else {
      if (Op->HostRound) {
        frint32x(SubEmitSize, Dst.Q(), Vector.Q());
      } else {
        frint32z(SubEmitSize, Dst.Q(), Vector.Q());
      }
    }
  }
}

// Converts f64 elements to i32 elements using the rounding mode in Op->Round.
// The SVE path converts f64 -> i32 directly; the AdvSIMD fallback goes f64 -> f32 -> i32.
DEF_OP(Vector_F64ToI32) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Round = Op->Round;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE128 || HostSupportsSVE256) {
    const auto Mask = Is256Bit ? PRED_TMP_32B.Merging() : PRED_TMP_16B.Merging();

    // First step is to round the f64 values to integrals (frint*)
    // Then convert to integers using fcvtzs.
    // For TowardsZero no rounding step is emitted: fcvtzs reads the source vector directly.
    auto CVTReg = Dst.Z();
    switch (Round) {
    case IR::RoundMode::Nearest: frintn(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::NegInfinity: frintm(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::PosInfinity: frintp(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break;
    case IR::RoundMode::TowardsZero: CVTReg = Vector.Z(); break;
    case IR::RoundMode::Host: frinti(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break;
    }
    fcvtzs(Dst.Z(), ARMEmitter::SubRegSize::i32Bit, Mask, CVTReg, ARMEmitter::SubRegSize::i64Bit);
    ///< Fixup format of register that fcvtzs returns.
    uzp1(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Dst.Z(), Dst.Z());

    if (Op->EnsureZeroUpperHalf) {
      ///< Match CVTPD2DQ/CVTTPD2DQ behaviour if necessary by zeroing the upper bits here.
      if (Is256Bit) {
        mov(Dst.Q(), Dst.Q());
      } else {
        mov(Dst.D(), Dst.D());
      }
    }
  } else {
    // This has a known precision issue that isn't easily resolvable without throwing away performance.
// Doing the conversion in multi-stage steps has an issue that you can lose precision in the f32->i32 step if your source was f64.
    // To get around this with ASIMD FEX needs to use fcvtzs (Scalar, Integer, to GPR) for each F64 to be directly converted to i32.
    // This is a very costly transform that the SVE path doesn't need to do since it supports f64->i32 directly.
    // If this precision issue is necessary then we can add an option for it in the future.

    ///< Round float to integral depending on rounding mode.
    switch (Round) {
    case IR::RoundMode::Nearest: frintn(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), Vector.Q()); break;
    case IR::RoundMode::NegInfinity: frintm(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), Vector.Q()); break;
    case IR::RoundMode::PosInfinity: frintp(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), Vector.Q()); break;
    case IR::RoundMode::TowardsZero: frintz(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), Vector.Q()); break;
    case IR::RoundMode::Host: frinti(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), Vector.Q()); break;
    }

    // Now narrow from f64 to f32.
    fcvtn(ARMEmitter::SubRegSize::i32Bit, Dst.Q(), Dst.Q());

    ///< Convert the two F32 integrals to real integers.
    fcvtzs(ARMEmitter::SubRegSize::i32Bit, Dst.D(), Dst.D());
  }
}

} // namespace FEXCore::CPU

================================================
FILE: FEXCore/Source/Interface/Core/JIT/DebugData.h
================================================
// SPDX-License-Identifier: MIT
#pragma once

#include
#include
#include

namespace FEXCore::CPU {
// Forward declaration; only used through a pointer-to-vector in DebugData below.
union Relocation;
} // namespace FEXCore::CPU

namespace FEXCore::Core {
// Describes one sub-block of generated host code by offset and size.
// presumably the offset is relative to the start of the owning block's host code - confirm.
struct DebugDataSubblock {
  uint32_t HostCodeOffset;
  uint32_t HostCodeSize;
};

// Pairs a guest opcode's entry offset with the corresponding offset in the generated host code.
struct DebugDataGuestOpcode {
  uint64_t GuestEntryOffset;
  ptrdiff_t HostEntryOffset;
};

/**
 * @brief Contains debug data for a block of code for later debugger analysis
 *
 * Needs to remain around for as long as the code could be executed at least
 */
struct DebugData : public FEXCore::Allocator::FEXAllocOperators {
  uint64_t HostCodeSize; ///< The size of the code generated in the host JIT
  fextl::vector Subblocks;
  fextl::vector GuestOpcodes;
  fextl::vector* Relocations;
};
} // namespace FEXCore::Core

================================================
FILE: FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: backend|arm64
$end_info$
*/

#include "Interface/Core/JIT/JITClass.h"

namespace FEXCore::CPU {

// Emits AESIMC (AES inverse mix columns) from Op->Vector into the destination.
DEF_OP(VAESImc) {
  auto Op = IROp->C();
  aesimc(GetVReg(Node), GetVReg(Op->Vector));
}

// One AES encryption round built from Arm primitives: AESE against a zero key, AESMC, then the
// round key is EORed in at the end.
DEF_OP(VAESEnc) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Key = GetVReg(Op->Key);
  const auto State = GetVReg(Op->State);
  const auto ZeroReg = GetVReg(Op->ZeroReg);

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");

  if (Dst == State && Dst != Key) {
    // Optimal case in which Dst already contains the starting state.
    // This matches the common case of XMM AES.
aese(Dst.Q(), ZeroReg.Q());
    aesmc(Dst.Q(), Dst.Q());
    eor(Dst.Q(), Dst.Q(), Key.Q());
  } else {
    // Work in VTMP1 so that neither State nor Key is clobbered before it is consumed.
    mov(VTMP1.Q(), State.Q());
    aese(VTMP1, ZeroReg.Q());
    aesmc(VTMP1, VTMP1);
    eor(Dst.Q(), VTMP1.Q(), Key.Q());
  }
}

// Final AES encryption round: same as VAESEnc but without the AESMC (mix columns) step.
DEF_OP(VAESEncLast) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Key = GetVReg(Op->Key);
  const auto State = GetVReg(Op->State);
  const auto ZeroReg = GetVReg(Op->ZeroReg);

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");

  if (Dst == State && Dst != Key) {
    // Optimal case in which Dst already contains the starting state.
    // This matches the common case of XMM AES.
    aese(Dst.Q(), ZeroReg.Q());
    eor(Dst.Q(), Dst.Q(), Key.Q());
  } else {
    // Work in VTMP1 so that neither State nor Key is clobbered before it is consumed.
    mov(VTMP1.Q(), State.Q());
    aese(VTMP1, ZeroReg.Q());
    eor(Dst.Q(), VTMP1.Q(), Key.Q());
  }
}

// One AES decryption round: AESD against a zero key, AESIMC, then the round key is EORed in.
DEF_OP(VAESDec) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Key = GetVReg(Op->Key);
  const auto State = GetVReg(Op->State);
  const auto ZeroReg = GetVReg(Op->ZeroReg);

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");

  if (Dst == State && Dst != Key) {
    // Optimal case in which Dst already contains the starting state.
    // This matches the common case of XMM AES.
    aesd(Dst.Q(), ZeroReg.Q());
    aesimc(Dst.Q(), Dst.Q());
    eor(Dst.Q(), Dst.Q(), Key.Q());
  } else {
    // Work in VTMP1 so that neither State nor Key is clobbered before it is consumed.
    mov(VTMP1.Q(), State.Q());
    aesd(VTMP1, ZeroReg.Q());
    aesimc(VTMP1, VTMP1);
    eor(Dst.Q(), VTMP1.Q(), Key.Q());
  }
}

// Final AES decryption round: same as VAESDec but without the AESIMC step.
DEF_OP(VAESDecLast) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Key = GetVReg(Op->Key);
  const auto State = GetVReg(Op->State);
  const auto ZeroReg = GetVReg(Op->ZeroReg);

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");

  if (Dst == State && Dst != Key) {
    // Optimal case in which Dst already contains the starting state.
    // This matches the common case of XMM AES.
    aesd(Dst.Q(), ZeroReg.Q());
    eor(Dst.Q(), Dst.Q(), Key.Q());
  } else {
    // Work in VTMP1 so that neither State nor Key is clobbered before it is consumed.
    mov(VTMP1.Q(), State.Q());
    aesd(VTMP1, ZeroReg.Q());
    eor(Dst.Q(), VTMP1.Q(), Key.Q());
  }
}

// AES key-generation assist: AESE against a zero key, a TBL permutation using the swizzle table
// operand, and - when Op->RCON is non-zero - an EOR with RCON placed in the upper 32 bits of
// each 64-bit element.
DEF_OP(VAESKeyGenAssist) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src = GetVReg(Op->Src);
  const auto Swizzle = GetVReg(Op->KeyGenTBLSwizzle);
  auto ZeroReg = GetVReg(Op->ZeroReg);

  if (Dst == ZeroReg) {
    // Seriously? ZeroReg ended up being the destination register?
    // Just copy it over in this case...
    mov(VTMP1.Q(), ZeroReg.Q());
    ZeroReg = VTMP1;
  }

  if (Dst != Src) {
    mov(Dst.Q(), Src.Q());
  }

  // Do a "regular" AESE step
  aese(Dst, ZeroReg.Q());

  // Now EOR in the RCON
  if (Op->RCON) {
    tbl(Dst.Q(), Dst.Q(), Swizzle.Q());

    LoadConstant(ARMEmitter::Size::i64Bit, TMP1, static_cast(Op->RCON) << 32);
    dup(ARMEmitter::SubRegSize::i64Bit, VTMP2.Q(), TMP1);
    eor(Dst.Q(), Dst.Q(), VTMP2.Q());
  } else {
    tbl(Dst.Q(), Dst.Q(), Swizzle.Q());
  }
}

// Emits the CRC32C instruction matching Op->SrcSize (byte, halfword, word or doubleword).
DEF_OP(CRC32) {
  auto Op = IROp->C();

  const auto Dst = GetReg(Node);
  const auto Src1 = GetReg(Op->Src1);
  const auto Src2 = GetReg(Op->Src2);

  switch (Op->SrcSize) {
  case IR::OpSize::i8Bit: crc32cb(Dst.W(), Src1.W(), Src2.W()); break;
  case IR::OpSize::i16Bit: crc32ch(Dst.W(), Src1.W(), Src2.W()); break;
  case IR::OpSize::i32Bit: crc32cw(Dst.W(), Src1.W(), Src2.W()); break;
  case IR::OpSize::i64Bit: crc32cx(Dst.X(), Src1.X(), Src2.X()); break;
  default: LOGMAN_MSG_A_FMT("Unknown CRC32 size: {}", Op->SrcSize);
  }
}

// Emits SHA1H on the 32-bit scalar view of the source.
DEF_OP(VSha1H) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src = GetVReg(Op->Src);

  sha1h(Dst.S(), Src.S());
}

// Emits SHA1C. The instruction updates its first operand in place, so Src1 must end up in the
// register that is written:
//  - Dst already is Src1: operate directly;
//  - Dst aliases neither Src2 nor Src3: copy Src1 into Dst, then operate;
//  - otherwise: stage through VTMP1 so the aliased source is not overwritten early.
DEF_OP(VSha1C) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);
  const auto Src3 = GetVReg(Op->Src3);

  if (Dst == Src1) {
    sha1c(Dst, Src2.S(), Src3);
  } else if (Dst != Src2 && Dst != Src3) {
    mov(Dst.Q(), Src1.Q());
    sha1c(Dst, Src2.S(), Src3);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha1c(VTMP1, Src2.S(), Src3);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA1M with the same three-way aliasing handling as VSha1C.
DEF_OP(VSha1M) {
  auto Op
= IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);
  const auto Src3 = GetVReg(Op->Src3);

  if (Dst == Src1) {
    sha1m(Dst, Src2.S(), Src3);
  } else if (Dst != Src2 && Dst != Src3) {
    mov(Dst.Q(), Src1.Q());
    sha1m(Dst, Src2.S(), Src3);
  } else {
    // Dst aliases Src2 or Src3: stage through VTMP1 so that source is not overwritten early.
    mov(VTMP1.Q(), Src1.Q());
    sha1m(VTMP1, Src2.S(), Src3);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA1P. The first operand is updated in place, so Src1 is moved into the written register
// first; VTMP1 is used when Dst aliases Src2 or Src3.
DEF_OP(VSha1P) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);
  const auto Src3 = GetVReg(Op->Src3);

  if (Dst == Src1) {
    sha1p(Dst, Src2.S(), Src3);
  } else if (Dst != Src2 && Dst != Src3) {
    mov(Dst.Q(), Src1.Q());
    sha1p(Dst, Src2.S(), Src3);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha1p(VTMP1, Src2.S(), Src3);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA1SU1 (two-operand, first operand updated in place); VTMP1 is used when Dst aliases Src2.
DEF_OP(VSha1SU1) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);

  if (Dst == Src1) {
    sha1su1(Dst, Src2);
  } else if (Dst != Src2) {
    mov(Dst.Q(), Src1.Q());
    sha1su1(Dst, Src2);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha1su1(VTMP1, Src2);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA256H with the same three-way aliasing handling as the SHA1 three-operand ops above.
DEF_OP(VSha256H) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);
  const auto Src3 = GetVReg(Op->Src3);

  if (Dst == Src1) {
    sha256h(Dst, Src2, Src3);
  } else if (Dst != Src2 && Dst != Src3) {
    mov(Dst.Q(), Src1.Q());
    sha256h(Dst, Src2, Src3);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha256h(VTMP1, Src2, Src3);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA256H2 with the same three-way aliasing handling as VSha256H.
DEF_OP(VSha256H2) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);
  const auto Src3 = GetVReg(Op->Src3);

  if (Dst == Src1) {
    sha256h2(Dst, Src2, Src3);
  } else if (Dst != Src2 && Dst != Src3) {
    mov(Dst.Q(), Src1.Q());
    sha256h2(Dst, Src2, Src3);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha256h2(VTMP1, Src2, Src3);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA256SU0 (first operand updated in place). Unless Dst already is Src1, the operation is
// always staged through VTMP1.
DEF_OP(VSha256U0) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);

  if (Dst == Src1) {
    sha256su0(Dst, Src2);
  } else {
    mov(VTMP1.Q(), Src1.Q());
    sha256su0(VTMP1, Src2);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Emits SHA256SU1 with a zeroed first (in-place) operand. The zero is built in Dst when Dst
// aliases neither source, otherwise in VTMP1 and the result copied to Dst.
DEF_OP(VSha256U1) {
  auto Op = IROp->C();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);

  if (Dst != Src1 && Dst != Src2) {
    movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0);
    sha256su1(Dst, Src1, Src2);
  } else {
    movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0);
    sha256su1(VTMP1, Src1, Src2);
    mov(Dst.Q(), VTMP1.Q());
  }
}

// Carry-less 64x64 -> 128-bit multiply. Op->Selector picks which 64-bit half of each source is used:
// bit 0 selects the upper half of Src1, bit 4 selects the upper half of Src2.
DEF_OP(PCLMUL) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1);
  const auto Src2 = GetVReg(Op->Src2);

  LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Currently only supports 128-bit operations.");

  switch (Op->Selector) {
  // Src1 low x Src2 low.
  case 0b00000000: pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), Src1.D(), Src2.D()); break;
  // Src1 high x Src2 low: element 1 of Src1 is duplicated into VTMP1 first.
  case 0b00000001:
    dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), Src1.Q(), 1);
    pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), VTMP1.D(), Src2.D());
    break;
  // Src1 low x Src2 high: element 1 of Src2 is duplicated into VTMP1 first.
  case 0b00010000:
    dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), Src2.Q(), 1);
    pmull(ARMEmitter::SubRegSize::i128Bit, Dst.D(), VTMP1.D(), Src1.D());
    break;
  // Src1 high x Src2 high via PMULL2.
  case 0b00010001: pmull2(ARMEmitter::SubRegSize::i128Bit, Dst.Q(), Src1.Q(), Src2.Q()); break;
  default: LOGMAN_MSG_A_FMT("Unknown PCLMUL selector: {}", Op->Selector); break;
  }
}

} // namespace FEXCore::CPU

================================================
FILE: FEXCore/Source/Interface/Core/JIT/JIT.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
glossary: Splatter ~ a code generator backend that concatenates configurable macros instead of doing isel
glossary: IR ~ Intermediate Representation, our high-level opcode representation, loosely modeling arm64
glossary: SSA ~ Single Static Assignment, a form of
representing IR in memory
glossary: Basic Block ~ A block of instructions with no control flow, terminated by control flow
glossary: Fragment ~ A Collection of basic blocks, possibly an entire guest function or a subset of it
tags: backend|arm64
desc: Main glue logic of the arm64 splatter backend
$end_info$
*/

#include "Interface/Context/Context.h"
#include "Interface/Core/LookupCache.h"
#include "Interface/Core/Dispatcher/Dispatcher.h"
#include "Interface/Core/Interpreter/InterpreterOps.h"
#include "Interface/Core/JIT/DebugData.h"
#include "Interface/Core/JIT/JITClass.h"
#include "Interface/IR/Passes/RegisterAllocationPass.h"
#include "Utils/MemberFunctionToPointer.h"
#include "Utils/variable_length_integer.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace {
// Quotient/remainder pair returned by the 128-bit division helpers below.
struct DivRem {
  uint64_t Quotient;
  uint64_t Remainder;
};

// Unsigned division of the 128-bit value SrcHigh:SrcLow by a 64-bit divisor.
// Both results are truncated to 64 bits. There is no check for Divisor == 0 in this helper.
static struct DivRem LUDIV(uint64_t SrcHigh, uint64_t SrcLow, uint64_t Divisor) {
  __uint128_t Source = (static_cast<__uint128_t>(SrcHigh) << 64) | SrcLow;
  return {
    .Quotient = (uint64_t)(Source / Divisor),
    .Remainder = (uint64_t)(Source % Divisor),
  };
}

// Signed counterpart of LUDIV: SrcHigh:SrcLow is reinterpreted as a signed 128-bit value.
// Both results are truncated to 64 bits. There is no check for Divisor == 0 in this helper.
static struct DivRem LDIV(uint64_t SrcHigh, uint64_t SrcLow, int64_t Divisor) {
  __int128_t Source = (static_cast<__uint128_t>(SrcHigh) << 64) | SrcLow;
  return {
    .Quotient = (uint64_t)(Source / Divisor),
    .Remainder = (uint64_t)(Source % Divisor),
  };
}

// Debug helper: logs a 64-bit value in hex.
static void PrintValue(uint64_t Value) {
  LogMan::Msg::DFmt("Value: 0x{:x}", Value);
}

// Debug helper: logs a 128-bit value in hex, upper half first.
static void PrintVectorValue(uint64_t Value, uint64_t ValueUpper) {
  LogMan::Msg::DFmt("Value: 0x{:016x}'{:016x}", ValueUpper, Value);
}
} // namespace

namespace FEXCore::CPU {

// Emits code for an IR op that has no dedicated emitter by calling a fallback handler.
//
// InterpreterOps::GetFallbackHandler supplies the handler index and its ABI. For each ABI the
// arguments are moved into the temp registers listed in the per-case comments, the ABIHandler and
// Func pointers are loaded through FALLBACK_HANDLER_OFFSET, and the ABIHandler is called with blr.
// LR is stored relative to rsp before the call and reloaded afterwards, since blr overwrites it.
// The result is then copied from VTMP1/VTMP2 or TMP1 into the register(s) allocated for the result.
//
// If no fallback handler exists, or the ABI is not one of the handled cases, only an assert message
// is produced (when assertions are enabled) and no code is emitted.
void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::Ref Node) {
  FallbackInfo Info;
  if (!InterpreterOps::GetFallbackHandler(IROp, &Info)) {
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
    LOGMAN_MSG_A_FMT("Unhandled IR Op: {}", FEXCore::IR::GetName(IROp->Op));
#endif
  } else {
    // Result helpers: copy the handler's return value out of the temp registers.
    // The x2 variants write a register pair passed in by the caller; the others write Node's register.
    auto FillF80x2Result = [&](auto DstLo, auto DstHi) {
      mov(DstLo.Q(), VTMP1.Q());
      mov(DstHi.Q(), VTMP2.Q());
    };

    auto FillF64x2Result = [&](auto DstLo, auto DstHi) {
      fmov(DstLo.D(), VTMP1.D());
      fmov(DstHi.D(), VTMP2.D());
    };

    auto FillF80Result = [&]() {
      const auto Dst = GetVReg(Node);
      mov(Dst.Q(), VTMP1.Q());
    };

    auto FillF64Result = [&]() {
      const auto Dst = GetVReg(Node);
      fmov(Dst.D(), VTMP1.D());
    };

    auto FillF32Result = [&]() {
      const auto Dst = GetVReg(Node);
      fmov(Dst.S(), VTMP1.S());
    };

    auto FillI64Result = [&]() {
      const auto Dst = GetReg(Node);
      mov(Dst.X(), TMP1);
    };

    auto FillI32Result = [&]() {
      const auto Dst = GetReg(Node);
      mov(Dst.W(), TMP1.W());
    };

    // Same 32-bit move as FillI32Result; no narrowing to 16 bits is done here.
    auto FillI16Result = [&]() {
      const auto Dst = GetReg(Node);
      mov(Dst.W(), TMP1.W());
    };

    switch (Info.ABI) {
    case FABI_F80_I16_F32_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      fmov(VTMP1.S(), Src1.S());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80Result();
    } break;

    case FABI_F80_I16_F64_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      fmov(VTMP1.D(), Src1.D());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80Result();
    } break;

    case FABI_F80_I16_I16_PTR:
    case FABI_F80_I16_I32_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // tmp2 (x1/x11): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetReg(IROp->Args[0]);
      // Need to sign or zero extend this for the dispatcher handler.
      // The 16-bit variant is sign-extended; the 32-bit variant is a plain 32-bit move.
      if (Info.ABI == FABI_F80_I16_I16_PTR) {
        sxth(ARMEmitter::Size::i32Bit, TMP2, Src1);
      } else {
        mov(ARMEmitter::Size::i32Bit, TMP2, Src1);
      }
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80Result();
    } break;

    case FABI_F32_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF32Result();
    } break;

    case FABI_F64_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF64Result();
    } break;

    case FABI_F64_F64_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      fmov(VTMP1.D(), Src1.D());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF64Result();
    } break;

    case FABI_F64x2_F64_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source; also read back as the low result
      // vtmp2 (v1/v17): read back as the high result (no source is loaded into it here)

#ifdef VIXL_SIMULATOR
      LOGMAN_THROW_A_FMT(CTX->Config.DisableVixlIndirectCalls, "Vector register pairs unsupported by simulator currently");
#endif

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      // Args[1] and Args[2] name the destination register pair rather than inputs.
      const auto Src1 = GetVReg(IROp->Args[0]);
      const auto DstLo = GetVReg(IROp->Args[1]);
      const auto DstHi = GetVReg(IROp->Args[2]);
      fmov(VTMP1.D(), Src1.D());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF64x2Result(DstLo, DstHi);
    } break;

    case FABI_F64_F64_F64_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1
      // vtmp2 (v1/v17): vector source 2

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      const auto Src2 = GetVReg(IROp->Args[1]);
      fmov(VTMP1.D(), Src1.D());
      fmov(VTMP2.D(), Src2.D());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF64Result();
    } break;

    case FABI_I16_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI16Result();
    } break;

    case FABI_I32_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI32Result();
    } break;

    case FABI_I64_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): source

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI64Result();
    } break;

    case FABI_I64_I16_F80_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1
      // vtmp2 (v1/v17): vector source 2

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      const auto Src2 = GetVReg(IROp->Args[1]);
      mov(VTMP1.Q(), Src1.Q());
      mov(VTMP2.Q(), Src2.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI64Result();
    } break;

    case FABI_F80_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80Result();
    } break;

    case FABI_F80x2_I16_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1; also read back as the low result
      // vtmp2 (v1/v17): read back as the high result (no source is loaded into it here)

#ifdef VIXL_SIMULATOR
      LOGMAN_THROW_A_FMT(CTX->Config.DisableVixlIndirectCalls, "Vector register pairs unsupported by simulator currently");
#endif

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      // Args[1] and Args[2] name the destination register pair rather than inputs.
      const auto Src1 = GetVReg(IROp->Args[0]);
      const auto DstLo = GetVReg(IROp->Args[1]);
      const auto DstHi = GetVReg(IROp->Args[2]);
      mov(VTMP1.Q(), Src1.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80x2Result(DstLo, DstHi);
    } break;

    case FABI_F80_I16_F80_F80_PTR: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1
      // vtmp2 (v1/v17): vector source 2

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(IROp->Args[0]);
      const auto Src2 = GetVReg(IROp->Args[1]);
      mov(VTMP1.Q(), Src1.Q());
      mov(VTMP2.Q(), Src2.Q());
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP1);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillF80Result();
    } break;

    case FABI_I32_I64_I64_V128_V128_I16: {
      // Linux Reg/Win32 Reg:
      // stack: FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1
      // vtmp2 (v1/v17): vector source 2
      // tmp1 (x0/x10): source 1
      // tmp2 (x1/x11): source 2
      // tmp3 (x2/x12): source 3
      const auto Op = IROp->C();

      // TMP1..TMP3 carry arguments in this ABI, so the ABIHandler goes in TMP4 and the Func pointer
      // is stored next to LR with stp instead of staying in a register.
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP1, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      stp(TMP1, ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto SrcRAX = GetReg(Op->RAX);
      const auto SrcRDX = GetReg(Op->RDX);
      const auto Control = Op->Control;

      mov(TMP1, SrcRAX.X());
      mov(TMP2, SrcRDX.X());
      movz(ARMEmitter::Size::i32Bit, TMP3, Control);

      const auto Src1 = GetVReg(Op->LHS);
      const auto Src2 = GetVReg(Op->RHS);

      mov(VTMP1.Q(), Src1.Q());
      mov(VTMP2.Q(), Src2.Q());

      blr(TMP4);
      // Reload LR; the first slot of the pair is loaded into zr, i.e. discarded.
      ldp(ARMEmitter::XReg::zr, ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI32Result();
    } break;

    case FABI_I32_V128_V128_I16: {
      // Linux Reg/Win32 Reg:
      // tmp4 (x4/x13): FallbackHandler
      // x30: return
      // vtmp1 (v0/v16): vector source 1
      // vtmp2 (v1/v17): vector source 2
      // tmp1 (x0/x10): source 1
      const auto Op = IROp->C();

      str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);

      const auto Src1 = GetVReg(Op->LHS);
      const auto Src2 = GetVReg(Op->RHS);
      const auto Control = Op->Control;

      mov(VTMP1.Q(), Src1.Q());
      mov(VTMP2.Q(), Src2.Q());
      movz(ARMEmitter::Size::i32Bit, TMP1, Control);

      // TMP1 holds the control argument here, so the ABIHandler is called through TMP2.
      ldr(TMP2, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, ABIHandler));
      ldr(TMP4, FALLBACK_HANDLER_OFFSET(Info.HandlerIndex, Func));
      blr(TMP2);

      ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

      FillI32Result();
    } break;

    case FABI_UNKNOWN:
    default:
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
      LOGMAN_MSG_A_FMT("Unhandled IR Fallback ABI: {} {}", FEXCore::IR::GetName(IROp->Op), ToUnderlying(Info.ABI));
#endif
      break;
    }
  }
}

static void DirectBlockDelinker(FEXCore::Context::ExitFunctionLinkData* Record, bool Call) {
  uintptr_t JumpThunkStartAddress = reinterpret_cast(Record) - 0x10;
  uintptr_t CallerAddress = JumpThunkStartAddress + Record->CallerOffset;
  auto BranchOffset = JumpThunkStartAddress / 4 - CallerAddress / 4;

  // Replace the patched callsite with a branch to the jump thunk.
uint32_t BranchInst = 0; ARMEmitter::Emitter BranchEmit(reinterpret_cast(&BranchInst), 4); if (Call) { BranchEmit.bl(BranchOffset); } else { BranchEmit.b(BranchOffset); } std::atomic_ref(*reinterpret_cast(CallerAddress)).store(BranchInst, std::memory_order::relaxed); ARMEmitter::Emitter::ClearICache(reinterpret_cast(CallerAddress), 4); } static void IndirectBlockDelinker(FEXCore::Context::ExitFunctionLinkData* Record) { uintptr_t JumpThunkStartAddress = reinterpret_cast(Record) - 0x10; uint32_t BranchInst = 0; ARMEmitter::Emitter BranchEmit(reinterpret_cast(&BranchInst), 4); // Restore branch +2 instructions to jump to the linker block BranchEmit.b(0x2); std::atomic_ref(*reinterpret_cast(JumpThunkStartAddress)).store(BranchInst, std::memory_order::relaxed); ARMEmitter::Emitter::ClearICache(reinterpret_cast(JumpThunkStartAddress), 4); // No need to reset HostCode here as the exit linker pointer is stored separately, and if the block is relinked it will be updated. } uint64_t Arm64JITCore::ExitFunctionLink(FEXCore::Core::CpuStateFrame* Frame, FEXCore::Context::ExitFunctionLinkData* Record) { auto Thread = Frame->Thread; bool TFSet = Thread->CurrentFrame->State.flags[X86State::RFLAG_TF_RAW_LOC]; uintptr_t HostCode {}; auto GuestRip = Record->GuestRIP; if (TFSet) { // If TF is set, the cache must be skipped as different code needs to be generated. Frame->State.rip = GuestRip; return Frame->Pointers.DispatcherLoopTop; } else { { // Guard the LookupCache lock with the code invalidation mutex, to avoid issues with forking auto lk_inval = GuardSignalDeferringSection(static_cast(Thread->CTX)->CodeInvalidationMutex, Thread); HostCode = Thread->LookupCache->FindBlock(Thread, GuestRip); } if (!HostCode) { // Hold a reference to the code buffer, to avoid linking unmapped code if compilation triggers a recreation. 
auto CodeBuffer = static_cast(Thread->CPUBackend.get())->CurrentCodeBuffer; HostCode = static_cast(Thread->CTX)->CompileBlock(Frame, GuestRip, 0); if (Thread->LookupCache->Shared != CodeBuffer->LookupCache.get()) { return HostCode; } } } // See ExitFunction in BranchOps.cpp for an assembly level view of the handled cases. uintptr_t JumpThunkStartAddress = reinterpret_cast(Record) - 0x10; uintptr_t CallerAddress = JumpThunkStartAddress + Record->CallerOffset; auto BranchOffset = HostCode / 4 - CallerAddress / 4; uint32_t ExpectedKnownCallMarkerInst = 0; ARMEmitter::Emitter ExpectedKnownCallMarkerEmit(reinterpret_cast(&ExpectedKnownCallMarkerInst), 4); ExpectedKnownCallMarkerEmit.adr(TMP1, 0xC); // Guard the LookupCache lock with the code invalidation mutex, to avoid issues with forking auto lk_inval = GuardSignalDeferringSection(static_cast(Thread->CTX)->CodeInvalidationMutex, Thread); // Lock here is necessary to prevent simultaneous linking and delinking auto lk = Thread->LookupCache->AcquireWriteLock(); // For non-calls, this would extend into the block's code, however that's fine as an out-of-range adr would never // be generated avoiding any false positives. uintptr_t KnownCallMarkerAddr = CallerAddress - 0x8; uint32_t KnownCallMarkerInst = *reinterpret_cast(KnownCallMarkerAddr); if (ARMEmitter::Emitter::IsInt26(BranchOffset)) { // Directly patch the callsite with the appropriate branch instruction. 
uint32_t BranchInst = 0; ARMEmitter::Emitter BranchEmit(reinterpret_cast(&BranchInst), 4); if (KnownCallMarkerInst == ExpectedKnownCallMarkerInst) { BranchEmit.bl(BranchOffset); Thread->LookupCache->AddBlockLink( GuestRip, Record, [](FEXCore::Context::ExitFunctionLinkData* Record) { DirectBlockDelinker(Record, true); }, lk); } else { BranchEmit.b(BranchOffset); Thread->LookupCache->AddBlockLink( GuestRip, Record, [](FEXCore::Context::ExitFunctionLinkData* Record) { DirectBlockDelinker(Record, false); }, lk); } std::atomic_ref(*reinterpret_cast(CallerAddress)).store(BranchInst, std::memory_order::relaxed); ARMEmitter::Emitter::ClearICache(reinterpret_cast(CallerAddress), 4); } else { // This case is common between calls and jumps as the thunk callsite can be left untouched. std::atomic_ref(Record->HostCode).store(HostCode, std::memory_order::seq_cst); #ifdef ARCHITECTURE_arm64 // Make memory write visible to other threads reading the same location asm volatile("dc cvau, %0; dsb ish" : : "r"(Record->HostCode) :); #endif uint32_t LdrInst = 0; ARMEmitter::Emitter LdrEmit(reinterpret_cast(&LdrInst), 4); LdrEmit.ldr(TMP1, reinterpret_cast(&Record->HostCode) - JumpThunkStartAddress); std::atomic_ref(*reinterpret_cast(JumpThunkStartAddress)).store(LdrInst, std::memory_order::relaxed); ARMEmitter::Emitter::ClearICache(reinterpret_cast(JumpThunkStartAddress), 4); Thread->LookupCache->AddBlockLink(GuestRip, Record, IndirectBlockDelinker, lk); } return HostCode; } void Arm64JITCore::Op_NoOp(const IR::IROp_Header* IROp, IR::Ref Node) {} Arm64JITCore::Arm64JITCore(FEXCore::Context::ContextImpl* ctx, FEXCore::Core::InternalThreadState* Thread) : CPUBackend(*ctx, Thread) , Arm64Emitter(ctx) , HostSupportsSVE128 {ctx->HostFeatures.SupportsSVE128} , HostSupportsSVE256 {ctx->HostFeatures.SupportsSVE256} , HostSupportsAVX256 {ctx->HostFeatures.SupportsAVX && ctx->HostFeatures.SupportsSVE256} , HostSupportsRPRES {ctx->HostFeatures.SupportsRPRES} , HostSupportsAFP 
{ctx->HostFeatures.SupportsAFP}
  , CTX {ctx}
  , TempAllocator(ctx->CPUBackendAllocator, 0) {

  // Tell the register allocation pass how many registers of each class this backend provides.
  RAPass = Thread->PassManager->GetPass("RA");

  RAPass->AddRegisters(IR::RegClass::GPR, GeneralRegisters.size());
  RAPass->AddRegisters(IR::RegClass::GPRFixed, StaticRegisters.size());
  RAPass->AddRegisters(IR::RegClass::FPR, GeneralFPRegisters.size());
  RAPass->AddRegisters(IR::RegClass::FPRFixed, StaticFPRegisters.size());
  RAPass->PairRegs = PairRegisters;

  {
    // Set up pointers that the JIT needs to load

    // Common
    auto& Ptrs = ThreadState->CurrentFrame->Pointers;

    Ptrs.PrintValue = reinterpret_cast(PrintValue);
    Ptrs.PrintVectorValue = reinterpret_cast(PrintVectorValue);
    Ptrs.ThreadRemoveCodeEntryFromJIT = reinterpret_cast(&Context::ContextImpl::ThreadRemoveCodeEntryFromJit);
    Ptrs.MonoBackpatcherWrite = reinterpret_cast(&Context::ContextImpl::MonoBackpatcherWrite);
    Ptrs.CPUIDObj = reinterpret_cast(&CTX->CPUID);

    // Member functions are stored as an object pointer plus a plain function pointer.
    {
      FEXCore::Utils::MemberFunctionToPointerCast PMF(&FEXCore::CPUIDEmu::RunFunction);
      Ptrs.CPUIDFunction = PMF.GetConvertedPointer();
    }

    {
      FEXCore::Utils::MemberFunctionToPointerCast PMF(&FEXCore::CPUIDEmu::RunXCRFunction);
      Ptrs.XCRFunction = PMF.GetConvertedPointer();
    }

    {
      // HandleSyscall is resolved through the vtable of the concrete handler object.
      FEXCore::Utils::MemberFunctionToPointerCast PMF(&FEXCore::HLE::SyscallHandler::HandleSyscall);
      Ptrs.SyscallHandlerObj = reinterpret_cast(CTX->SyscallHandler);
      Ptrs.SyscallHandlerFunc = PMF.GetVTableEntry(CTX->SyscallHandler);
    }
    Ptrs.ExitFunctionLink = reinterpret_cast(&Arm64JITCore::ExitFunctionLink);

    // Fallbacks
    Ptrs.LUDIV = reinterpret_cast(LUDIV);
    Ptrs.LDIV = reinterpret_cast(LDIV);
  }

  // Start on the newest code buffer and share its lookup cache with this thread.
  CurrentCodeBuffer = CodeBuffers.GetLatest();
  ThreadState->LookupCache->Shared = CurrentCodeBuffer->LookupCache.get();
}

// Writes an identifying string into the code buffer and re-aligns the cursor afterwards.
void Arm64JITCore::EmitDetectionString() {
  const char JITString[] = "FEXJIT::Arm64JITCore::";
  EmitString(JITString);
  Align();
}

// Switches emission to an empty code buffer and moves the thread's guest-to-host mapping from the
// previous buffer to the current buffer's lookup cache, under the previous cache's write lock.
void Arm64JITCore::ClearCache() {
  // NOTE: Holding on to the reference here is required to ensure validity of the WriteLock mutex
  auto PrevCodeBuffer = CurrentCodeBuffer;

  auto lk = PrevCodeBuffer->LookupCache->AcquireWriteLock();

  auto CodeBuffer = GetEmptyCodeBuffer();
  SetBuffer(CodeBuffer->Ptr, CodeBuffer->AllocatedSize);
  EmitDetectionString();

  ThreadState->LookupCache->ChangeGuestToHostMapping(*PrevCodeBuffer, *CurrentCodeBuffer->LookupCache, lk);
}

Arm64JITCore::~Arm64JITCore() {}

// Returns true when WNode refers to an OP_INLINECONSTANT op; immediates return false.
// If Value is non-null it receives the constant.
bool Arm64JITCore::IsInlineConstant(const IR::OrderedNodeWrapper& WNode, uint64_t* Value) const {
  if (WNode.IsImmediate()) {
    return false;
  }

  auto OpHeader = IR->GetOp(WNode);

  if (OpHeader->Op == IR::IROps::OP_INLINECONSTANT) {
    auto Op = OpHeader->C();
    if (Value) {
      *Value = Op->Constant;
    }
    return true;
  } else {
    return false;
  }
}

// Returns true when WNode refers to an OP_INLINEENTRYPOINTOFFSET op; immediates return false.
// If Value is non-null it receives Entry + Offset, truncated to 32 bits when the op size is i32Bit.
bool Arm64JITCore::IsInlineEntrypointOffset(const IR::OrderedNodeWrapper& WNode, uint64_t* Value) const {
  if (WNode.IsImmediate()) {
    return false;
  }

  auto OpHeader = IR->GetOp(WNode);

  if (OpHeader->Op == IR::IROps::OP_INLINEENTRYPOINTOFFSET) {
    auto Op = OpHeader->C();
    if (Value) {
      uint64_t Mask = ~0ULL;
      const auto Size = OpHeader->Size;
      if (Size == IR::OpSize::i32Bit) {
        Mask = 0xFFFF'FFFFULL;
      }
      *Value = (Entry + Op->Offset) & Mask;
    }
    return true;
  } else {
    return false;
  }
}

// Emits the guest trap-flag (single-step) check that runs at an entry point.
// The flag byte at RFLAG_TF_RAW_LOC is read as:
//  - zero: TF unset, fall through with no further work.
//  - bit 1 clear (but non-zero): TF is blocked for this instruction; the byte is rewritten to 0b11.
//  - bit 1 set: bit 1 is cleared, SynchronousFaultData is filled for a SIGTRAP/#DB, and control
//    jumps to Pointers.GuestSignal_SIGTRAP.
void Arm64JITCore::EmitTFCheck() {
  ARMEmitter::ForwardLabel l_TFUnset;
  ARMEmitter::ForwardLabel l_TFBlocked;

  // Note that this needs to be before the below suspend checks, as X86 checks this flag immediately after executing an instruction.
  ldrb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC]));

  (void)cbz(ARMEmitter::Size::i32Bit, TMP1, &l_TFUnset);

  // X86 semantically checks TF after executing each instruction, so e.g. setting a context with TF set will execute a single instruction
  // and then raise an exception. However on the FEX side this is simpler to implement by checking at the start of each instruction, handle this by having bit 1 being unset in the flag state indicate that TF is blocked for a single instruction.
  (void)tbz(TMP1, 1, &l_TFBlocked);

  // Block TF for a single instruction when the frontend jumps to a new context by unsetting bit 1.
  ldrb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC]));
  and_(ARMEmitter::Size::i32Bit, TMP1, TMP1, ~(1 << 1));
  strb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC]));

  Core::CpuStateFrame::SynchronousFaultDataStruct State = {
    .FaultToTopAndGeneratedException = 1,
    .Signal = Core::FAULT_SIGTRAP,
    .TrapNo = X86State::X86_TRAPNO_DB,
    .si_code = 2,
    .err_code = 0,
  };

  // The fault data is written with a single 64-bit store of its raw bytes.
  // NOTE(review): relies on sizeof(State) <= sizeof(uint64_t) - confirm against the struct definition.
  uint64_t Constant {};
  memcpy(&Constant, &State, sizeof(State));

  LoadConstant(ARMEmitter::Size::i64Bit, TMP1, Constant);
  str(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, SynchronousFaultData));
  ldr(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.GuestSignal_SIGTRAP));
  br(TMP1);

  (void)Bind(&l_TFBlocked);
  // If TF was blocked for this instruction, unblock it for the next.
  LoadConstant(ARMEmitter::Size::i32Bit, TMP1, 0b11);
  strb(TMP1, STATE_PTR(CpuStateFrame, State.flags[X86State::RFLAG_TF_RAW_LOC]));
  (void)Bind(&l_TFUnset);
}

// Emits the checks that let a pending suspend/interrupt take effect at this point in the code.
void Arm64JITCore::EmitSuspendInterruptCheck() {
  if (CTX->Config.NeedsPendingInterruptFaultCheck) {
    // Trigger a fault if there are any pending interrupts
    // Used only for suspend on WIN32 at the moment
    strb(ARMEmitter::XReg::zr, STATE,
         offsetof(FEXCore::Core::InternalThreadState, InterruptFaultPage) - offsetof(FEXCore::Core::InternalThreadState, BaseFrameState));
  }

#ifdef ARCHITECTURE_arm64ec
  // On arm64ec, a non-zero SuspendDoorbell raises a brk with the SuspendMagic immediate.
  static constexpr uint16_t SuspendMagic {0xCAFE};

  ldr(TMP2.W(), STATE_PTR(CpuStateFrame, SuspendDoorbell));
  ARMEmitter::ForwardLabel l_NoSuspend;
  (void)cbz(ARMEmitter::Size::i32Bit, TMP2, &l_NoSuspend);
  brk(SuspendMagic);
  (void)Bind(&l_NoSuspend);
#endif
}

// Emits the prologue for a guest entry point:
//  1. stores the address of the block's JITCodeHeader into the CPU state,
//  2. optionally emits the trap-flag check,
//  3. reserves SpillSlots * MaxSpillSlotSize bytes of stack when the block has spill slots,
//  4. emits the suspend/interrupt check.
void Arm64JITCore::EmitEntryPoint(ARMEmitter::BackwardLabel& HeaderLabel, bool CheckTF) {
  // Get the address of the JITCodeHeader and store in to the core state.
  // Two instruction cost, each 1 cycle.
  adr_OrRestart(TMP1, &HeaderLabel);
  str(TMP1, STATE, offsetof(FEXCore::Core::CPUState, InlineJITBlockHeader));

  if (CheckTF) {
    EmitTFCheck();
  }

  if (SpillSlots) {
    const auto TotalSpillSlotsSize = SpillSlots * MaxSpillSlotSize;

    // Use the immediate form of sub when the size is encodable, otherwise go through TMP1.
    if (ARMEmitter::IsImmAddSub(TotalSpillSlotsSize)) {
      sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, TotalSpillSlotsSize);
    } else {
      LoadConstant(ARMEmitter::Size::i64Bit, TMP1, TotalSpillSlotsSize);
      sub(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, TMP1, ARMEmitter::ExtendedType::LSL_64, 0);
    }
  }

  EmitSuspendInterruptCheck();
}

CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR,
                                                   FEXCore::Core::DebugData* DebugData, bool CheckTF) {
  FEXCORE_PROFILE_SCOPED("Arm64::CompileCode");

  const auto PrevNumAllocations = Relocations.size();

  this->Entry = Entry;
  this->DebugData = DebugData;
  this->IR = IR;

  RequiresFarARM64Jumps = false;
  SSANodeMultiplier = 24;

  // Prepare restart via long jump in case branch encoding fails.
  // This uses UncheckedLongJump since we don't implement std::longjmp in WoA setups
  switch (static_cast(FEXCore::UncheckedLongJump::SetJump(ThreadState->RestartJump))) {
  case RestartOptions::Control::Incoming:
    // Nothing
    break;
  case RestartOptions::Control::EnableFarARM64Jumps: RequiresFarARM64Jumps = true; break;
  case RestartOptions::Control::NeedsLargerJITSpace:
    // Get rid of the claimed buffer immediately, we can't fit in it at all.
TempAllocator.UnclaimBuffer(); SSANodeMultiplier *= 2; break; default: LOGMAN_MSG_A_FMT("Unhandled Arm64 restart condition!"); } uint32_t SSACount = IR->GetSSACount(); JumpTargets.clear(); CallReturnTargets.clear(); PendingJumpThunks.clear(); JumpTargets.resize(IR->GetHeader()->BlockCount, {}); Relocations.resize(PrevNumAllocations, FEXCore::CPU::Relocation::Default()); // Discard any relocations generated from a previous attempt CodeData.EntryPoints.clear(); // Fairly excessive buffer range to make sure we don't overflow // One page baseline, plus SSANodeMultipler bytes, plus another page for guard page. const uint32_t DesiredBufferRange = AlignUp(FEXCore::Utils::FEX_PAGE_SIZE * 2 + SSACount * SSANodeMultiplier, FEXCore::Utils::FEX_PAGE_SIZE); // JIT output is first written to a temporary buffer and later relocated to the CodeBuffer. // This minimizes lock contention of CodeBufferWriteMutex. auto TempCodeBufferInfo = TempAllocator.ReownOrClaimBufferWithSize(DesiredBufferRange); auto TempCodeBuffer = TempCodeBufferInfo.Ptr; const uint32_t UsableBufferRange = TempCodeBufferInfo.Size - FEXCore::Utils::FEX_PAGE_SIZE; SetBuffer(TempCodeBuffer, UsableBufferRange); ThreadState->JITGuardPage = reinterpret_cast(TempCodeBuffer) + UsableBufferRange; ThreadState->JITGuardOverflowArgument = FEXCore::ToUnderlying(RestartOptions::Control::NeedsLargerJITSpace); CodeData.BlockBegin = GetCursorAddress(); // Put the code header at the start of the data block. 
ARMEmitter::BackwardLabel JITCodeHeaderLabel {}; (void)Bind(&JITCodeHeaderLabel); JITCodeHeader* CodeHeader = GetCursorAddress(); CursorIncrement(sizeof(JITCodeHeader)); auto CodeBegin = GetCursorAddress(); // AAPCS64 // r30 = LR // r29 = FP // r19..r28 = Callee saved // r18 = Platform Register (Matters if we target Windows or iOS) // r16..r17 = Inter-procedure scratch // r9..r15 = Temp // r8 = Indirect Result // r0...r7 = Parameter/Results // // FPRS: // v8..v15 = (lower 64bits) Callee saved // Our allocation: // X0 = ThreadState // X1 = MemBase // // X1-X3 = Temp // X4-r18 = RA SpillSlots = IR->SpillSlots(); PendingTargetLabel = nullptr; PendingCallReturnTargetLabel = nullptr; for (auto [BlockNode, BlockHeader] : IR->GetBlocks()) { using namespace FEXCore::IR; auto BlockIROp = BlockHeader->CW(); #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED LOGMAN_THROW_A_FMT(BlockIROp->Header.Op == IR::OP_CODEBLOCK, "IR type failed to be a code block"); #endif auto BlockStartHostCode = GetCursorAddress(); { const auto Node = IR->GetID(BlockNode); const auto Target = &JumpTargets[BlockIROp->ID]; // if there's a pending branch, and it is not fall-through if (PendingTargetLabel && PendingTargetLabel != Target) { if (PendingTargetLabel->Backward.Location) { EmitSuspendInterruptCheck(); } b_OrRestart(PendingTargetLabel); PendingTargetLabel = nullptr; } if (BlockIROp->EntryPoint) { uint64_t BlockStartRIP = Entry + BlockIROp->GuestEntryOffset; const auto IsReturnTarget = CallReturnTargets.try_emplace(Node).first; if (PendingTargetLabel) { // If there is a fallthrough branch to this block, skip over the entrypoint code. b_OrRestart(Target); } else if (PendingCallReturnTargetLabel && PendingCallReturnTargetLabel != &IsReturnTarget->second) { // If we just emitted a call, but the block we're now emitting is not the return block so don't fallthrough. 
b_OrRestart(PendingCallReturnTargetLabel); } PendingCallReturnTargetLabel = nullptr; BindOrRestart(&IsReturnTarget->second); CodeData.EntryPoints.emplace(BlockStartRIP, GetCursorAddress()); DebugData->GuestOpcodes.push_back({BlockIROp->GuestEntryOffset, GetCursorAddress() - CodeData.BlockBegin}); EmitEntryPoint(JITCodeHeaderLabel, CheckTF); } if (PendingCallReturnTargetLabel) { // If there is still a pending call return target, then the block we're emitting is not the return block so don't fallthrough. b_OrRestart(PendingCallReturnTargetLabel); PendingCallReturnTargetLabel = nullptr; } PendingTargetLabel = nullptr; BindOrRestart(Target); } for (auto [CodeNode, IROp] : IR->GetCode(BlockNode)) { switch (IROp->Op) { #define REGISTER_OP(op, x) \ case FEXCore::IR::IROps::OP_##op: Op_##x(IROp, CodeNode); break #define IROP_DISPATCH_DISPATCH #include #undef REGISTER_OP default: Op_Unhandled(IROp, CodeNode); break; } } DebugData->Subblocks.push_back({static_cast(BlockStartHostCode - CodeData.BlockBegin), static_cast(GetCursorAddress() - BlockStartHostCode)}); } // Make sure last branch is generated. It certainly can't be eliminated here. if (PendingTargetLabel) { if (PendingTargetLabel->Backward.Location) { EmitSuspendInterruptCheck(); } b_OrRestart(PendingTargetLabel); } PendingTargetLabel = nullptr; ARMEmitter::ForwardLabel l_ExitLink; for (auto& PendingJumpThunk : PendingJumpThunks) { // Align as 64-bit atomics are used on the HostCode field. 
Align(8); ARMEmitter::ForwardLabel l_DoLink; uint64_t ThunkAddress = GetCursorAddress(); BindOrRestart(&PendingJumpThunk.Label); b_OrRestart(&l_DoLink); br(TMP1); BindOrRestart(&l_DoLink); ldr(TMP1, &l_ExitLink); blr(TMP1); // This is a ExitFunctionLinkData struct BindOrRestart(&l_ExitLink); dc64(0); // HostCode PlaceNamedSymbolLiteral(InsertGuestRIPLiteral(PendingJumpThunk.GuestRIP)); // GuestRIP dc64(PendingJumpThunk.CallerAddress - ThunkAddress); // CallerOffset } BindOrRestart(&l_ExitLink); PlaceNamedSymbolLiteral(InsertNamedSymbolLiteral(RelocNamedSymbolLiteral::NamedSymbol::SYMBOL_LITERAL_EXITFUNCTION_LINKER)); // CodeSize not including the header or tail data. const uint64_t CodeOnlySize = GetCursorAddress() - CodeBegin; // Add the JitCodeTail (written later) Align(alignof(JITCodeTail)); const auto JITBlockTailLocation = GetCursorAddress(); CodeHeader->OffsetToBlockTail = JITBlockTailLocation - CodeData.BlockBegin; JITCodeTail JITBlockTail { .RIP = Entry, .GuestSize = Size, .SpinLockFutex = 0, .SingleInst = SingleInst, }; // Entries that live after the JITCodeTail. // These entries correlate JIT code regions with guest RIP regions. // Using these entries FEX is able to reconstruct the guest RIP accurately when an instruction cause a signal fault. // Packed using two variable length integer entries to ensure the size isn't too large. // These smaller sizes means that each entry is relative to each other instead of absolute offset from the start of the JIT block. // When reconstructing the RIP, each entry must be walked linearly and accumulated with the previous entries. // This is a trade-off between compression inside the JIT code space and execution time when reconstruction the RIP. // RIP reconstruction when faulting is less likely so we are requiring the accumulation. // // struct { // // The Host PC offset from the previous entry. // FEXCore::Utils::vl64 HostPCOffset; // // How much to offset the RIP from the previous entry. 
// FEXCore::Utils::vl64 GuestRIPOffset; // }; const auto JITRIPEntriesBegin = JITBlockTailLocation + sizeof(JITBlockTail); auto JITRIPEntriesLocation = JITRIPEntriesBegin; { // Store the RIP entries. JITBlockTail.NumberOfRIPEntries = DebugData->GuestOpcodes.size(); JITBlockTail.OffsetToRIPEntries = JITRIPEntriesBegin - JITBlockTailLocation; uintptr_t CurrentRIPOffset = 0; uint64_t CurrentPCOffset = 0; for (size_t i = 0; i < DebugData->GuestOpcodes.size(); i++) { const auto& GuestOpcode = DebugData->GuestOpcodes[i]; int64_t HostPCOffset = GuestOpcode.HostEntryOffset - CurrentPCOffset; int64_t GuestRIPOffset = GuestOpcode.GuestEntryOffset - CurrentRIPOffset; JITRIPEntriesLocation += FEXCore::Utils::vl64pair::Encode(JITRIPEntriesLocation, HostPCOffset, GuestRIPOffset); CurrentPCOffset = GuestOpcode.HostEntryOffset; CurrentRIPOffset = GuestOpcode.GuestEntryOffset; } } SetCursorOffset(JITRIPEntriesLocation - CodeData.BlockBegin); Align(); CodeData.Size = GetCursorAddress() - CodeData.BlockBegin; // Finalize and write block tail data JITBlockTail.Size = CodeData.Size; { auto PrevCur = GetCursorOffset(); memcpy(JITBlockTailLocation, &JITBlockTail, sizeof(JITBlockTail)); SetCursorOffset(JITBlockTailLocation - CodeData.BlockBegin + offsetof(JITCodeTail, RIP)); PlaceNamedSymbolLiteral(InsertGuestRIPLiteral(JITBlockTail.RIP)); SetCursorOffset(PrevCur); } // Migrate the compile output from temporary storage to the actual CodeBuffer. // This can block progress in other compiling threads, so the duration of the lock should be as small as possible. 
{ auto CodeBufferLock = std::unique_lock {CodeBuffers.CodeBufferWriteMutex}; // Query size of generated code const auto TempSize = GetCursorOffset(); // Bring CodeBuffer up to date { LOGMAN_THROW_A_FMT(CurrentCodeBuffer->LookupCache.get() == ThreadState->LookupCache->Shared, "INVARIANT VIOLATED: SharedLookupCache " "doesn't match up!\n"); if (auto Prev = CheckCodeBufferUpdate()) { Allocator::VirtualDontNeed(ThreadState->CallRetStackBase, FEXCore::Core::InternalThreadState::CALLRET_STACK_SIZE); auto lk = ThreadState->LookupCache->AcquireWriteLock(); ThreadState->LookupCache->ChangeGuestToHostMapping(*Prev, *CurrentCodeBuffer->LookupCache, lk); } // NOTE: 16-byte alignment of the new cursor offset must be preserved for block linking records SetBuffer(CurrentCodeBuffer->Ptr, CurrentCodeBuffer->AllocatedSize); SetCursorOffset(CodeBuffers.LatestOffset); Align16B(); if ((GetCursorOffset() + TempSize) > CurrentCodeBuffer->UsableSize()) { CTX->ClearCodeCache(ThreadState); } CodeBuffers.LatestOffset = GetCursorOffset(); } // Adjust host addresses const auto Delta = GetCursorAddress() - CodeData.BlockBegin; CodeData.BlockBegin += Delta; for (auto& EntryPoint : CodeData.EntryPoints) { EntryPoint.second += Delta; } CodeBegin += Delta; for (std::size_t Idx = PrevNumAllocations; Idx != Relocations.size(); ++Idx) { Relocations[Idx].Header.Offset += CodeBuffers.LatestOffset; } // Copy over CodeBuffer contents memcpy(GetCursorAddress(), TempCodeBuffer, TempSize); SetCursorOffset(CodeBuffers.LatestOffset + TempSize); CodeBuffers.LatestOffset = GetCursorOffset(); } TempAllocator.DelayedDisownBuffer(); ClearICache(CodeBegin, CodeOnlySize); #ifdef VIXL_DISASSEMBLER if (Disassemble() & FEXCore::Config::Disassemble::STATS) { auto HeaderOp = IR->GetHeader(); LOGMAN_THROW_A_FMT(HeaderOp->Header.Op == IR::OP_IRHEADER, "First op wasn't IRHeader"); LogMan::Msg::IFmt("RIP: 0x{:x}", Entry); LogMan::Msg::IFmt("Guest Code instructions: {}", HeaderOp->NumHostInstructions); LogMan::Msg::IFmt("Host 
Code instructions: {}", CodeOnlySize >> 2); LogMan::Msg::IFmt("Blow-up Amt: {}x", double(CodeOnlySize >> 2) / double(HeaderOp->NumHostInstructions)); } if (Disassemble() & FEXCore::Config::Disassemble::BLOCKS) { const auto DisasmBegin = reinterpret_cast(CodeBegin); const auto DisasmEnd = reinterpret_cast(CodeBegin + CodeOnlySize); LogMan::Msg::IFmt("Disassemble Begin"); for (auto PCToDecode = DisasmBegin; PCToDecode < DisasmEnd; PCToDecode += 4) { DisasmDecoder->Decode(PCToDecode); auto Output = Disasm->GetOutput(); LogMan::Msg::IFmt("{}", Output); } LogMan::Msg::IFmt("Disassemble End"); } #endif DebugData->HostCodeSize = CodeData.Size; DebugData->Relocations = &Relocations; this->IR = nullptr; return std::move(CodeData); } void Arm64JITCore::ResetStack() { if (SpillSlots == 0) { return; } const auto TotalSpillSlotsSize = SpillSlots * MaxSpillSlotSize; if (ARMEmitter::IsImmAddSub(TotalSpillSlotsSize)) { add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, TotalSpillSlotsSize); } else { // Too big to fit in a 12bit immediate LoadConstant(ARMEmitter::Size::i64Bit, TMP1, TotalSpillSlotsSize); add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, TMP1, ARMEmitter::ExtendedType::LSL_64, 0); } } fextl::unique_ptr CreateArm64JITCore(FEXCore::Context::ContextImpl* ctx, FEXCore::Core::InternalThreadState* Thread) { return fextl::make_unique(ctx, Thread); } } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/JITClass.h ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 $end_info$ */ #pragma once #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/CPUBackend.h" #include "Interface/Core/Dispatcher/Dispatcher.h" #include "Interface/Core/JIT/Relocations.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" #include "Interface/IR/RegisterAllocationData.h" #include 
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore::Core { struct InternalThreadState; } namespace FEXCore::Context { struct ExitFunctionLinkData; } namespace FEXCore::IR { class RegisterAllocationPass; } namespace FEXCore::CPU { class Arm64JITCore final : public CPUBackend, public Arm64Emitter { public: explicit Arm64JITCore(FEXCore::Context::ContextImpl* ctx, FEXCore::Core::InternalThreadState* Thread); ~Arm64JITCore() override; [[nodiscard]] CPUBackend::CompiledCode CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, bool CheckTF) override; void ClearCache() override; void ClearRelocations() override { Relocations.clear(); } private: const bool HostSupportsSVE128 {}; const bool HostSupportsSVE256 {}; const bool HostSupportsAVX256 {}; const bool HostSupportsRPRES {}; const bool HostSupportsAFP {}; struct RestartOptions { enum class Control : uint64_t { Incoming = 0, EnableFarARM64Jumps = 1, NeedsLargerJITSpace = 2, }; }; // FEXCore makes assumptions in the JIT about certain conditions being true. // In the rare case when those assumptions are broken, FEX needs to safely restart the JIT. RestartOptions RestartControl {}; bool RequiresFarARM64Jumps {}; // Default to 6 instructions per SSA node. 
uint32_t SSANodeMultiplier {24}; ARMEmitter::BiDirectionalLabel* PendingTargetLabel {}; ARMEmitter::BiDirectionalLabel* PendingCallReturnTargetLabel {}; FEXCore::Context::ContextImpl* CTX {}; const FEXCore::IR::IRListView* IR {}; uint64_t Entry {}; CPUBackend::CompiledCode CodeData {}; fextl::vector JumpTargets; ARMEmitter::BiDirectionalLabel* JumpTarget(IR::OrderedNodeWrapper Node) { auto Block = IR->GetOp(Node); return &JumpTargets[Block->ID]; } fextl::map CallReturnTargets; struct PendingJumpThunk { uint64_t CallerAddress; uint64_t GuestRIP; ARMEmitter::ForwardLabel Label; }; fextl::vector PendingJumpThunks; Utils::PoolBufferWithTimedRetirement TempAllocator; static uint64_t ExitFunctionLink(FEXCore::Core::CpuStateFrame* Frame, FEXCore::Context::ExitFunctionLinkData* Record); [[nodiscard]] ARMEmitter::Register GetReg(IR::PhysicalRegister Reg) const { const auto RegClass = Reg.AsRegClass(); LOGMAN_THROW_A_FMT(RegClass == IR::RegClass::GPRFixed || RegClass == IR::RegClass::GPR, "Unexpected Class: {}", Reg.Class); if (RegClass == IR::RegClass::GPRFixed) { return StaticRegisters[Reg.Reg]; } else if (RegClass == IR::RegClass::GPR) { return GeneralRegisters[Reg.Reg]; } FEX_UNREACHABLE; } [[nodiscard]] ARMEmitter::Register GetReg(IR::Ref Node) const { return GetReg(IR::PhysicalRegister(Node)); } [[nodiscard]] ARMEmitter::Register GetReg(IR::OrderedNodeWrapper Wrap) const { return GetReg(IR::PhysicalRegister(Wrap)); } [[nodiscard]] ARMEmitter::VRegister GetVReg(IR::PhysicalRegister Reg) const { const auto RegClass = Reg.AsRegClass(); LOGMAN_THROW_A_FMT(RegClass == IR::RegClass::FPRFixed || RegClass == IR::RegClass::FPR, "Unexpected Class: {}", Reg.Class); if (RegClass == IR::RegClass::FPRFixed) { return StaticFPRegisters[Reg.Reg]; } else if (RegClass == IR::RegClass::FPR) { return GeneralFPRegisters[Reg.Reg]; } FEX_UNREACHABLE; } [[nodiscard]] ARMEmitter::VRegister GetVReg(IR::Ref Node) const { return GetVReg(IR::PhysicalRegister(Node)); } [[nodiscard]] 
ARMEmitter::VRegister GetVReg(IR::OrderedNodeWrapper Wrap) const { return GetVReg(IR::PhysicalRegister(Wrap)); } [[nodiscard]] static IR::RegClass GetRegClass(IR::Ref Node) { return IR::PhysicalRegister(Node).AsRegClass(); } [[nodiscard]] ARMEmitter::Register GetZeroableReg(IR::OrderedNodeWrapper Src) const { uint64_t Const; if (IsInlineConstant(Src, &Const)) { LOGMAN_THROW_A_FMT(Const == 0, "Only valid constant"); return ARMEmitter::Reg::zr; } else { return GetReg(Src); } } // Converts IR-base shift type to ARMEmitter shift type. // Will be a no-op, only a type conversion since the two definitions match. [[nodiscard]] static ARMEmitter::ShiftType ConvertIRShiftType(IR::ShiftType Shift) { return Shift == IR::ShiftType::LSL ? ARMEmitter::ShiftType::LSL : Shift == IR::ShiftType::LSR ? ARMEmitter::ShiftType::LSR : Shift == IR::ShiftType::ASR ? ARMEmitter::ShiftType::ASR : ARMEmitter::ShiftType::ROR; } [[nodiscard]] static ARMEmitter::Size ConvertSize(const IR::IROp_Header* Op) { return Op->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; } [[nodiscard]] static ARMEmitter::Size ConvertSize48(const IR::IROp_Header* Op) { LOGMAN_THROW_A_FMT(Op->Size == IR::OpSize::i32Bit || Op->Size == IR::OpSize::i64Bit, "Invalid size"); return ConvertSize(Op); } [[nodiscard]] static ARMEmitter::Size ConvertSize(IR::OpSize Size) { return Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize16(IR::OpSize ElementSize) { LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, "Invalid size"); return ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i8Bit : ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : ElementSize == IR::OpSize::i32Bit ? 
ARMEmitter::SubRegSize::i32Bit : ElementSize == IR::OpSize::i64Bit ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i128Bit; } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize16(const IR::IROp_Header* Op) { return ConvertSubRegSize16(Op->ElementSize); } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize8(IR::OpSize ElementSize) { LOGMAN_THROW_A_FMT(ElementSize != IR::OpSize::i128Bit, "Invalid size"); return ConvertSubRegSize16(ElementSize); } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize8(const IR::IROp_Header* Op) { return ConvertSubRegSize8(Op->ElementSize); } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize4(const IR::IROp_Header* Op) { LOGMAN_THROW_A_FMT(Op->ElementSize != IR::OpSize::i64Bit, "Invalid size"); return ConvertSubRegSize8(Op); } [[nodiscard]] static ARMEmitter::SubRegSize ConvertSubRegSize248(const IR::IROp_Header* Op) { LOGMAN_THROW_A_FMT(Op->ElementSize != IR::OpSize::i8Bit, "Invalid size"); return ConvertSubRegSize8(Op); } [[nodiscard]] static ARMEmitter::VectorRegSizePair ConvertSubRegSizePair16(const IR::IROp_Header* Op) { return ARMEmitter::ToVectorSizePair(ConvertSubRegSize16(Op)); } [[nodiscard]] static ARMEmitter::VectorRegSizePair ConvertSubRegSizePair8(const IR::IROp_Header* Op) { LOGMAN_THROW_A_FMT(Op->ElementSize != IR::OpSize::i128Bit, "Invalid size"); return ConvertSubRegSizePair16(Op); } [[nodiscard]] static ARMEmitter::VectorRegSizePair ConvertSubRegSizePair248(const IR::IROp_Header* Op) { LOGMAN_THROW_A_FMT(Op->ElementSize != IR::OpSize::i8Bit, "Invalid size"); return ConvertSubRegSizePair8(Op); } [[nodiscard]] static ARMEmitter::Condition MapCC(IR::CondClass Cond) { switch (Cond) { case IR::CondClass::EQ: return ARMEmitter::Condition::CC_EQ; case IR::CondClass::NEQ: return ARMEmitter::Condition::CC_NE; case IR::CondClass::SGE: return ARMEmitter::Condition::CC_GE; case IR::CondClass::SLT: return ARMEmitter::Condition::CC_LT; case IR::CondClass::SGT: return 
ARMEmitter::Condition::CC_GT; case IR::CondClass::SLE: return ARMEmitter::Condition::CC_LE; case IR::CondClass::UGE: return ARMEmitter::Condition::CC_CS; case IR::CondClass::ULT: return ARMEmitter::Condition::CC_CC; case IR::CondClass::UGT: return ARMEmitter::Condition::CC_HI; case IR::CondClass::ULE: return ARMEmitter::Condition::CC_LS; case IR::CondClass::FLU: return ARMEmitter::Condition::CC_LT; case IR::CondClass::FGE: return ARMEmitter::Condition::CC_GE; case IR::CondClass::FLEU: return ARMEmitter::Condition::CC_LE; case IR::CondClass::FGT: return ARMEmitter::Condition::CC_GT; case IR::CondClass::FU: case IR::CondClass::VS: return ARMEmitter::Condition::CC_VS; case IR::CondClass::FNU: case IR::CondClass::VC: return ARMEmitter::Condition::CC_VC; case IR::CondClass::MI: return ARMEmitter::Condition::CC_MI; case IR::CondClass::PL: return ARMEmitter::Condition::CC_PL; default: LOGMAN_MSG_A_FMT("Unsupported compare type"); return ARMEmitter::Condition::CC_NV; } } [[nodiscard]] static bool IsFPR(IR::RegClass Class) { return Class == IR::RegClass::FPR || Class == IR::RegClass::FPRFixed; } [[nodiscard]] static bool IsGPR(IR::RegClass Class) { return Class == IR::RegClass::GPR || Class == IR::RegClass::GPRFixed; } [[nodiscard]] static bool IsGPR(IR::Ref Node) { return IsGPR(GetRegClass(Node)); } [[nodiscard]] static bool IsFPR(IR::Ref Node) { return IsFPR(GetRegClass(Node)); } [[nodiscard]] static bool IsGPR(IR::OrderedNodeWrapper Wrap) { return IsGPR(IR::PhysicalRegister(Wrap).AsRegClass()); } [[nodiscard]] static bool IsFPR(IR::OrderedNodeWrapper Wrap) { return IsFPR(IR::PhysicalRegister(Wrap).AsRegClass()); } [[nodiscard]] ARMEmitter::ExtendedMemOperand GenerateMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale); [[nodiscard]] ARMEmitter::Register ApplyMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, ARMEmitter::Register Tmp, IR::OrderedNodeWrapper Offset, 
IR::MemOffsetType OffsetType, uint8_t OffsetScale); // NOTE: Will use TMP1 as a way to encode immediates that happen to fall outside // the limits of the scalar plus immediate variant of SVE load/stores. // // TMP1 is safe to use again once this memory operand is used with its // equivalent loads or stores that this was called for. [[nodiscard]] ARMEmitter::SVEMemOperand GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale); [[nodiscard]] bool IsInlineConstant(const IR::OrderedNodeWrapper& Node, uint64_t* Value = nullptr) const; [[nodiscard]] bool IsInlineEntrypointOffset(const IR::OrderedNodeWrapper& WNode, uint64_t* Value) const; struct LiveRange { uint32_t Begin; uint32_t End; }; void EmitLinkedBranch(uint64_t GuestRIP, bool Call) { PendingJumpThunks.push_back({GetCursorAddress(), GuestRIP, {}}); auto& Thunk = PendingJumpThunks.back(); BindOrRestart(&Thunk.Label); if (Call) { bl_OrRestart(&Thunk.Label); } else { b_OrRestart(&Thunk.Label); } } // Restart helpers template void bl_OrRestart(T* Label) { if (bl(Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } // We can support this but currently unnecessary. ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void b_OrRestart(T* Label) { if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } // We can support this but currently unnecessary. 
ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void b_OrRestart(ARMEmitter::Condition Cond, T* Label) { if (RequiresFarARM64Jumps) { ARMEmitter::ForwardLabel Skip {}; // Wrap a manual Cond check around an unconditional branch; this can encode larger offsets (void)b(InvertCondition(Cond), &Skip); if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); } (void)Bind(&Skip); return; } if (b(Cond, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void cbz_OrRestart(ARMEmitter::Size s, ARMEmitter::Register rt, T* Label) { if (RequiresFarARM64Jumps) { ARMEmitter::ForwardLabel Skip {}; // Wrap a manual Cond check around an unconditional branch; this can encode larger offsets (void)cbnz(s, rt, &Skip); if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); } (void)Bind(&Skip); return; } if (cbz(s, rt, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void cbnz_OrRestart(ARMEmitter::Size s, ARMEmitter::Register rt, T* Label) { if (RequiresFarARM64Jumps) { ARMEmitter::ForwardLabel Skip {}; // Wrap a manual Cond check around an unconditional branch; this can encode larger offsets (void)cbz(s, rt, &Skip); if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); } (void)Bind(&Skip); return; } if (cbnz(s, rt, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } 
FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void tbz_OrRestart(ARMEmitter::Register rt, uint32_t Bit, T* Label) { if (RequiresFarARM64Jumps) { ARMEmitter::ForwardLabel Skip {}; // Wrap a manual Cond check around an unconditional branch; this can encode larger offsets (void)tbnz(rt, Bit, &Skip); if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); } (void)Bind(&Skip); return; } if (tbz(rt, Bit, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void tbnz_OrRestart(ARMEmitter::Register rt, uint32_t Bit, T* Label) { if (RequiresFarARM64Jumps) { ARMEmitter::ForwardLabel Skip {}; // Wrap a manual Cond check around an unconditional branch; this can encode larger offsets (void)tbz(rt, Bit, &Skip); if (b(Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Tried to branch larger than 128MB away!"); } (void)Bind(&Skip); return; } if (tbnz(rt, Bit, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void adr_OrRestart(ARMEmitter::Register rd, T* Label) { if (RequiresFarARM64Jumps) { if (LongAddressGen(rd, Label) == ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Unable to encode long ADR."); } return; } if (adr(rd, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void adrp_OrRestart(ARMEmitter::Register rd, T* Label) { if (RequiresFarARM64Jumps) { if (LongAddressGen(rd, Label) == 
ARMEmitter::BranchEncodeSucceeded::Failure) { ERROR_AND_DIE_FMT("Unable to encode long ADRP."); } return; } if (adrp(rd, Label) == ARMEmitter::BranchEncodeSucceeded::Success) { return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } template void BindOrRestart(T* Label) { if (Bind(Label)) { return; } if (RequiresFarARM64Jumps) { // This should have been caught before this point. ERROR_AND_DIE_FMT("Unhandled long bind"); return; } FEXCore::UncheckedLongJump::LongJump(ThreadState->RestartJump, FEXCore::ToUnderlying(RestartOptions::Control::EnableFarARM64Jumps)); } // This is purely a debugging aid for developers to see if they are in JIT code space when inspecting raw memory void EmitDetectionString(); IR::RegisterAllocationPass* RAPass {}; FEXCore::Core::DebugData* DebugData {}; void ResetStack(); /** * @name Relocations * @{ */ /** * @brief A literal pair relocation object for named symbol literals */ struct NamedSymbolLiteralPair { ARMEmitter::ForwardLabel Loc; uint64_t Lit; Relocation MoveABI {}; }; /** * @brief Inserts a thunk relocation * * @param Reg - The GPR to move the thunk handler in to * @param Sum - The hash of the thunk */ void InsertNamedThunkRelocation(ARMEmitter::Register Reg, const IR::SHA256Sum& Sum); /** * @brief Inserts a guest GPR move relocation * * @param Reg - The GPR to move the guest RIP in to * @param Constant - The guest RIP that will be relocated */ void InsertGuestRIPMove(ARMEmitter::Register Reg, uint64_t Constant); /** * @brief Inserts a named symbol as a literal in memory * * Need to use `PlaceNamedSymbolLiteral` with the return value to place the literal in the desired location * * @param Op The named symbol to place * * @return A temporary `NamedSymbolLiteralPair` */ NamedSymbolLiteralPair InsertNamedSymbolLiteral(FEXCore::CPU::RelocNamedSymbolLiteral::NamedSymbol Op); /** * @brief Inserts a relocation for a constant value relative to the 
guest entrypoint * * @param Reg - The GPR to move the guest RIP in to * @param Constant - The guest RIP that will be relocated */ NamedSymbolLiteralPair InsertGuestRIPLiteral(uint64_t GuestRIP); /** * @brief Place the named symbol literal relocation in memory * * @param Lit - Which literal to place */ void PlaceNamedSymbolLiteral(NamedSymbolLiteralPair Lit); fextl::vector Relocations; /** * Returns any relocations generated since the last call to TakeRelocations. * * GuestBaseAddress must match the base virtual address to which the * input x86 binary is mapped. */ fextl::vector TakeRelocations(uint64_t GuestBaseAddress) override; /** @} */ uint32_t SpillSlots {}; using OpType = void (Arm64JITCore::*)(const IR::IROp_Header* IROp, IR::Ref Node); using ScalarFMAOpCaller = std::function; void VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend); using ScalarBinaryOpCaller = std::function; void VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2); using ScalarUnaryOpCaller = std::function SrcVar)>; void VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, std::variant Vector2); void Emulate128BitGather(IR::OpSize Size, IR::OpSize ElementSize, ARMEmitter::VRegister Dst, ARMEmitter::VRegister IncomingDst, std::optional BaseAddr, ARMEmitter::VRegister VectorIndexLow, std::optional VectorIndexHigh, ARMEmitter::VRegister MaskReg, IR::OpSize VectorIndexSize, size_t DataElementOffsetStart, size_t IndexElementOffsetStart, uint8_t OffsetScale, IR::OpSize AddrSize); void EmitTFCheck(); void EmitSuspendInterruptCheck(); void 
EmitEntryPoint(ARMEmitter::BackwardLabel& HeaderLabel, bool CheckTF); #define DEF_OP(x) void Op_##x(IR::IROp_Header const* IROp, IR::Ref Node) ///< Unhandled handler DEF_OP(Unhandled); ///< No-op Handler DEF_OP(NoOp); #define IROP_DISPATCH_DEFS #include #undef DEF_OP }; #define DEF_OP(x) void Arm64JITCore::Op_##x(IR::IROp_Header const* IROp, IR::Ref Node) [[nodiscard]] fextl::unique_ptr CreateArm64JITCore(FEXCore::Context::ContextImpl* ctx, FEXCore::Core::InternalThreadState* Thread); } // namespace FEXCore::CPU ================================================ FILE: FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: backend|arm64 $end_info$ */ #include "FEXCore/Core/X86Enums.h" #include "FEXCore/Utils/LogManager.h" #include "Interface/Context/Context.h" #include "Interface/Core/ArchHelpers/Arm64Emitter.h" #include "Interface/Core/CPUID.h" #include "Interface/Core/JIT/JITClass.h" #include "Interface/IR/RegisterAllocationData.h" #include #include namespace FEXCore::CPU { DEF_OP(LoadContext) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; if (Op->Class == IR::RegClass::GPR) { auto Dst = GetReg(Node); switch (OpSize) { case IR::OpSize::i8Bit: ldrb(Dst, STATE, Op->Offset); break; case IR::OpSize::i16Bit: ldrh(Dst, STATE, Op->Offset); break; case IR::OpSize::i32Bit: ldr(Dst.W(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: ldr(Dst.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadContext size: {}", OpSize); break; } } else { auto Dst = GetVReg(Node); switch (OpSize) { case IR::OpSize::i8Bit: ldrb(Dst, STATE, Op->Offset); break; case IR::OpSize::i16Bit: ldrh(Dst, STATE, Op->Offset); break; case IR::OpSize::i32Bit: ldr(Dst.S(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: ldr(Dst.D(), STATE, Op->Offset); break; case IR::OpSize::i128Bit: ldr(Dst.Q(), STATE, Op->Offset); break; case IR::OpSize::i256Bit: mov(TMP1, Op->Offset); 
ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), STATE, TMP1); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadContext size: {}", OpSize); break; } } } DEF_OP(LoadContextPair) { const auto Op = IROp->C(); if (Op->Class == IR::RegClass::GPR) { const auto Dst1 = GetReg(Op->OutValue1); const auto Dst2 = GetReg(Op->OutValue2); switch (IROp->Size) { case IR::OpSize::i32Bit: ldp(Dst1.W(), Dst2.W(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: ldp(Dst1.X(), Dst2.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } else { const auto Dst1 = GetVReg(Op->OutValue1); const auto Dst2 = GetVReg(Op->OutValue2); switch (IROp->Size) { case IR::OpSize::i32Bit: ldp(Dst1.S(), Dst2.S(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: ldp(Dst1.D(), Dst2.D(), STATE, Op->Offset); break; case IR::OpSize::i128Bit: ldp(Dst1.Q(), Dst2.Q(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break; } } } DEF_OP(StoreContext) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; if (Op->Class == IR::RegClass::GPR) { auto Src = GetZeroableReg(Op->Value); switch (OpSize) { case IR::OpSize::i8Bit: strb(Src, STATE, Op->Offset); break; case IR::OpSize::i16Bit: strh(Src, STATE, Op->Offset); break; case IR::OpSize::i32Bit: str(Src.W(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: str(Src.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContext size: {}", OpSize); break; } } else { const auto Src = GetVReg(Op->Value); switch (OpSize) { case IR::OpSize::i8Bit: strb(Src, STATE, Op->Offset); break; case IR::OpSize::i16Bit: strh(Src, STATE, Op->Offset); break; case IR::OpSize::i32Bit: str(Src.S(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: str(Src.D(), STATE, Op->Offset); break; case IR::OpSize::i128Bit: str(Src.Q(), STATE, Op->Offset); break; case IR::OpSize::i256Bit: mov(TMP1, Op->Offset); st1b(Src.Z(), PRED_TMP_32B, STATE, TMP1); break; default: 
LOGMAN_MSG_A_FMT("Unhandled StoreContext size: {}", OpSize); break; } } } DEF_OP(StoreContextPair) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; if (Op->Class == IR::RegClass::GPR) { auto Src1 = GetZeroableReg(Op->Value1); auto Src2 = GetZeroableReg(Op->Value2); switch (OpSize) { case IR::OpSize::i32Bit: stp(Src1.W(), Src2.W(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: stp(Src1.X(), Src2.X(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContext size: {}", OpSize); break; } } else { const auto Src1 = GetVReg(Op->Value1); const auto Src2 = GetVReg(Op->Value2); switch (OpSize) { case IR::OpSize::i32Bit: stp(Src1.S(), Src2.S(), STATE, Op->Offset); break; case IR::OpSize::i64Bit: stp(Src1.D(), Src2.D(), STATE, Op->Offset); break; case IR::OpSize::i128Bit: stp(Src1.Q(), Src2.Q(), STATE, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreContextPair size: {}", OpSize); break; } } } DEF_OP(LoadRegister) { const auto Op = IROp->C(); if (Op->Class == IR::RegClass::GPR) { LOGMAN_THROW_A_FMT(Op->Reg < StaticRegisters.size(), "out of range reg"); mov(GetReg(Node).X(), StaticRegisters[Op->Reg].X()); } else if (Op->Class == IR::RegClass::FPR) { const auto regSize = HostSupportsAVX256 ? 
IR::OpSize::i256Bit : IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(Op->Reg < StaticFPRegisters.size(), "out of range reg"); LOGMAN_THROW_A_FMT(IROp->Size == regSize, "expected sized"); const auto guest = StaticFPRegisters[Op->Reg]; const auto host = GetVReg(Node); if (HostSupportsAVX256) { mov(ARMEmitter::SubRegSize::i64Bit, host.Z(), PRED_TMP_32B.Merging(), guest.Z()); } else { mov(host.Q(), guest.Q()); } } else { LOGMAN_THROW_A_FMT(false, "Unhandled Op->Class {}", Op->Class); } } DEF_OP(LoadPF) { const auto reg = StaticRegisters[StaticRegisters.size() - 2]; if (GetReg(Node).Idx() != reg.Idx()) { mov(GetReg(Node).X(), reg.X()); } } DEF_OP(LoadAF) { const auto reg = StaticRegisters[StaticRegisters.size() - 1]; if (GetReg(Node).Idx() != reg.Idx()) { mov(GetReg(Node).X(), reg.X()); } } DEF_OP(StoreRegister) { const auto Op = IROp->C(); const auto Reg = IR::PhysicalRegister(Node); const auto RegClass = Reg.AsRegClass(); if (RegClass == IR::RegClass::GPRFixed) { // Always use 64-bit, it's faster. Upper bits ignored for 32-bit mode. mov(ARMEmitter::Size::i64Bit, GetReg(Reg), GetReg(Op->Value)); } else if (RegClass == IR::RegClass::FPRFixed) { const auto regSize = HostSupportsAVX256 ? IR::OpSize::i256Bit : IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(IROp->Size == regSize, "expected sized"); const auto guest = GetVReg(Reg); const auto host = GetVReg(Op->Value); if (HostSupportsAVX256) { mov(ARMEmitter::SubRegSize::i64Bit, guest.Z(), PRED_TMP_32B.Merging(), host.Z()); } else { mov(guest.Q(), host.Q()); } } else { LOGMAN_THROW_A_FMT(false, "Unhandled Op->Class {}", RegClass); } } DEF_OP(StorePF) { const auto Op = IROp->C(); const auto reg = StaticRegisters[StaticRegisters.size() - 2]; const auto Src = GetReg(Op->Value); if (Src.Idx() != reg.Idx()) { // Always use 64-bit, it's faster. Upper bits ignored for 32-bit mode. 
mov(ARMEmitter::Size::i64Bit, reg, Src);
  }
}

// Copies Op->Value into the last entry of StaticRegisters.
// Nothing is emitted when the source already lives in that register.
DEF_OP(StoreAF) {
  const auto Op = IROp->C();
  const auto reg = StaticRegisters[StaticRegisters.size() - 1];
  const auto Src = GetReg(Op->Value);

  if (Src.Idx() != reg.Idx()) {
    // Always use 64-bit, it's faster. Upper bits ignored for 32-bit mode.
    mov(ARMEmitter::Size::i64Bit, reg, Src);
  }
}

// Loads IROp->Size bytes from the CPU state at
//   STATE + (Index << log2(Op->Stride)) + Op->BaseOffset
// into this node's register. TMP1 holds the scaled base; TMP2 is additionally
// used for the 256-bit (SVE) path.
// GPR loads accept strides 1/2/4/8, vector loads accept strides 1..32 (powers of two).
DEF_OP(LoadContextIndexed) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Index = GetReg(Op->Index);

  if (Op->Class == IR::RegClass::GPR) {
    switch (Op->Stride) {
    case 1:
    case 2:
    case 4:
    case 8: {
      add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride));
      const auto Dst = GetReg(Node);
      switch (OpSize) {
      case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i32Bit: ldr(Dst.W(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i64Bit: ldr(Dst.X(), TMP1, Op->BaseOffset); break;
      default: LOGMAN_MSG_A_FMT("Unhandled LoadContextIndexed size: {}", OpSize); break;
      }
      break;
    }
    case 16: LOGMAN_MSG_A_FMT("Invalid Class load of size 16"); break;
    default: LOGMAN_MSG_A_FMT("Unhandled LoadContextIndexed stride: {}", Op->Stride); break;
    }
  } else {
    switch (Op->Stride) {
    case 1:
    case 2:
    case 4:
    case 8:
    case 16:
    case 32: {
      add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride));
      const auto Dst = GetVReg(Node);

      switch (OpSize) {
      case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i32Bit: ldr(Dst.S(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i64Bit: ldr(Dst.D(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i128Bit:
        if (Op->BaseOffset % 16 == 0) {
          // Offset is a multiple of the access size, so it can be passed as the `ldr` immediate.
          ldr(Dst.Q(), TMP1, Op->BaseOffset);
        } else {
          // Fold the unaligned offset into the base register first. The load itself must then
          // use a zero offset; passing BaseOffset again would apply it twice.
          add(ARMEmitter::Size::i64Bit, TMP1, TMP1, Op->BaseOffset);
          ldur(Dst.Q(), TMP1, 0);
        }
        break;
      case IR::OpSize::i256Bit:
        mov(TMP2, Op->BaseOffset);
        ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, TMP2);
        break;
      default: LOGMAN_MSG_A_FMT("Unhandled LoadContextIndexed size: {}", OpSize); break;
      }
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled LoadContextIndexed stride: {}", Op->Stride); break;
    }
  }
}

// Stores IROp->Size bytes of Op->Value to the CPU state at
//   STATE + (Index << log2(Op->Stride)) + Op->BaseOffset.
// TMP1 holds the scaled base; TMP2 is additionally used for the 256-bit (SVE) path.
// GPR stores accept strides 1/2/4/8, vector stores accept strides 1..32 (powers of two).
DEF_OP(StoreContextIndexed) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Index = GetReg(Op->Index);

  if (Op->Class == IR::RegClass::GPR) {
    const auto Value = GetReg(Op->Value);

    switch (Op->Stride) {
    case 1:
    case 2:
    case 4:
    case 8: {
      add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride));

      switch (OpSize) {
      case IR::OpSize::i8Bit: strb(Value, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i16Bit: strh(Value, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i32Bit: str(Value.W(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i64Bit: str(Value.X(), TMP1, Op->BaseOffset); break;
      default: LOGMAN_MSG_A_FMT("Unhandled StoreContextIndexed size: {}", OpSize); break;
      }
      break;
    }
    case 16: LOGMAN_MSG_A_FMT("Invalid Class store of size 16"); break;
    default: LOGMAN_MSG_A_FMT("Unhandled StoreContextIndexed stride: {}", Op->Stride); break;
    }
  } else {
    const auto Value = GetVReg(Op->Value);

    switch (Op->Stride) {
    case 1:
    case 2:
    case 4:
    case 8:
    case 16:
    case 32: {
      add(ARMEmitter::Size::i64Bit, TMP1, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride));

      switch (OpSize) {
      case IR::OpSize::i8Bit: strb(Value, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i16Bit: strh(Value, TMP1, Op->BaseOffset); break;
      case IR::OpSize::i32Bit: str(Value.S(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i64Bit: str(Value.D(), TMP1, Op->BaseOffset); break;
      case IR::OpSize::i128Bit:
        if (Op->BaseOffset % 16 == 0) {
          // Offset is a multiple of the access size, so it can be passed as the `str` immediate.
          str(Value.Q(), TMP1, Op->BaseOffset);
        } else {
          // Fold the unaligned offset into the base register first. The store itself must then
          // use a zero offset; passing BaseOffset again would apply it twice.
          add(ARMEmitter::Size::i64Bit, TMP1, TMP1, Op->BaseOffset);
          stur(Value.Q(), TMP1, 0);
        }
        break;
      case IR::OpSize::i256Bit:
        mov(TMP2, Op->BaseOffset);
        st1b(Value.Z(), PRED_TMP_32B, TMP1, TMP2);
        break;
default: LOGMAN_MSG_A_FMT("Unhandled StoreContextIndexed size: {}", OpSize); break;
      }
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled StoreContextIndexed stride: {}", Op->Stride); break;
    }
  }
}

// Computes STATE + (Index << log2(Stride)) into the node's register.
// Only strides 1, 2, 4, 8, 16 and 32 are accepted; anything else asserts.
DEF_OP(FormContextAddress) {
  const auto Op = IROp->C();
  const auto Index = GetReg(Op->Index);
  const auto Dst = GetReg(Node);

  switch (Op->Stride) {
  case 1:
  case 2:
  case 4:
  case 8:
  case 16:
  case 32: {
    add(ARMEmitter::Size::i64Bit, Dst, STATE, Index, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Op->Stride));
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unhandled FormContextAddress stride: {}", Op->Stride); break;
  }
}

// Stores Op->Value to its spill slot at rsp + Slot * MaxSpillSlotSize.
// When the slot offset is larger than the per-size LS*MaxUnsignedOffset
// constant, the offset is materialized in TMP1 and a register-offset store
// is used instead of the immediate form.
DEF_OP(SpillRegister) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const uint32_t SlotOffset = Op->Slot * MaxSpillSlotSize;

  if (Op->Class == IR::RegClass::GPR) {
    const auto Src = GetReg(Op->Value);
    switch (OpSize) {
    case IR::OpSize::i8Bit: {
      if (SlotOffset > LSByteMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        strb(Src, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        strb(Src, ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i16Bit: {
      if (SlotOffset > LSHalfMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        strh(Src, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        strh(Src, ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i32Bit: {
      if (SlotOffset > LSWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        str(Src.W(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        str(Src.W(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i64Bit: {
      if (SlotOffset > LSDWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        str(Src.X(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        str(Src.X(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    default:
      LOGMAN_MSG_A_FMT("Unhandled SpillRegister size: {}",
OpSize);
      break;
    }
  } else if (Op->Class == FEXCore::IR::RegClass::FPR) {
    const auto Src = GetVReg(Op->Value);

    switch (OpSize) {
    case IR::OpSize::i32Bit: {
      if (SlotOffset > LSWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        str(Src.S(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        str(Src.S(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i64Bit: {
      if (SlotOffset > LSDWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        str(Src.D(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        str(Src.D(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i128Bit: {
      if (SlotOffset > LSQWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        str(Src.Q(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        str(Src.Q(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i256Bit: {
      // The 256-bit store always takes its offset from a register (TMP3).
      mov(TMP3, SlotOffset);
      st1b(Src.Z(), PRED_TMP_32B, ARMEmitter::Reg::rsp, TMP3);
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled SpillRegister size: {}", OpSize); break;
    }
  } else {
    LOGMAN_MSG_A_FMT("Unhandled SpillRegister class: {}", Op->Class);
  }
}

// Loads the node's register from its spill slot at rsp + Slot * MaxSpillSlotSize.
// When the slot offset is larger than the per-size LS*MaxUnsignedOffset
// constant, the offset is materialized in TMP1 and a register-offset load
// is used instead of the immediate form.
DEF_OP(FillRegister) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const uint32_t SlotOffset = Op->Slot * MaxSpillSlotSize;

  if (Op->Class == IR::RegClass::GPR) {
    const auto Dst = GetReg(Node);
    switch (OpSize) {
    case IR::OpSize::i8Bit: {
      if (SlotOffset > LSByteMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldrb(Dst, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldrb(Dst, ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i16Bit: {
      if (SlotOffset > LSHalfMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldrh(Dst, ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldrh(Dst, ARMEmitter::Reg::rsp,
SlotOffset);
      }
      break;
    }
    case IR::OpSize::i32Bit: {
      if (SlotOffset > LSWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldr(Dst.W(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldr(Dst.W(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i64Bit: {
      if (SlotOffset > LSDWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldr(Dst.X(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldr(Dst.X(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled FillRegister size: {}", OpSize); break;
    }
  } else if (Op->Class == FEXCore::IR::RegClass::FPR) {
    const auto Dst = GetVReg(Node);

    switch (OpSize) {
    case IR::OpSize::i32Bit: {
      if (SlotOffset > LSWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldr(Dst.S(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldr(Dst.S(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i64Bit: {
      if (SlotOffset > LSDWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldr(Dst.D(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldr(Dst.D(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i128Bit: {
      if (SlotOffset > LSQWordMaxUnsignedOffset) {
        LoadConstant(ARMEmitter::Size::i64Bit, TMP1, SlotOffset);
        ldr(Dst.Q(), ARMEmitter::Reg::rsp, TMP1.R(), ARMEmitter::ExtendedType::LSL_64, 0);
      } else {
        ldr(Dst.Q(), ARMEmitter::Reg::rsp, SlotOffset);
      }
      break;
    }
    case IR::OpSize::i256Bit: {
      // The 256-bit load always takes its offset from a register (TMP3).
      mov(TMP3, SlotOffset);
      ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), ARMEmitter::Reg::rsp, TMP3);
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled FillRegister size: {}", OpSize); break;
    }
  } else {
    LOGMAN_MSG_A_FMT("Unhandled FillRegister class: {}", Op->Class);
  }
}

// Reads the host NZCV system register into the node's register.
DEF_OP(LoadNZCV) {
  auto Dst = GetReg(Node);
  mrs(Dst, ARMEmitter::SystemRegister::NZCV);
}
DEF_OP(StoreNZCV) { auto Op = IROp->C(); msr(ARMEmitter::SystemRegister::NZCV, GetReg(Op->Value)); } DEF_OP(LoadDF) { auto Dst = GetReg(Node); auto Flag = X86State::RFLAG_DF_RAW_LOC; // DF needs sign extension to turn 0x1/0xFF into 1/-1 ldrsb(Dst.X(), STATE, ARRAY_OFFSETOF(FEXCore::Core::CPUState, flags, Flag)); } DEF_OP(ContextClear) { auto Op = IROp->C(); if (CTX->HostFeatures.SupportsCLZERO) { // We can use CLZero directly when hardware supports it. // Provides a fairly generous speed-up on Ampere1A hardware. // TODO: When FEAT_MOPS hardware ships, test memset using MOPS. for (size_t i = 0; i < Op->Size; i += 64) { add(ARMEmitter::Size::i64Bit, TMP1, STATE.R(), Op->Offset + i); dc(ARMEmitter::DataCacheOperation::ZVA, TMP1); } } else { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); for (size_t i = 0; i < Op->Size; i += 32) { stp(VTMP1.Q(), VTMP1.Q(), STATE.R(), Op->Offset + i); } } } ARMEmitter::ExtendedMemOperand Arm64JITCore::GenerateMemOperand( IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale) { if (Offset.IsInvalid()) { return ARMEmitter::ExtendedMemOperand(Base.X(), ARMEmitter::IndexType::OFFSET, 0); } else { if (OffsetScale != 1 && OffsetScale != IR::OpSizeToSize(AccessSize)) { LOGMAN_MSG_A_FMT("Unhandled GenerateMemOperand OffsetScale: {}", OffsetScale); } uint64_t Const; if (IsInlineConstant(Offset, &Const)) { return ARMEmitter::ExtendedMemOperand(Base.X(), ARMEmitter::IndexType::OFFSET, Const); } else { auto RegOffset = GetReg(Offset); switch (OffsetType) { case IR::MemOffsetType::SXTX: return ARMEmitter::ExtendedMemOperand(Base.X(), RegOffset.X(), ARMEmitter::ExtendedType::SXTX, FEXCore::ilog2(OffsetScale)); case IR::MemOffsetType::UXTW: return ARMEmitter::ExtendedMemOperand(Base.X(), RegOffset.X(), ARMEmitter::ExtendedType::UXTW, FEXCore::ilog2(OffsetScale)); case IR::MemOffsetType::SXTW: return ARMEmitter::ExtendedMemOperand(Base.X(), RegOffset.X(), 
ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
      default: LOGMAN_MSG_A_FMT("Unhandled GenerateMemOperand OffsetType: {}", OffsetType); break;
      }
    }
  }

  FEX_UNREACHABLE;
}

// Emits the address computation Base + Offset and returns the register that
// holds the result. Returns Base itself (no code emitted) when Offset is
// invalid or an inline constant of zero; otherwise the sum is written to Tmp
// and Tmp is returned. OffsetScale must be 1 or the access size in bytes,
// otherwise this asserts.
ARMEmitter::Register Arm64JITCore::ApplyMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, ARMEmitter::Register Tmp,
                                                   IR::OrderedNodeWrapper Offset, IR::MemOffsetType OffsetType, uint8_t OffsetScale) {
  if (Offset.IsInvalid()) {
    return Base;
  }

  if (OffsetScale != 1 && OffsetScale != IR::OpSizeToSize(AccessSize)) {
    LOGMAN_MSG_A_FMT("Unhandled OffsetScale: {}", OffsetScale);
  }

  uint64_t Const;
  if (IsInlineConstant(Offset, &Const)) {
    if (Const == 0) {
      return Base;
    }
    // The constant is loaded unscaled and shifted by log2(OffsetScale) in the add.
    LoadConstant(ARMEmitter::Size::i64Bit, Tmp, Const);
    add(ARMEmitter::Size::i64Bit, Tmp, Base, Tmp, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(OffsetScale));
  } else {
    auto RegOffset = GetReg(Offset);
    switch (OffsetType) {
    case IR::MemOffsetType::SXTX:
      add(ARMEmitter::Size::i64Bit, Tmp, Base, RegOffset, ARMEmitter::ExtendedType::SXTX, FEXCore::ilog2(OffsetScale));
      break;
    case IR::MemOffsetType::UXTW:
      add(ARMEmitter::Size::i64Bit, Tmp, Base, RegOffset, ARMEmitter::ExtendedType::UXTW, FEXCore::ilog2(OffsetScale));
      break;
    case IR::MemOffsetType::SXTW:
      add(ARMEmitter::Size::i64Bit, Tmp, Base, RegOffset, ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
      break;
    default: LOGMAN_MSG_A_FMT("Unhandled OffsetType: {}", OffsetType); break;
    }
  }

  return Tmp;
}

// Builds the addressing operand for an SVE contiguous load or store.
// An invalid Offset or an inline constant of zero yields [Base, #0].
ARMEmitter::SVEMemOperand Arm64JITCore::GenerateSVEMemOperand(IR::OpSize AccessSize, ARMEmitter::Register Base, IR::OrderedNodeWrapper Offset,
                                                              IR::MemOffsetType OffsetType, [[maybe_unused]] uint8_t OffsetScale) {
  if (Offset.IsInvalid()) {
    return ARMEmitter::SVEMemOperand(Base.X(), 0);
  }

  uint64_t Const {};
  if (IsInlineConstant(Offset, &Const)) {
    if (Const == 0) {
      return ARMEmitter::SVEMemOperand(Base.X(), 0);
    }

    const auto SignedConst = static_cast(Const);
    const auto SignedSVESize = static_cast(HostSupportsSVE256 ?
Core::CPUState::XMM_AVX_REG_SIZE : Core::CPUState::XMM_SSE_REG_SIZE); const auto IsCleanlyDivisible = (SignedConst % SignedSVESize) == 0; const auto Index = SignedConst / SignedSVESize; // SVE's immediate variants of load stores are quite limited in terms // of immediate range. They also operate on a by-vector-length basis. // // e.g. On a 256-bit SVE capable system: // // LD1B Dst.B, Predicate/Z, [Reg, #1, MUL VL] // // Will add 32 to the base register as the offset // // So if we have a constant that cleanly lies along a 256-bit offset // and is also within the limitations of the immediate of -8 to 7 // then we can encode it as an immediate offset. // if (IsCleanlyDivisible && Index >= -8 && Index <= 7) { return ARMEmitter::SVEMemOperand(Base.X(), static_cast(Index)); } // If we can't do that for whatever reason, then unfortunately, we need // to move it over to a temporary to use as an offset. mov(TMP1, Const); return ARMEmitter::SVEMemOperand(Base.X(), TMP1); } // Otherwise handle it like normal. // Note that we do nothing with the offset type and offset scale, // since SVE loads and stores don't have the ability to perform an // optional extension or shift as part of their behavior. 
LOGMAN_THROW_A_FMT(OffsetType == IR::MemOffsetType::SXTX, "Currently only the default offset type (SXTX) is supported.");

  const auto RegOffset = GetReg(Offset);
  return ARMEmitter::SVEMemOperand(Base.X(), RegOffset.X());
}

// Emits a plain (non-TSO) load of IROp->Size bytes from Op->Addr plus the
// optional offset into a GPR or vector register, selected by Op->Class.
// 256-bit loads require SVE256 and use an SVE memory operand instead.
DEF_OP(LoadMem) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto MemReg = GetReg(Op->Addr);
  const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);

  if (Op->Class == IR::RegClass::GPR) {
    const auto Dst = GetReg(Node);

    switch (OpSize) {
    case IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break;
    case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break;
    case IR::OpSize::i32Bit: ldr(Dst.W(), MemSrc); break;
    case IR::OpSize::i64Bit: ldr(Dst.X(), MemSrc); break;
    default: LOGMAN_MSG_A_FMT("Unhandled LoadMem size: {}", OpSize); break;
    }
  } else {
    const auto Dst = GetVReg(Node);

    switch (OpSize) {
    case IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break;
    case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break;
    case IR::OpSize::i32Bit: ldr(Dst.S(), MemSrc); break;
    case IR::OpSize::i64Bit: ldr(Dst.D(), MemSrc); break;
    case IR::OpSize::i128Bit: ldr(Dst.Q(), MemSrc); break;
    case IR::OpSize::i256Bit: {
      LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
      // The SVE load needs its own operand form; MemSrc above is not used here.
      const auto Operand = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
      ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), Operand);
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled LoadMem size: {}", OpSize); break;
    }
  }
}

// Emits a load-pair from Op->Addr + Op->Offset into the two output
// registers (OutValue1, OutValue2), GPR or vector depending on Op->Class.
DEF_OP(LoadMemPair) {
  const auto Op = IROp->C();
  const auto Addr = GetReg(Op->Addr);

  if (Op->Class == IR::RegClass::GPR) {
    const auto Dst1 = GetReg(Op->OutValue1);
    const auto Dst2 = GetReg(Op->OutValue2);

    switch (IROp->Size) {
    case IR::OpSize::i32Bit: ldp(Dst1.W(), Dst2.W(), Addr, Op->Offset); break;
    case IR::OpSize::i64Bit: ldp(Dst1.X(), Dst2.X(), Addr, Op->Offset); break;
    default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break;
    }
  } else {
    const auto Dst1 =
GetVReg(Op->OutValue1);
    const auto Dst2 = GetVReg(Op->OutValue2);

    switch (IROp->Size) {
    case IR::OpSize::i32Bit: ldp(Dst1.S(), Dst2.S(), Addr, Op->Offset); break;
    case IR::OpSize::i64Bit: ldp(Dst1.D(), Dst2.D(), Addr, Op->Offset); break;
    case IR::OpSize::i128Bit: ldp(Dst1.Q(), Dst2.Q(), Addr, Op->Offset); break;
    default: LOGMAN_MSG_A_FMT("Unhandled LoadMemPair size: {}", IROp->Size); break;
    }
  }
}

// Emits a load with TSO (acquire-style) ordering.
// GPR loads pick the strongest available instruction family in this order:
// LDAPUR* (SupportsTSOImm9, may carry an immediate offset), LDAPR*
// (SupportsRCPC), then LDAR*. Every non-byte GPR load is followed by a NOP
// that is reserved for a half-barrier once the access is back-patched.
// Vector loads use plain loads plus an optional DMB ISHLD.
DEF_OP(LoadMemTSO) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto MemReg = GetReg(Op->Addr);

  if (Op->Class == IR::RegClass::GPR) {
    LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid() || CTX->HostFeatures.SupportsTSOImm9, "unexpected offset");
    LOGMAN_THROW_A_FMT(Op->OffsetScale == 1, "unexpected offset scale");
    LOGMAN_THROW_A_FMT(Op->OffsetType == IR::MemOffsetType::SXTX, "unexpected offset type");
  }

  if (CTX->HostFeatures.SupportsTSOImm9 && Op->Class == IR::RegClass::GPR) {
    const auto Dst = GetReg(Node);
    uint64_t Offset = 0;
    if (!Op->Offset.IsInvalid()) {
      bool IsInline = IsInlineConstant(Op->Offset, &Offset);
      LOGMAN_THROW_A_FMT(IsInline, "expected immediate");
    }

    if (OpSize == IR::OpSize::i8Bit) {
      // 8bit load is always aligned to natural alignment
      // (Dst from the enclosing scope is reused; it is the same GetReg(Node).)
      ldapurb(Dst, MemReg, Offset);
    } else {
      switch (OpSize) {
      case IR::OpSize::i16Bit: ldapurh(Dst, MemReg, Offset); break;
      case IR::OpSize::i32Bit: ldapur(Dst.W(), MemReg, Offset); break;
      case IR::OpSize::i64Bit: ldapur(Dst.X(), MemReg, Offset); break;
      default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break;
      }
      // Half-barrier once back-patched.
nop();
    }
  } else if (CTX->HostFeatures.SupportsRCPC && Op->Class == IR::RegClass::GPR) {
    const auto Dst = GetReg(Node);
    if (OpSize == IR::OpSize::i8Bit) {
      // 8bit load is always aligned to natural alignment
      ldaprb(Dst.W(), MemReg);
    } else {
      switch (OpSize) {
      case IR::OpSize::i16Bit: ldaprh(Dst.W(), MemReg); break;
      case IR::OpSize::i32Bit: ldapr(Dst.W(), MemReg); break;
      case IR::OpSize::i64Bit: ldapr(Dst.X(), MemReg); break;
      default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break;
      }
      // Half-barrier once back-patched.
      nop();
    }
  } else if (Op->Class == IR::RegClass::GPR) {
    const auto Dst = GetReg(Node);
    if (OpSize == IR::OpSize::i8Bit) {
      // 8bit load is always aligned to natural alignment
      ldarb(Dst, MemReg);
    } else {
      switch (OpSize) {
      case IR::OpSize::i16Bit: ldarh(Dst, MemReg); break;
      case IR::OpSize::i32Bit: ldar(Dst.W(), MemReg); break;
      case IR::OpSize::i64Bit: ldar(Dst.X(), MemReg); break;
      default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break;
      }
      // Half-barrier once back-patched.
      nop();
    }
  } else {
    // Vector loads: plain load, ordering is provided by the barrier below.
    const auto Dst = GetVReg(Node);
    const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
    switch (OpSize) {
    case IR::OpSize::i8Bit: ldrb(Dst, MemSrc); break;
    case IR::OpSize::i16Bit: ldrh(Dst, MemSrc); break;
    case IR::OpSize::i32Bit: ldr(Dst.S(), MemSrc); break;
    case IR::OpSize::i64Bit: ldr(Dst.D(), MemSrc); break;
    case IR::OpSize::i128Bit: ldr(Dst.Q(), MemSrc); break;
    case IR::OpSize::i256Bit: {
      LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
      // Shadows the outer MemSrc on purpose: the SVE load needs an SVE operand.
      const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);
      ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), MemSrc);
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unhandled LoadMemTSO size: {}", OpSize); break;
    }
    if (CTX->IsVectorAtomicTSOEnabled()) {
      // Half-barrier.
dmb(ARMEmitter::BarrierScope::ISHLD);
    }
  }
}

// Masked vector load: an element is loaded only when the sign bit of the
// matching element in Op->Mask is set.
// With SVE the mask becomes a predicate (p0) and a single predicated load is
// emitted; without SVE the load is emulated element by element (128-bit only).
DEF_OP(VLoadVectorMasked) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize8(IROp);

  const auto CMPPredicate = ARMEmitter::PReg::p0;
  const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;

  const auto Dst = GetVReg(Node);
  const auto MaskReg = GetVReg(Op->Mask);
  const auto MemReg = GetReg(Op->Addr);

  if (HostSupportsSVE128 || HostSupportsSVE256) {
    const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);

    // Check if the sign bit is set for the given element size.
    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);

    switch (IROp->ElementSize) {
    case IR::OpSize::i8Bit: {
      ld1b(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
      break;
    }
    case IR::OpSize::i16Bit: {
      ld1h(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
      break;
    }
    case IR::OpSize::i32Bit: {
      ld1w(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
      break;
    }
    case IR::OpSize::i64Bit: {
      ld1d(Dst.Z(), CMPPredicate.Zeroing(), MemSrc);
      break;
    }
    default: break;
    }
  } else {
    // Helper: moves one vector element into a general-purpose register.
    const auto PerformMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
      switch (ElementSize) {
      case IR::OpSize::i8Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i16Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i32Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i64Bit: umov(Dst, Vector, index); break;
      default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
      }
    };

    // Prepare yourself adventurer. For a masked load without instructions that implement it.
LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Only supports 128-bit without SVE256"); size_t NumElements = IR::NumElements(IROp->Size, IROp->ElementSize); // Use VTMP1 as the temporary destination auto TempDst = VTMP1; auto WorkingReg = TMP1; auto TempMemReg = MemReg; movi(ARMEmitter::SubRegSize::i64Bit, TempDst.Q(), 0); uint64_t Const {}; if (Op->Offset.IsInvalid()) { // Intentional no-op. } else if (IsInlineConstant(Op->Offset, &Const)) { TempMemReg = TMP2; add(ARMEmitter::Size::i64Bit, TMP2, MemReg, Const); } else { LOGMAN_MSG_A_FMT("Complex addressing requested and not supported!"); } const uint64_t ElementSizeInBits = IR::OpSizeAsBits(IROp->ElementSize); for (size_t i = 0; i < NumElements; ++i) { // Extract the mask element. PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i); // If the sign bit is zero then skip the load ARMEmitter::ForwardLabel Skip {}; (void)tbz(WorkingReg, ElementSizeInBits - 1, &Skip); // Do the gather load for this element into the destination switch (IROp->ElementSize) { case IR::OpSize::i8Bit: ld1(TempDst.Q(), i, TempMemReg); break; case IR::OpSize::i16Bit: ld1(TempDst.Q(), i, TempMemReg); break; case IR::OpSize::i32Bit: ld1(TempDst.Q(), i, TempMemReg); break; case IR::OpSize::i64Bit: ld1(TempDst.Q(), i, TempMemReg); break; case IR::OpSize::i128Bit: ldr(TempDst.Q(), TempMemReg, 0); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return; } (void)Bind(&Skip); if ((i + 1) != NumElements) { // Handle register rename to save a move. auto WorkingReg = TempMemReg; TempMemReg = TMP2; add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IR::OpSizeToSize(IROp->ElementSize)); } } // Move result. 
mov(Dst.Q(), TempDst.Q());
  }
}

// Masked vector store: an element of Op->Data is written only when the sign
// bit of the matching element in Op->Mask is set.
// With SVE the mask becomes a predicate (p0) and a single predicated store is
// emitted; without SVE the store is emulated element by element (128-bit only).
DEF_OP(VStoreVectorMasked) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize8(IROp);

  const auto CMPPredicate = ARMEmitter::PReg::p0;
  const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;

  const auto RegData = GetVReg(Op->Data);
  const auto MaskReg = GetVReg(Op->Mask);
  const auto MemReg = GetReg(Op->Addr);
  if (HostSupportsSVE128 || HostSupportsSVE256) {
    const auto MemDst = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);

    // Check if the sign bit is set for the given element size.
    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);

    switch (IROp->ElementSize) {
    case IR::OpSize::i8Bit: {
      st1b(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i16Bit: {
      st1h(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i32Bit: {
      st1w(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i64Bit: {
      st1d(RegData.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    default: break;
    }
  } else {
    // Helper: moves one vector element into a general-purpose register.
    const auto PerformMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) {
      switch (ElementSize) {
      case IR::OpSize::i8Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i16Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i32Bit: umov(Dst, Vector, index); break;
      case IR::OpSize::i64Bit: umov(Dst, Vector, index); break;
      default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break;
      }
    };

    // Prepare yourself adventurer. For a masked store without instructions that implement it.
LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "Only supports 128-bit without SVE256"); size_t NumElements = IR::NumElements(IROp->Size, IROp->ElementSize); // Use VTMP1 as the temporary destination auto WorkingReg = TMP1; auto TempMemReg = MemReg; uint64_t Const {}; if (Op->Offset.IsInvalid()) { // Intentional no-op. } else if (IsInlineConstant(Op->Offset, &Const)) { TempMemReg = TMP2; add(ARMEmitter::Size::i64Bit, TMP2, MemReg, Const); } else { LOGMAN_MSG_A_FMT("Complex addressing requested and not supported!"); } const uint64_t ElementSizeInBits = IR::OpSizeAsBits(IROp->ElementSize); for (size_t i = 0; i < NumElements; ++i) { // Extract the mask element. PerformMove(IROp->ElementSize, WorkingReg, MaskReg, i); // If the sign bit is zero then skip the load ARMEmitter::ForwardLabel Skip {}; (void)tbz(WorkingReg, ElementSizeInBits - 1, &Skip); // Do the gather load for this element into the destination switch (IROp->ElementSize) { case IR::OpSize::i8Bit: st1(RegData.Q(), i, TempMemReg); break; case IR::OpSize::i16Bit: st1(RegData.Q(), i, TempMemReg); break; case IR::OpSize::i32Bit: st1(RegData.Q(), i, TempMemReg); break; case IR::OpSize::i64Bit: st1(RegData.Q(), i, TempMemReg); break; case IR::OpSize::i128Bit: str(RegData.Q(), TempMemReg, 0); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, IROp->ElementSize); return; } (void)Bind(&Skip); if ((i + 1) != NumElements) { // Handle register rename to save a move. 
auto WorkingReg = TempMemReg; TempMemReg = TMP2; add(ARMEmitter::Size::i64Bit, TempMemReg, WorkingReg, IR::OpSizeToSize(IROp->ElementSize)); } } } } void Arm64JITCore::Emulate128BitGather(IR::OpSize Size, IR::OpSize ElementSize, ARMEmitter::VRegister Dst, ARMEmitter::VRegister IncomingDst, std::optional BaseAddr, ARMEmitter::VRegister VectorIndexLow, std::optional VectorIndexHigh, ARMEmitter::VRegister MaskReg, IR::OpSize VectorIndexSize, size_t DataElementOffsetStart, size_t IndexElementOffsetStart, uint8_t OffsetScale, IR::OpSize AddrSize) { LOGMAN_THROW_A_FMT(ElementSize >= IR::OpSize::i8Bit && ElementSize <= IR::OpSize::i64Bit, "Invalid element size"); const auto PerformSMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { switch (ElementSize) { case IR::OpSize::i8Bit: smov(Dst.X(), Vector, index); break; case IR::OpSize::i16Bit: smov(Dst.X(), Vector, index); break; case IR::OpSize::i32Bit: smov(Dst.X(), Vector, index); break; case IR::OpSize::i64Bit: umov(Dst.X(), Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break; } }; const auto PerformMove = [this](IR::OpSize ElementSize, const ARMEmitter::Register Dst, const ARMEmitter::VRegister Vector, int index) { switch (ElementSize) { case IR::OpSize::i8Bit: umov(Dst, Vector, index); break; case IR::OpSize::i16Bit: umov(Dst, Vector, index); break; case IR::OpSize::i32Bit: umov(Dst, Vector, index); break; case IR::OpSize::i64Bit: umov(Dst, Vector, index); break; default: LOGMAN_MSG_A_FMT("Unhandled ExtractElementSize: {}", ElementSize); break; } }; // FEX needs to use a temporary destination vector register in a couple of instances. // When Dst overlaps MaskReg, VectorIndexLow, or VectorIndexHigh // Due to x86 gather instruction limitations, it is highly likely that a destination temporary isn't required. 
const bool NeedsDestTmp = Dst == MaskReg || Dst == VectorIndexLow || (VectorIndexHigh.has_value() && Dst == *VectorIndexHigh); // If the incoming destination isn't the destination then we need to move. const bool NeedsIncomingDestMove = Dst != IncomingDst || NeedsDestTmp; ///< Adventurers beware, emulated ASIMD style gather masked load operation. // Number of elements to load is calculated by the number of index elements available. size_t NumAddrElements = (VectorIndexHigh.has_value() ? 32 : 16) / IR::OpSizeToSize(VectorIndexSize); // The number of elements is clamped by the resulting register size. size_t NumDataElements = std::min(IR::OpSizeToSize(Size) / IR::OpSizeToSize(ElementSize), NumAddrElements); size_t IndexElementsSizeBytes = NumAddrElements * IR::OpSizeToSize(VectorIndexSize); if (IndexElementsSizeBytes > 16) { // We must have a high register in this case. LOGMAN_THROW_A_FMT(VectorIndexHigh.has_value(), "Need High vector index register!"); } auto ResultReg = Dst; if (NeedsDestTmp) { // Use VTMP1 as the temporary destination ResultReg = VTMP1; } auto WorkingReg = TMP1; auto TempMemReg = TMP2; const uint64_t ElementSizeInBits = IR::OpSizeToSize(ElementSize) * 8; if (NeedsIncomingDestMove) { mov(ResultReg.Q(), IncomingDst.Q()); } for (size_t i = DataElementOffsetStart, IndexElement = IndexElementOffsetStart; i < NumDataElements; ++i, ++IndexElement) { ARMEmitter::ForwardLabel Skip {}; // Extract mask element PerformMove(ElementSize, WorkingReg, MaskReg, i); // Skip if the mask's sign bit isn't set (void)tbz(WorkingReg, ElementSizeInBits - 1, &Skip); // Extract Index Element if ((IndexElement * IR::OpSizeToSize(VectorIndexSize)) >= 16) { // Fetch from the high index register. PerformSMove(VectorIndexSize, WorkingReg, *VectorIndexHigh, IndexElement - (16 / IR::OpSizeToSize(VectorIndexSize))); } else { // Fetch from the low index register. 
PerformSMove(VectorIndexSize, WorkingReg, VectorIndexLow, IndexElement);
    }

    // Calculate memory position for this gather load
    if (BaseAddr.has_value()) {
      if (VectorIndexSize == IR::OpSize::i32Bit) {
        add(ConvertSize(AddrSize), TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ExtendedType::SXTW, FEXCore::ilog2(OffsetScale));
      } else {
        add(ConvertSize(AddrSize), TempMemReg, *BaseAddr, WorkingReg, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(OffsetScale));
      }
    } else {
      ///< In this case we have no base address, All addresses come from the vector register itself
      if (VectorIndexSize == IR::OpSize::i32Bit) {
        // Sign extend and shift in to the 64-bit register
        sbfiz(ConvertSize(AddrSize), TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale), 32);
      } else {
        lsl(ConvertSize(AddrSize), TempMemReg, WorkingReg, FEXCore::ilog2(OffsetScale));
      }
    }

    // Now that the address is calculated. Do the load.
    switch (ElementSize) {
    case IR::OpSize::i8Bit: ld1(ResultReg.Q(), i, TempMemReg); break;
    case IR::OpSize::i16Bit: ld1(ResultReg.Q(), i, TempMemReg); break;
    case IR::OpSize::i32Bit: ld1(ResultReg.Q(), i, TempMemReg); break;
    case IR::OpSize::i64Bit: ld1(ResultReg.Q(), i, TempMemReg); break;
    case IR::OpSize::i128Bit: ldr(ResultReg.Q(), TempMemReg, 0); break;
    default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ElementSize); FEX_UNREACHABLE;
    }

    (void)Bind(&Skip);
  }

  if (NeedsDestTmp) {
    // Move result.
    mov(Dst.Q(), ResultReg.Q());
  }
}

// Masked gather load.
// Emits a single SVE gather when the host and operands allow it (see SupportsSVELoad below),
// otherwise falls back to Emulate128BitGather, which is only valid for non-256-bit operations.
DEF_OP(VLoadVectorGatherMasked) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto VectorIndexSize = Op->VectorIndexElementSize;
  const auto OffsetScale = Op->OffsetScale;
  const auto DataElementOffsetStart = Op->DataElementOffsetStart;
  const auto IndexElementOffsetStart = Op->IndexElementOffsetStart;

  ///< This IR operation handles discontiguous masked gather loadstore instructions. Some things to note about its behaviour.
  /// - VSIB behaviour is mostly entirely exposed in the IR operation directly.
  ///   - Displacement is the only value missing as that can be added directly to AddrBase.
  /// - VectorIndex{Low,High} contains the index offsets for each element getting loaded.
  ///   - These element sizes are decoupled from the resulting element size. These can be 32-bit or 64-bit.
  ///   - When the element size is 32-bit then the value is sign-extended to the full 64-bit address calculation
  ///     (MOD_SXTW on the SVE path below; the emulated path sign-extends as well).
  ///   - When loading a 128-bit result with 64-bit VectorIndex Elements, this requires the use of both VectorIndexLow and VectorIndexHigh
  ///     to get enough pointers.
  /// - When VectorIndexElementSize and OffsetScale matches Arm64 SVE behaviour then the operation becomes more optimal
  ///   - When the behaviour doesn't match then it gets decomposed to ASIMD style masked load.
  /// - AddrBase also doesn't need to exist
  ///   - If the instruction is using 64-bit vector indexing or 32-bit addresses where the top-bit isn't set then this is valid!
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto IncomingDst = GetVReg(Op->Incoming);

  const auto MaskReg = GetVReg(Op->Mask);
  // The base address and the high index vector are optional operands of the IR op.
  std::optional BaseAddr = !Op->AddrBase.IsInvalid() ? std::make_optional(GetReg(Op->AddrBase)) : std::nullopt;
  const auto VectorIndexLow = GetVReg(Op->VectorIndexLow);
  std::optional VectorIndexHigh = !Op->VectorIndexHigh.IsInvalid() ? std::make_optional(GetVReg(Op->VectorIndexHigh)) : std::nullopt;

  ///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load.
  const bool SupportsSVELoad = (HostSupportsSVE128 || HostSupportsSVE256) && (OffsetScale == 1 || OffsetScale == IR::OpSizeToSize(VectorIndexSize)) && VectorIndexSize == IROp->ElementSize && Op->AddrSize == IR::OpSize::i64Bit;

  if (SupportsSVELoad) {
    uint8_t SVEScale = FEXCore::ilog2(OffsetScale);
    // Pick the index modifier: sign-extend 32-bit indices, shift scaled 64-bit indices, otherwise none.
    ARMEmitter::SVEModType ModType = ARMEmitter::SVEModType::MOD_NONE;
    if (VectorIndexSize == IR::OpSize::i32Bit) {
      ModType = ARMEmitter::SVEModType::MOD_SXTW;
    } else if (VectorIndexSize == IR::OpSize::i64Bit && OffsetScale != 1) {
      ModType = ARMEmitter::SVEModType::MOD_LSL;
    }

    const auto SubRegSize = ConvertSubRegSize8(IROp);

    const auto CMPPredicate = ARMEmitter::PReg::p0;
    const auto GoverningPredicate = Is256Bit ? PRED_TMP_32B : PRED_TMP_16B;

    // Check if the sign bit is set for the given element size.
    cmplt(SubRegSize, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
    auto TempDst = VTMP1;

    // No need to load a temporary register in the case that we weren't provided a base address and there is no scaling.
    ARMEmitter::SVEMemOperand MemDst {ARMEmitter::SVEMemOperand(VectorIndexLow.Z(), 0)};
    if (BaseAddr.has_value() || OffsetScale != 1) {
      ARMEmitter::Register AddrReg = TMP1;
      if (BaseAddr.has_value()) {
        AddrReg = GetReg(Op->AddrBase);
      } else {
        ///< OpcodeDispatcher didn't provide a Base address while SVE requires one.
        LoadConstant(ARMEmitter::Size::i64Bit, AddrReg, 0);
      }
      MemDst = ARMEmitter::SVEMemOperand(AddrReg.X(), VectorIndexLow.Z(), ModType, SVEScale);
    }

    // NOTE(review): other element sizes silently emit no load here (default: break) — confirm that is intended.
    switch (IROp->ElementSize) {
    case IR::OpSize::i8Bit: {
      ld1b(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i16Bit: {
      ld1h(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i32Bit: {
      ld1w(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    case IR::OpSize::i64Bit: {
      ld1d(TempDst.Z(), CMPPredicate.Zeroing(), MemDst);
      break;
    }
    default: break;
    }

    ///< Merge elements based on predicate.
    sel(SubRegSize, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
  } else {
    LOGMAN_THROW_A_FMT(!Is256Bit, "Can't emulate this gather load in the backend! Programming error!");
    Emulate128BitGather(IROp->Size, IROp->ElementSize, Dst, IncomingDst, BaseAddr, VectorIndexLow, VectorIndexHigh, MaskReg, VectorIndexSize, DataElementOffsetStart, IndexElementOffsetStart, OffsetScale, Op->AddrSize);
  }
}

// Masked gather of 32-bit data elements addressed through 64-bit indices.
// Uses SVE `ld1w` gathers when possible, otherwise falls back to Emulate128BitGather.
DEF_OP(VLoadVectorGatherMaskedQPS) {
  const auto Op = IROp->C();

  /// This instruction behaves similarly to the non-QPS version except for some STRICT limitations
  /// - Only supports 32-bit element data size!
  /// - Only supports 64-bit element address size!
  /// - Only masks elements based on 32-bit element data size! (NOT ADDR SIZE!)
  /// - Optimally uses SVE's `ld1w {zt.D}` variant instruction!
  /// - Only outputs a single 128-bit result, while consuming 128-bit or 256-bit of address indexes!
  /// - Matches VGATHERQPS/VPGATHERQD behaviour!
  const auto OffsetScale = Op->OffsetScale;

  const auto Dst = GetVReg(Node);
  const auto IncomingDst = GetVReg(Op->Incoming);

  const auto MaskReg = GetVReg(Op->MaskReg);
  std::optional BaseAddr = !Op->AddrBase.IsInvalid() ? std::make_optional(GetReg(Op->AddrBase)) : std::nullopt;
  const auto VectorIndexLow = GetVReg(Op->VectorIndexLow);
  std::optional VectorIndexHigh = !Op->VectorIndexHigh.IsInvalid() ? std::make_optional(GetVReg(Op->VectorIndexHigh)) : std::nullopt;

  ///< If the host supports SVE and the offset scale matches SVE limitations then it can do an SVE style load.
  const bool SupportsSVELoad = HostSupportsSVE128 && (OffsetScale == 1 || OffsetScale == 4) && Op->AddrSize == IR::OpSize::i64Bit;

  if (SupportsSVELoad) {
    ARMEmitter::SVEModType ModType = ARMEmitter::SVEModType::MOD_NONE;
    if (OffsetScale != 1) {
      ModType = ARMEmitter::SVEModType::MOD_LSL;
    }

    const auto CMPPredicate = ARMEmitter::PReg::p0;
    const auto CMPPredicate2 = ARMEmitter::PReg::p1;
    const auto GoverningPredicate = PRED_TMP_16B;

    // Check if the sign bit is set for the given element size.
    // This will set the predicate bits for elements [0, 1, 2, 3]
    // We then use punpklo to extend the low results to be for 64-bit elements.
    cmplt(ARMEmitter::SubRegSize::i32Bit, CMPPredicate, GoverningPredicate.Zeroing(), MaskReg.Z(), 0);
    punpklo(CMPPredicate2, CMPPredicate);
    auto TempDst = VTMP1;

    // Emits one predicated `ld1w` gather for a single index vector into Dst.
    auto GatherExtend = [this](ARMEmitter::VRegister Dst, std::optional BaseAddr, ARMEmitter::VRegister VectorIndex, ARMEmitter::PRegister CMPPredicate, ARMEmitter::SVEModType ModType, uint8_t OffsetScale) {
      // No need to load a temporary register in the case that we weren't provided a base address and there is no scaling.
      uint8_t SVEScale = FEXCore::ilog2(OffsetScale);
      ARMEmitter::SVEMemOperand MemDst {ARMEmitter::SVEMemOperand(VectorIndex.Z(), 0)};
      if (BaseAddr.has_value() || OffsetScale != 1) {
        ARMEmitter::Register AddrReg = TMP1;
        if (BaseAddr.has_value()) {
          AddrReg = *BaseAddr;
        } else {
          ///< OpcodeDispatcher didn't provide a Base address while SVE requires one.
          LoadConstant(ARMEmitter::Size::i64Bit, AddrReg, 0);
        }
        MemDst = ARMEmitter::SVEMemOperand(AddrReg.X(), VectorIndex.Z(), ModType, SVEScale);
      }

      ld1w(Dst.Z(), CMPPredicate.Zeroing(), MemDst);
    };

    // Low index vector, governed by the low half of the 32-bit mask predicate.
    GatherExtend(TempDst, BaseAddr, VectorIndexLow, CMPPredicate2, ModType, OffsetScale);
    if (VectorIndexHigh.has_value()) {
      // High index vector, governed by the high half of the 32-bit mask predicate.
      punpkhi(CMPPredicate2, CMPPredicate);
      GatherExtend(VTMP2, BaseAddr, *VectorIndexHigh, CMPPredicate2, ModType, OffsetScale);
      // Move elements to the lower half.
      uzp1(ARMEmitter::SubRegSize::i32Bit, TempDst.Q(), TempDst.Q(), VTMP2.Q());
      ///< Merge elements based on predicate.
      sel(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
    } else {
      // Move elements to the lower half.
      xtn(ARMEmitter::SubRegSize::i32Bit, TempDst.Q(), TempDst.Q());
      ///< Merge elements based on predicate.
      sel(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), CMPPredicate, TempDst.Z(), IncomingDst.Z());
    }
  } else {
    Emulate128BitGather(IR::OpSize::i128Bit, IR::OpSize::i32Bit, Dst, IncomingDst, BaseAddr, VectorIndexLow, VectorIndexHigh, MaskReg, IR::OpSize::i64Bit, 0, 0, OffsetScale, Op->AddrSize);
  }
}

// Loads a single element from memory into lane Op->Index of the destination.
// The other lanes come from DstSrc, which is copied into Dst first when the registers differ.
// A 128-bit element is a plain full-register load. 256-bit operations are not supported.
DEF_OP(VLoadVectorElement) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  const auto ElementSize = IROp->ElementSize;

  const auto Dst = GetVReg(Node);
  const auto DstSrc = GetVReg(Op->DstSrc);
  const auto MemReg = GetReg(Op->Addr);

  LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, "Invalid element " "size");

  if (Is256Bit) {
    LOGMAN_MSG_A_FMT("Unsupported 256-bit VLoadVectorElement");
  } else {
    // The 128-bit case overwrites the whole register, so no copy is needed for it.
    if (Dst != DstSrc && ElementSize != IR::OpSize::i128Bit) {
      mov(Dst.Q(), DstSrc.Q());
    }
    switch (ElementSize) {
    case IR::OpSize::i8Bit: ld1(Dst.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i16Bit: ld1(Dst.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i32Bit: ld1(Dst.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i64Bit: ld1(Dst.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i128Bit: ldr(Dst.Q(), MemReg); break;
    default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ElementSize); return;
    }
  }

  // Emit a half-barrier if TSO is enabled.
if (CTX->IsVectorAtomicTSOEnabled()) {
    dmb(ARMEmitter::BarrierScope::ISHLD);
  }
}

// Stores lane Op->Index of Value to memory (the whole register for a 128-bit element).
// 256-bit operations are not supported.
DEF_OP(VStoreVectorElement) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  const auto ElementSize = IROp->ElementSize;

  const auto Value = GetVReg(Op->Value);
  const auto MemReg = GetReg(Op->Addr);

  LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, "Invalid element " "size");

  // Emit a half-barrier if TSO is enabled.
  // For stores the barrier is emitted before the memory access.
  if (CTX->IsVectorAtomicTSOEnabled()) {
    dmb(ARMEmitter::BarrierScope::ISH);
  }

  if (Is256Bit) {
    LOGMAN_MSG_A_FMT("Unsupported 256-bit {}", __func__);
  } else {
    switch (ElementSize) {
    case IR::OpSize::i8Bit: st1(Value.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i16Bit: st1(Value.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i32Bit: st1(Value.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i64Bit: st1(Value.Q(), Op->Index, MemReg); break;
    case IR::OpSize::i128Bit: str(Value.Q(), MemReg); break;
    default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ElementSize); return;
    }
  }
}

// Loads one element from memory and replicates it across the destination vector.
// Uses the SVE ld1r* forms for 256-bit operations and ASIMD ld1r otherwise.
DEF_OP(VBroadcastFromMem) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto ElementSize = IROp->ElementSize;

  const auto Dst = GetVReg(Node);
  const auto MemReg = GetReg(Op->Address);

  LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit || ElementSize == IR::OpSize::i128Bit, "Invalid element " "size");

  if (Is256Bit && HostSupportsSVE256) {
    const auto GoverningPredicate = PRED_TMP_32B.Zeroing();

    switch (ElementSize) {
    case IR::OpSize::i8Bit: ld1rb(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), GoverningPredicate, MemReg); break;
    case IR::OpSize::i16Bit: ld1rh(ARMEmitter::SubRegSize::i16Bit, Dst.Z(), GoverningPredicate, MemReg); break;
    case IR::OpSize::i32Bit: ld1rw(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), GoverningPredicate, MemReg); break;
    case IR::OpSize::i64Bit: ld1rd(Dst.Z(), GoverningPredicate, MemReg); break;
    case IR::OpSize::i128Bit: ld1rqb(Dst.Z(), GoverningPredicate, MemReg); break;
    default: LOGMAN_MSG_A_FMT("Unhandled VBroadcastFromMem size: {}", ElementSize); return;
    }
  } else {
    switch (ElementSize) {
    case IR::OpSize::i8Bit: ld1r(Dst.Q(), MemReg); break;
    case IR::OpSize::i16Bit: ld1r(Dst.Q(), MemReg); break;
    case IR::OpSize::i32Bit: ld1r(Dst.Q(), MemReg); break;
    case IR::OpSize::i64Bit: ld1r(Dst.Q(), MemReg); break;
    case IR::OpSize::i128Bit:
      // Normal load, like ld1rqb with 128-bit regs.
      ldr(Dst.Q(), MemReg);
      break;
    default: LOGMAN_MSG_A_FMT("Unhandled VBroadcastFromMem size: {}", ElementSize); return;
    }
  }

  // Emit a half-barrier if TSO is enabled.
  if (CTX->IsVectorAtomicTSOEnabled()) {
    dmb(ARMEmitter::BarrierScope::ISHLD);
  }
}

// Stores Value ValueSize bytes below the incoming address and produces the decremented address in Dst.
// Two register-aliasing hazards are handled explicitly:
//  - Dst == Src: the store uses AddrSrc with a negative offset and Dst is computed afterwards.
//  - Src == AddrSrc: the data is first copied to TMP1 (see the comment in that branch).
DEF_OP(Push) {
  const auto Op = IROp->C();
  const auto ValueSize = IR::OpSizeToSize(Op->ValueSize);
  auto Src = GetReg(Op->Value);
  const auto AddrSrc = GetReg(Op->Addr);
  const auto Dst = GetReg(Node);

  bool NeedsMoveAfterwards = false;
  if (Dst != AddrSrc) {
    if (Dst == Src) {
      NeedsMoveAfterwards = true;
      // Need to be careful here, incoming source might be reused afterwards.
    } else {
      // RA constraints would let this always be true.
      mov(IROp->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc);
    }
  }

  if (Src == AddrSrc) {
    // If the data source is the address source then we need to do some additional work.
    // This is because it is undefined behaviour to do a writeback on store operation where dest == src.
    // In the case of writeback where the source is the address there are multiple behaviours.
// - SIGILL - Apple Silicon Behaviour // - Stores original value - Cortex behaviour // - Stores value after pre-index adjust adjust - Vixl simulator behaviour. // - Undefined value stored // - Undefined behaviour(!) // In this path Src can end up overlapping both AddrSrc and Dst. // Move the data to a temporary and store from there instead. mov(TMP1, Src.X()); Src = TMP1; } if (NeedsMoveAfterwards) { switch (ValueSize) { case 1: { sturb(Src.W(), AddrSrc, -ValueSize); break; } case 2: { sturh(Src.W(), AddrSrc, -ValueSize); break; } case 4: { stur(Src.W(), AddrSrc, -ValueSize); break; } case 8: { stur(Src.X(), AddrSrc, -ValueSize); break; } default: { LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ValueSize); break; } } sub(IROp->Size == IR::OpSize::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit, Dst, AddrSrc, ValueSize); } else { switch (ValueSize) { case 1: { strb(Src.W(), Dst, -ValueSize); break; } case 2: { strh(Src.W(), Dst, -ValueSize); break; } case 4: { str(Src.W(), Dst, -ValueSize); break; } case 8: { str(Src.X(), Dst, -ValueSize); break; } default: { LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ValueSize); break; } } } } DEF_OP(PushTwo) { const auto Op = IROp->C(); const auto ValueSize = IR::OpSizeToSize(Op->ValueSize); auto Src1 = GetReg(Op->Value1); auto Src2 = GetReg(Op->Value2); const auto Dst = GetReg(Op->Addr); switch (ValueSize) { case 4: { stp(Src1.W(), Src2.W(), Dst, -2 * ValueSize); break; } case 8: { stp(Src1.X(), Src2.X(), Dst, -2 * ValueSize); break; } default: { LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, ValueSize); break; } } } DEF_OP(Pop) { const auto Op = IROp->C(); const auto Size = IR::OpSizeToSize(Op->Size); const auto Addr = GetReg(Op->InoutAddr); const auto Dst = GetReg(Op->OutValue); LOGMAN_THROW_A_FMT(Dst != Addr, "Invalid"); switch (Size) { case 1: { ldrb(Dst.W(), Addr, Size); break; } case 2: { ldrh(Dst.W(), Addr, Size); break; } case 4: { ldr(Dst.W(), Addr, Size); break; } case 8: { ldr(Dst.X(), 
Addr, Size); break; } default: { LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Op->Size); break; } } } DEF_OP(PopTwo) { const auto Op = IROp->C(); const auto Size = IR::OpSizeToSize(Op->Size); const auto Addr = GetReg(Op->InoutAddr); auto Dst1 = GetReg(Op->OutValue1); const auto Dst2 = GetReg(Op->OutValue2); // ldp x, x is invalid. Explicitly discard the first destination to encode. if (Dst1 == Dst2) { Dst1 = ARMEmitter::Reg::zr; } LOGMAN_THROW_A_FMT(Dst1 != Addr && Dst2 != Addr, "Invalid"); LOGMAN_THROW_A_FMT(Dst1 != Dst2, "Invalid"); switch (Size) { case 4: { ldp(Dst1.W(), Dst2.W(), Addr, 2 * Size); break; } case 8: { ldp(Dst1.X(), Dst2.X(), Addr, 2 * Size); break; } default: { LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Op->Size); break; } } } DEF_OP(StoreMem) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto MemReg = GetReg(Op->Addr); const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); if (Op->Class == IR::RegClass::GPR) { const auto Src = GetZeroableReg(Op->Value); switch (OpSize) { case IR::OpSize::i8Bit: strb(Src, MemSrc); break; case IR::OpSize::i16Bit: strh(Src, MemSrc); break; case IR::OpSize::i32Bit: str(Src.W(), MemSrc); break; case IR::OpSize::i64Bit: str(Src.X(), MemSrc); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMem size: {}", OpSize); break; } } else { const auto Src = GetVReg(Op->Value); switch (OpSize) { case IR::OpSize::i8Bit: { strb(Src, MemSrc); break; } case IR::OpSize::i16Bit: { strh(Src, MemSrc); break; } case IR::OpSize::i32Bit: { str(Src.S(), MemSrc); break; } case IR::OpSize::i64Bit: { str(Src.D(), MemSrc); break; } case IR::OpSize::i128Bit: { str(Src.Q(), MemSrc); break; } case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MemSrc = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); st1b(Src.Z(), PRED_TMP_32B, 
MemSrc); break; } default: LOGMAN_MSG_A_FMT("Unhandled StoreMem size: {}", OpSize); break; } } } DEF_OP(StoreMemX87SVEOptPredicate) { const auto Op = IROp->C(); const auto Predicate = PRED_X87_SVEOPT; LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemX87SVEOptPredicate needs SVE support"); const auto RegData = GetVReg(Op->Value); const auto MemReg = GetReg(Op->Addr); const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); switch (IROp->ElementSize) { case IR::OpSize::i8Bit: { st1b(RegData.Z(), Predicate, MemDst); break; } case IR::OpSize::i16Bit: { st1h(RegData.Z(), Predicate, MemDst); break; } case IR::OpSize::i32Bit: { st1w(RegData.Z(), Predicate, MemDst); break; } case IR::OpSize::i64Bit: { st1d(RegData.Z(), Predicate, MemDst); break; } default: LOGMAN_MSG_A_FMT("Unhandled {} element size: {}", __func__, IROp->ElementSize); break; } } DEF_OP(LoadMemX87SVEOptPredicate) { const auto Op = IROp->C(); const auto Dst = GetVReg(Node); const auto Predicate = PRED_X87_SVEOPT; const auto MemReg = GetReg(Op->Addr); LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemX87SVEOptPredicate needs SVE support"); const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); switch (IROp->ElementSize) { case IR::OpSize::i8Bit: { ld1b(Dst.Z(), Predicate.Zeroing(), MemDst); break; } case IR::OpSize::i16Bit: { ld1h(Dst.Z(), Predicate.Zeroing(), MemDst); break; } case IR::OpSize::i32Bit: { ld1w(Dst.Z(), Predicate.Zeroing(), MemDst); break; } case IR::OpSize::i64Bit: { ld1d(Dst.Z(), Predicate.Zeroing(), MemDst); break; } default: LOGMAN_MSG_A_FMT("Unhandled {} element size: {}", __func__, IROp->ElementSize); break; } } DEF_OP(StoreMemPair) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Addr = GetReg(Op->Addr); if (Op->Class == IR::RegClass::GPR) { const auto Src1 = GetZeroableReg(Op->Value1); const auto Src2 = GetZeroableReg(Op->Value2); switch (OpSize) { case IR::OpSize::i32Bit: stp(Src1.W(), Src2.W(), Addr, 
Op->Offset); break; case IR::OpSize::i64Bit: stp(Src1.X(), Src2.X(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMem size: {}", OpSize); break; } } else { const auto Src1 = GetVReg(Op->Value1); const auto Src2 = GetVReg(Op->Value2); switch (OpSize) { case IR::OpSize::i32Bit: stp(Src1.S(), Src2.S(), Addr, Op->Offset); break; case IR::OpSize::i64Bit: stp(Src1.D(), Src2.D(), Addr, Op->Offset); break; case IR::OpSize::i128Bit: stp(Src1.Q(), Src2.Q(), Addr, Op->Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemPair size: {}", OpSize); break; } } } DEF_OP(StoreMemTSO) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto MemReg = GetReg(Op->Addr); if (Op->Class == IR::RegClass::GPR) { LOGMAN_THROW_A_FMT(Op->Offset.IsInvalid() || CTX->HostFeatures.SupportsTSOImm9, "unexpected offset"); LOGMAN_THROW_A_FMT(Op->OffsetScale == 1, "unexpected offset scale"); LOGMAN_THROW_A_FMT(Op->OffsetType == IR::MemOffsetType::SXTX, "unexpected offset type"); } if (CTX->HostFeatures.SupportsTSOImm9 && Op->Class == IR::RegClass::GPR) { const auto Src = GetZeroableReg(Op->Value); uint64_t Offset = 0; if (!Op->Offset.IsInvalid()) { bool IsInline = IsInlineConstant(Op->Offset, &Offset); LOGMAN_THROW_A_FMT(IsInline, "expected immediate"); } if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment stlurb(Src, MemReg, Offset); } else { // Half-barrier once back-patched. nop(); switch (OpSize) { case IR::OpSize::i16Bit: stlurh(Src, MemReg, Offset); break; case IR::OpSize::i32Bit: stlur(Src.W(), MemReg, Offset); break; case IR::OpSize::i64Bit: stlur(Src.X(), MemReg, Offset); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemTSO size: {}", OpSize); break; } } } else if (Op->Class == IR::RegClass::GPR) { const auto Src = GetZeroableReg(Op->Value); if (OpSize == IR::OpSize::i8Bit) { // 8bit load is always aligned to natural alignment stlrb(Src, MemReg); } else { // Half-barrier once back-patched. 
nop(); switch (OpSize) { case IR::OpSize::i16Bit: stlrh(Src, MemReg); break; case IR::OpSize::i32Bit: stlr(Src.W(), MemReg); break; case IR::OpSize::i64Bit: stlr(Src.X(), MemReg); break; default: LOGMAN_MSG_A_FMT("Unhandled StoreMemTSO size: {}", OpSize); break; } } } else { if (CTX->IsVectorAtomicTSOEnabled()) { // Half-Barrier. dmb(ARMEmitter::BarrierScope::ISH); } const auto Src = GetVReg(Op->Value); const auto MemSrc = GenerateMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); switch (OpSize) { case IR::OpSize::i8Bit: strb(Src, MemSrc); break; case IR::OpSize::i16Bit: strh(Src, MemSrc); break; case IR::OpSize::i32Bit: str(Src.S(), MemSrc); break; case IR::OpSize::i64Bit: str(Src.D(), MemSrc); break; case IR::OpSize::i128Bit: str(Src.Q(), MemSrc); break; case IR::OpSize::i256Bit: { LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Operand = GenerateSVEMemOperand(OpSize, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale); st1b(Src.Z(), PRED_TMP_32B, Operand); break; } default: LOGMAN_MSG_A_FMT("Unhandled StoreMemTSO size: {}", OpSize); break; } } } DEF_OP(MemSet) { const auto Op = IROp->C(); const bool IsAtomic = CTX->IsMemcpyAtomicTSOEnabled(); const auto Size = IR::OpSizeToSize(Op->Size); const auto MemReg = GetReg(Op->Addr); const auto Value = GetZeroableReg(Op->Value); const auto Length = GetReg(Op->Length); const auto Dst = GetReg(Node); uint64_t DirectionConstant; bool DirectionIsInline = IsInlineConstant(Op->Direction, &DirectionConstant); ARMEmitter::Register DirectionReg = ARMEmitter::Reg::r0; if (!DirectionIsInline) { DirectionReg = GetReg(Op->Direction); } // If Direction > 0 then: // MemReg is incremented (by size) // else: // MemReg is decremented (by size) // // Counter is decremented regardless. 
ARMEmitter::ForwardLabel BackwardImpl {}; ARMEmitter::ForwardLabel Done {}; mov(TMP1, Length.X()); if (Op->Prefix.IsInvalid()) { mov(TMP2, MemReg.X()); } else { const auto Prefix = GetReg(Op->Prefix); add(TMP2, Prefix.X(), MemReg.X()); } if (!DirectionIsInline) { // Backward or forwards implementation depends on flag (void)tbnz(DirectionReg, 1, &BackwardImpl); } auto MemStore = [this](auto Value, uint32_t OpSize, int32_t Size) { switch (OpSize) { case 1: strb(Value.W(), TMP2, Size); break; case 2: strh(Value.W(), TMP2, Size); break; case 4: str(Value.W(), TMP2, Size); break; case 8: str(Value.X(), TMP2, Size); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } }; auto MemStoreTSO = [this](auto Value, uint32_t OpSize, int32_t Size) { if (OpSize == 1) { // 8bit load is always aligned to natural alignment stlrb(Value.W(), TMP2); } else { nop(); switch (OpSize) { case 2: stlrh(Value.W(), TMP2); break; case 4: stlr(Value.W(), TMP2); break; case 8: stlr(Value.X(), TMP2); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } } if (Size >= 0) { add(ARMEmitter::Size::i64Bit, TMP2, TMP2, OpSize); } else { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, OpSize); } }; const auto SubRegSize = Size == 1 ? ARMEmitter::SubRegSize::i8Bit : Size == 2 ? ARMEmitter::SubRegSize::i16Bit : Size == 4 ? ARMEmitter::SubRegSize::i32Bit : Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit; auto EmitMemset = [&](int32_t Direction) { const int32_t SizeDirection = Size * Direction; const bool IsBackwards = Direction == -1; // Sets the result to the final address written depending on // whether or not the memset is forwards or backwards. 
const auto MakeFinalAddress = [&] { if (IsBackwards) { switch (Size) { case 1: sub(Dst.X(), MemReg.X(), Length.X()); break; case 2: case 4: case 8: sub(Dst.X(), MemReg.X(), Length.X(), ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); break; default: LOGMAN_MSG_A_FMT("Unhandled MemSet size: {}", Size); break; } } else { switch (Size) { case 1: add(Dst.X(), MemReg.X(), Length.X()); break; case 2: case 4: case 8: add(Dst.X(), MemReg.X(), Length.X(), ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); break; default: LOGMAN_MSG_A_FMT("Unhandled MemSet size: {}", Size); break; } } }; ARMEmitter::BiDirectionalLabel AgainInternal {}; ARMEmitter::ForwardLabel DoneInternal {}; // Early exit if zero count. (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); if (!IsAtomic) { if (CTX->HostFeatures.SupportsMOPS) { const bool Is8Bit = SubRegSize == ARMEmitter::SubRegSize::i8Bit; // We can handle 8-bit memsets and any other size that happens // to be using an inlined zero value (resulting in the use of ZR). // // NOTE: // Strictly speaking, this can also be trivially expanded to handle other sizes // that happen to use any value that could fit inside a byte if the need // arises. This does increase branching and code generation, however, since // we'd still need to emit the fallback in the event a value for a larger size // falls outside the range of a byte instead of only generating the MOPS code. if (Is8Bit || Value == ARMEmitter::Reg::zr) { // If we're performing a non-byte-sized zeroing operation then we need to // scale the counter accordingly. (e.g. a 64-bit memset of size 2 needs to // be turned into an 8-bit memset of size 16) if (!Is8Bit) { lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, FEXCore::ToUnderlying(SubRegSize)); } // If backwards, then we need to adjust the starting address because // set{p, m, e} memset forwards, so we need to slide this bad boy // back like: (address - count) + 1. 
// // This lets us offset the address such that we can treat a backwards // memset as if it were a forwards one. if (IsBackwards) { sub(TMP2, TMP2, TMP1); add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1); } // Unfortunately set operations fiddle with NZCV, so we need to preserve it. mrs(TMP3, ARMEmitter::SystemRegister::NZCV); setp(TMP2, TMP1, Value.X()); setm(TMP2, TMP1, Value.X()); sete(TMP2, TMP1, Value.X()); msr(ARMEmitter::SystemRegister::NZCV, TMP3); MakeFinalAddress(); (void)Bind(&DoneInternal); return; } } ARMEmitter::ForwardLabel AgainInternal256Exit {}; ARMEmitter::BackwardLabel AgainInternal256 {}; ARMEmitter::ForwardLabel AgainInternal128Exit {}; ARMEmitter::BackwardLabel AgainInternal128 {}; if (IsBackwards) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); } // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback // to the copy unit size copy loop for the last chunk. // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the // single copy loop if size < 64. 
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal128Exit); // Fill VTMP2 with the set pattern dup(SubRegSize, VTMP2.Q(), Value); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal256Exit); (void)Bind(&AgainInternal256); stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); (void)tbz(TMP1, 63, &AgainInternal256); (void)Bind(&AgainInternal256Exit); add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal128Exit); (void)Bind(&AgainInternal128); stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbz(TMP1, 63, &AgainInternal128); (void)Bind(&AgainInternal128Exit); add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); if (IsBackwards) { add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); } } (void)Bind(&AgainInternal); if (IsAtomic) { MemStoreTSO(Value, Size, SizeDirection); } else { MemStore(Value, Size, SizeDirection); } sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1); (void)cbnz(ARMEmitter::Size::i64Bit, TMP1, &AgainInternal); (void)Bind(&DoneInternal); MakeFinalAddress(); }; if (DirectionIsInline) { LOGMAN_THROW_A_FMT(DirectionConstant == 1 || DirectionConstant == -1, "unexpected direction"); EmitMemset(DirectionConstant); } else { // Emit forward direction memset then backward direction memset. for (int32_t Direction : {1, -1}) { EmitMemset(Direction); if (Direction == 1) { (void)b(&Done); (void)Bind(&BackwardImpl); } } (void)Bind(&Done); // Destination already set to the final pointer. 
} } DEF_OP(MemCpy) { const auto Op = IROp->C(); const bool IsAtomic = CTX->IsMemcpyAtomicTSOEnabled(); const auto Size = IR::OpSizeToSize(Op->Size); const auto MemRegDest = GetReg(Op->Dest); const auto MemRegSrc = GetReg(Op->Src); const auto Length = GetReg(Op->Length); uint64_t DirectionConstant; bool DirectionIsInline = IsInlineConstant(Op->Direction, &DirectionConstant); ARMEmitter::Register DirectionReg = ARMEmitter::Reg::r0; if (!DirectionIsInline) { DirectionReg = GetReg(Op->Direction); } auto Dst0 = GetReg(Op->OutDstAddress); auto Dst1 = GetReg(Op->OutSrcAddress); // If Direction > 0 then: // MemRegDest is incremented (by size) // MemRegSrc is incremented (by size) // else: // MemRegDest is decremented (by size) // MemRegSrc is decremented (by size) // // Counter is decremented regardless. ARMEmitter::ForwardLabel BackwardImpl {}; ARMEmitter::ForwardLabel Done {}; mov(TMP1, Length.X()); mov(TMP2, MemRegDest.X()); mov(TMP3, MemRegSrc.X()); // TMP1 = Length // TMP2 = Dest // TMP3 = Src // TMP4 = load+store temp value if (!DirectionIsInline) { // Backward or forwards implementation depends on flag (void)tbnz(DirectionReg, 1, &BackwardImpl); } auto MemCpy = [this](uint32_t OpSize, int32_t Size) { switch (OpSize) { case 1: ldrb(TMP4.W(), TMP3, Size); strb(TMP4.W(), TMP2, Size); break; case 2: ldrh(TMP4.W(), TMP3, Size); strh(TMP4.W(), TMP2, Size); break; case 4: ldr(TMP4.W(), TMP3, Size); str(TMP4.W(), TMP2, Size); break; case 8: ldr(TMP4, TMP3, Size); str(TMP4, TMP2, Size); break; case 32: ldp(VTMP1.Q(), VTMP2.Q(), TMP3, Size); stp(VTMP1.Q(), VTMP2.Q(), TMP2, Size); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } }; auto MemCpyTSO = [this](uint32_t OpSize, int32_t Size) { if (CTX->HostFeatures.SupportsRCPC) { if (OpSize == 1) { // 8bit load is always aligned to natural alignment ldaprb(TMP4.W(), TMP3); stlrb(TMP4.W(), TMP2); } else { switch (OpSize) { case 2: ldaprh(TMP4.W(), TMP3); break; case 4: ldapr(TMP4.W(), TMP3); break; 
case 8: ldapr(TMP4, TMP3); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } // Placeholders for backpatching barriers (one per load/store) nop(); nop(); switch (OpSize) { case 2: stlrh(TMP4.W(), TMP2); break; case 4: stlr(TMP4.W(), TMP2); break; case 8: stlr(TMP4, TMP2); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } } } else { if (OpSize == 1) { // 8bit load is always aligned to natural alignment ldarb(TMP4.W(), TMP3); stlrb(TMP4.W(), TMP2); } else { switch (OpSize) { case 2: ldarh(TMP4.W(), TMP3); break; case 4: ldar(TMP4.W(), TMP3); break; case 8: ldar(TMP4, TMP3); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } // Placeholders for backpatching barriers (one per load/store) nop(); nop(); switch (OpSize) { case 2: stlrh(TMP4.W(), TMP2); break; case 4: stlr(TMP4.W(), TMP2); break; case 8: stlr(TMP4, TMP2); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size); break; } } } if (Size >= 0) { add(ARMEmitter::Size::i64Bit, TMP2, TMP2, OpSize); add(ARMEmitter::Size::i64Bit, TMP3, TMP3, OpSize); } else { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, OpSize); sub(ARMEmitter::Size::i64Bit, TMP3, TMP3, OpSize); } }; auto EmitMemcpy = [&](int32_t Direction) { const int32_t SizeDirection = Size * Direction; const bool IsBackwards = Direction == -1; const auto FinalizeAddresses = [&] { if (IsBackwards) { switch (Size) { case 1: sub(Dst0.X(), TMP1, TMP3); sub(Dst1.X(), TMP2, TMP3); break; case 2: case 4: case 8: sub(Dst0.X(), TMP1, TMP3, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); sub(Dst1.X(), TMP2, TMP3, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); break; default: LOGMAN_MSG_A_FMT("Unhandled MemCpy size: {}", Size); break; } } else { switch (Size) { case 1: add(Dst0.X(), TMP1, TMP3); add(Dst1.X(), TMP2, TMP3); break; case 2: case 4: case 8: add(Dst0.X(), TMP1, TMP3, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); add(Dst1.X(), TMP2, 
TMP3, ARMEmitter::ShiftType::LSL, FEXCore::ilog2(Size)); break; default: LOGMAN_MSG_A_FMT("Unhandled MemCpy size: {}", Size); break; } } }; ARMEmitter::BiDirectionalLabel AgainInternal {}; ARMEmitter::ForwardLabel DoneInternal {}; // Early exit if zero count. (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); if (!IsAtomic) { if (CTX->HostFeatures.SupportsMOPS) { // In the event we have an overlap (gross), we need to fall back // to the non-mops copy handler. Since the overlap check needs to // make use of NZCV, we need to save it. This can be avoided with // ARMv9.6+'s FEAT_CMPBR, but alas, we don't have access to that right now. // // NOTE: That we need to temporarily trash TMP1 and restore it after the // comparison. ARMEmitter::ForwardLabel OverlapCase; mrs(TMP4, ARMEmitter::SystemRegister::NZCV); sub(ARMEmitter::Size::i64Bit, TMP1, TMP2, TMP3); cmp(ARMEmitter::Size::i64Bit, TMP1, Length.X()); mov(TMP1, Length.X()); (void)bc(ARMEmitter::Condition::CC_LT, &OverlapCase); // If doing something larger than a byte copy, then we need to scale // the counter value accordingly to convert it to bytes. if (Size > 1) { lsl(ARMEmitter::Size::i64Bit, TMP1, TMP1, FEXCore::ilog2(Size)); } // Adjust addresses so that we treat the backward copy as a forward copy if (IsBackwards) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, TMP1); sub(ARMEmitter::Size::i64Bit, TMP3, TMP3, TMP1); add(ARMEmitter::Size::i64Bit, TMP2, TMP2, Size); add(ARMEmitter::Size::i64Bit, TMP3, TMP3, Size); } // Unfortunately copy operations fiddle with NZCV, so we need to preserve it. cpyfp(TMP2, TMP3, TMP1); cpyfm(TMP2, TMP3, TMP1); cpyfe(TMP2, TMP3, TMP1); msr(ARMEmitter::SystemRegister::NZCV, TMP4); (void)b(&DoneInternal); // Turns out we overlap and need to fall back. Make sure to restore NZCV. 
(void)Bind(&OverlapCase); msr(ARMEmitter::SystemRegister::NZCV, TMP4); } ARMEmitter::ForwardLabel AbsPos {}; ARMEmitter::ForwardLabel AgainInternal256Exit {}; ARMEmitter::ForwardLabel AgainInternal128Exit {}; ARMEmitter::BackwardLabel AgainInternal128 {}; ARMEmitter::BackwardLabel AgainInternal256 {}; sub(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3); (void)tbz(TMP4, 63, &AbsPos); neg(ARMEmitter::Size::i64Bit, TMP4, TMP4); (void)Bind(&AbsPos); sub(ARMEmitter::Size::i64Bit, TMP4, TMP4, 32); (void)tbnz(TMP4, 63, &AgainInternal); if (IsBackwards) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); sub(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32 - Size); } // Keep the counter one copy ahead, so that underflow can be used to detect when to fallback // to the copy unit size copy loop for the last chunk. // Do this in two parts, to fallback to the byte by byte loop if size < 32, and to the // single copy loop if size < 64. sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal128Exit); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal256Exit); (void)Bind(&AgainInternal256); MemCpy(32, 32 * Direction); MemCpy(32, 32 * Direction); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); (void)tbz(TMP1, 63, &AgainInternal256); (void)Bind(&AgainInternal256Exit); add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbnz(TMP1, 63, &AgainInternal128Exit); (void)Bind(&AgainInternal128); MemCpy(32, 32 * Direction); sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)tbz(TMP1, 63, &AgainInternal128); (void)Bind(&AgainInternal128Exit); add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size); (void)cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); if (IsBackwards) { add(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); add(ARMEmitter::Size::i64Bit, TMP3, TMP3, 32 - Size); } } 
// Emits a clean+invalidate of the data cache for the 64-byte guest cacheline that contains Op->Addr.
// - Without EL0 cache maintenance support only a full data memory barrier is emitted.
// - If the host data cacheline is at least 64 bytes, a single `dc civac` covers the guest line.
// - Otherwise every host line inside the 64-byte guest line is visited.
// When Op->Serialize is set, a `dsb ish` is emitted afterwards.
DEF_OP(CacheLineClear) {
  if (!CTX->HostFeatures.SupportsCacheMaintenanceOps) {
    dmb(ARMEmitter::BarrierScope::SY);
    return;
  }

  auto Op = IROp->C();

  auto MemReg = GetReg(Op->Addr);

  // Clear dcache only
  // icache doesn't matter here since the guest application shouldn't be calling clflush on JIT code.
  const uint32_t HostLineSize = CTX->HostFeatures.DCacheLineSize;
  if (HostLineSize >= 64U) {
    dc(ARMEmitter::DataCacheOperation::CIVAC, MemReg);
  } else {
    // The host line is smaller than the guest line, so several host lines must be maintained.
    // Start from the guest-line-aligned address so the walk covers exactly the guest line, and make
    // sure TMP1 holds a valid address before the first `dc` is issued.
    and_(ARMEmitter::Size::i64Bit, TMP1, MemReg, ~(CPUIDEmu::CACHELINE_SIZE - 1));

    // Number of host lines per 64-byte guest line. Guard against a zero line size being reported.
    const uint32_t HostLinesPerGuestLine = HostLineSize != 0 ? std::max(1U, 64U / HostLineSize) : 1U;
    for (uint32_t i = 0; i < HostLinesPerGuestLine; ++i) {
      dc(ARMEmitter::DataCacheOperation::CIVAC, TMP1);
      if (i + 1 != HostLinesPerGuestLine) {
        // Step to the next host line; skipped after the final line since the result would be unused.
        add(ARMEmitter::Size::i64Bit, TMP1, TMP1, HostLineSize);
      }
    }
  }

  if (Op->Serialize) {
    // If requested, serialized all of the data cache operations.
    dsb(ARMEmitter::BarrierScope::ISH);
  }
}

// Emits a clean (write-back without invalidate) of the data cache for the 64-byte guest cacheline that
// contains Op->Addr. Mirrors CacheLineClear but uses `dc cvac`, falls back to a store barrier when cache
// maintenance is unavailable, and never emits a trailing `dsb`.
DEF_OP(CacheLineClean) {
  if (!CTX->HostFeatures.SupportsCacheMaintenanceOps) {
    dmb(ARMEmitter::BarrierScope::ST);
    return;
  }

  auto Op = IROp->C();

  auto MemReg = GetReg(Op->Addr);

  // Clean dcache only
  const uint32_t HostLineSize = CTX->HostFeatures.DCacheLineSize;
  if (HostLineSize >= 64U) {
    dc(ARMEmitter::DataCacheOperation::CVAC, MemReg);
  } else {
    // Same walk as CacheLineClear: align to the guest line, then visit each host line inside it.
    and_(ARMEmitter::Size::i64Bit, TMP1, MemReg, ~(CPUIDEmu::CACHELINE_SIZE - 1));

    const uint32_t HostLinesPerGuestLine = HostLineSize != 0 ? std::max(1U, 64U / HostLineSize) : 1U;
    for (uint32_t i = 0; i < HostLinesPerGuestLine; ++i) {
      dc(ARMEmitter::DataCacheOperation::CVAC, TMP1);
      if (i + 1 != HostLinesPerGuestLine) {
        add(ARMEmitter::Size::i64Bit, TMP1, TMP1, HostLineSize);
      }
    }
  }
}
// Emits a non-temporal vector load of IROp->Size bytes from Op->Addr + Op->Offset into the destination
// vector register.
//  - 256-bit: SVE `ldnt1b` under the 32-byte predicate (asserts SVE256 support).
//  - 128-bit with SVE128: SVE `ldnt1b` under the 16-byte predicate.
//  - otherwise: a plain 128-bit `ldr`, i.e. the non-temporal hint is dropped.
DEF_OP(VLoadNonTemporal) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto Is128Bit = OpSize == IR::OpSize::i128Bit;

  const auto Dst = GetVReg(Node);
  const auto MemReg = GetReg(Op->Addr);
  const auto Offset = Op->Offset;

  if (Is256Bit) {
    const auto GoverningPredicate = PRED_TMP_32B.Zeroing();
    // Byte offset converted to units of 32 bytes for the SVE immediate form.
    // NOTE(review): an Offset that is not a multiple of 32 is silently truncated here — confirm callers guarantee alignment.
    const auto OffsetScaled = Offset / 32;
    ldnt1b(Dst.Z(), GoverningPredicate, MemReg, OffsetScaled);
  } else if (Is128Bit && HostSupportsSVE128) {
    const auto GoverningPredicate = PRED_TMP_16B.Zeroing();
    // Byte offset converted to units of 16 bytes for the SVE immediate form (same truncation caveat as above).
    const auto OffsetScaled = Offset / 16;
    ldnt1b(Dst.Z(), GoverningPredicate, MemReg, OffsetScaled);
  } else {
    // Treat the non-temporal load as a regular vector load in this case for compatibility
    ldr(Dst.Q(), MemReg, Offset);
  }
}
// Reads the host FPCR and writes the rounding control value, in IR layout, to the destination GPR:
//   bits [1:0] = rounding mode in IR::RoundMode order (see the static_asserts below)
//   bit  [2]   = flush-to-zero
DEF_OP(GetRoundingMode) {
  auto Dst = GetReg(Node);
  mrs(Dst, ARMEmitter::SystemRegister::FPCR);
  // Keep only the three bits starting at FPCR bit 22: rounding mode in [1:0], FTZ in [2].
  ubfx(ARMEmitter::Size::i64Bit, Dst, Dst, 22, 3);

  // FTZ is already in the correct location
  // Rounding mode is different
  //
  // Need to remap rounding mode from order nearest, pos inf, neg inf, toward
  // zero. Just swapping 01 and 10. That's a bitfield reverse. Round mode is in
  // bottom two bits. After reversing as a 32-bit operation, it'll be in [31:30]
  // and ripe for reinsertion back at 0.
  static_assert(FEXCore::ToUnderlying(IR::RoundMode::Nearest) == 0);
  static_assert(FEXCore::ToUnderlying(IR::RoundMode::NegInfinity) == 1);
  static_assert(FEXCore::ToUnderlying(IR::RoundMode::PosInfinity) == 2);
  static_assert(FEXCore::ToUnderlying(IR::RoundMode::TowardsZero) == 3);

  rbit(ARMEmitter::Size::i32Bit, TMP1, Dst);
  // Extract TMP1[31:30] and insert it at Dst[1:0], leaving the FTZ bit untouched.
  // BFI would instead insert TMP1[1:0] (always zero here) at Dst[31:30] and leave the mode unswapped.
  bfxil(ARMEmitter::Size::i64Bit, Dst, TMP1, 30, 2);
}
// Produces the current processor ID in the destination GPR.
//  - If the host exposes the CPU index in TPIDRRO_EL0, that register is read directly.
//  - On Windows without that feature this is a fatal error.
//  - Otherwise the getcpu syscall is issued inline and the result is packed as (node << 12) | cpu.
// The inline syscall path spills x8, marks the thread as "in syscall" via CpuStateFrame::InSyscallInfo,
// and uses 16 bytes of host stack as scratch for the two uint32_t outputs.
DEF_OP(ProcessorID) {
  if (CTX->HostFeatures.SupportsCPUIndexInTPIDRRO) {
    mrs(GetReg(Node), ARMEmitter::SystemRegister::TPIDRRO_EL0);
    return;
  }
#ifdef _WIN32
  else {
    // If on Windows and TPIDRRO isn't supported (like in wine), then this is a programming error.
    ERROR_AND_DIE_FMT("Unsupported");
  }
#else
  // We always need to spill x8 since we can't know if it is live at this SSA location
  uint32_t SpillMask = 1U << 8;

  // Ordering is incredibly important here
  // We must spill any overlapping registers first THEN claim we are in a syscall without invalidating state at all
  // Only spill the registers that intersect with our usage
  SpillStaticRegs(TMP1, {
                          .GPRSpillMask = SpillMask,
                          .FPRs = false,
                        });

  // Now that we are spilled, store in the state that we are in a syscall
  // Still without overwriting registers that matter
  // 16bit LoadConstant to be a single instruction
  // We must always spill at least one register (x8) so this value always has a bit set
  // This gives the signal handler a value to check to see if we are in a syscall at all
  LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, SpillMask & 0xFFFF);
  str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, InSyscallInfo));

  // Allocate some temporary space for storing the uint32_t CPU and Node IDs
  sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, 16);

  // Load the getcpu syscall number
#if defined(ARCHITECTURE_x86_64)
  // Just to ensure the syscall number doesn't change if compiled for an x86_64 host.
  constexpr auto GetCPUSyscallNum = 0xa8;
#else
  constexpr auto GetCPUSyscallNum = SYS_getcpu;
#endif
  LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r8, GetCPUSyscallNum);

  // CPU pointer in x0
  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, ARMEmitter::Reg::rsp, 0);
  // Node in x1
  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, ARMEmitter::Reg::rsp, 4);

  svc(0);
  // On updated signal mask we can receive a signal RIGHT HERE

  // Load the values returned by the kernel
  ldp(ARMEmitter::WReg::w0, ARMEmitter::WReg::w1, ARMEmitter::Reg::rsp);
  // Deallocate stack space
  // This must undo the 16-byte `sub` above; subtracting again would lower the host stack pointer by
  // a further 16 bytes on every execution of this op.
  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, 16);

  // Now that we are done in the syscall we need to carefully peel back the state
  // First unspill the registers from before
  FillStaticRegs({
    .OptionalReg = ARMEmitter::Reg::r8,
    .OptionalReg2 = ARMEmitter::Reg::r2,
    .GPRFillMask = SpillMask,
    .FPRs = false,
  });

  // Now the registers we've spilled are back in their original host registers
  // We can safely claim we are no longer in a syscall
  str(ARMEmitter::XReg::zr, STATE, offsetof(FEXCore::Core::CpuStateFrame, InSyscallInfo));

  // Now store the result in the destination in the expected format
  // uint32_t Res = (node << 12) | cpu;
  // CPU is in w0
  // Node is in w1
  orr(ARMEmitter::Size::i64Bit, GetReg(Node), ARMEmitter::Reg::r0, ARMEmitter::Reg::r1, ARMEmitter::ShiftType::LSL, 12);
#endif
}
// Emits a call from JIT code to the host routine stored in CpuStateFrame::Pointers.MonoBackpatcherWrite.
// Argument registers prepared before the call:
//   x0 = STATE, x1 = IR::OpSizeToSize(Op->Size), x2 = Op->Addr, x3 = Op->Value
// Dynamic and static registers are saved around the call and restored afterwards.
DEF_OP(MonoBackpatcherWrite) {
  auto Op = IROp->C();

  // Stage the operands in temporaries before the spill sequences run.
  mov(ARMEmitter::Size::i64Bit, TMP3, GetReg(Op->Addr));
  mov(ARMEmitter::Size::i64Bit, TMP4, GetReg(Op->Value));

  PushDynamicRegs(TMP1);
  SpillStaticRegs(TMP1);

  mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, STATE.R());
  mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, IR::OpSizeToSize(Op->Size));
  if (!TMP_ABIARGS) {
    // NOTE(review): presumably TMP3/TMP4 already are x2/x3 when TMP_ABIARGS is set, making these moves unnecessary — confirm.
    mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, TMP3);
    mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, TMP4);
  }

#ifdef ARCHITECTURE_arm64ec
  // Set the byte at CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET in the CPU area (pointer loaded via x18) to 1 for the duration of the call.
  ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
  LoadConstant(ARMEmitter::Size::i32Bit, TMP1, 1);
  strb(TMP1.W(), TMP2, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
#endif

  ldr(ARMEmitter::XReg::x4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.MonoBackpatcherWrite));
  if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
    // Simulator path: go through the indirect runtime call helper instead of a direct branch.
    GenerateIndirectRuntimeCall(ARMEmitter::Reg::r4);
  } else {
    blr(ARMEmitter::Reg::r4);
  }

#ifdef ARCHITECTURE_arm64ec
  // Clear the flag again; TMP2 is reloaded because the call may have clobbered it.
  ldr(TMP2, ARMEmitter::XReg::x18, TEB_CPU_AREA_OFFSET);
  strb(ARMEmitter::WReg::zr, TMP2, CPU_AREA_IN_SYSCALL_CALLBACK_OFFSET);
#endif

  FillStaticRegs();
  PopDynamicRegs();
}
namespace FEXCore::Context {
// Forward declaration; only needed for the GetNamedSymbolLiteral signature below.
class ContextImpl;
} // namespace FEXCore::Context

namespace FEXCore::CPU {
// Discriminator stored in RelocationHeader::Type. It tells a consumer which member of the
// Relocation union describes the entry.
enum class RelocationTypes : uint32_t {
  // 8 byte literal in memory for symbol
  // Aligned to struct RelocNamedSymbolLiteral
  RELOC_NAMED_SYMBOL_LITERAL,

  // Fixed size named thunk move
  // 4 instruction constant generation
  // Aligned to struct RelocNamedThunkMove
  RELOC_NAMED_THUNK_MOVE,

  // 8 byte literal (relative to binary base address)
  RELOC_GUEST_RIP_LITERAL,

  // Fixed size guest RIP move
  // 4 instruction constant generation
  // Aligned to struct RelocGuestRIP
  RELOC_GUEST_RIP_MOVE,
};

// Common prefix of every relocation record. Declared packed, so no padding is inserted
// between Offset and Type.
struct FEX_PACKED RelocationHeader final {
  // Offset to the relocated host code data
  uint64_t Offset {};

  // Which relocation record follows this header. Not default-initialized here.
  RelocationTypes Type;
};

// Relocation record for a literal that must be resolved to a named symbol.
struct RelocNamedSymbolLiteral final {
  enum class NamedSymbol : uint32_t {
    ///< Thread specific relocations
    // JIT Literal pointers
    SYMBOL_LITERAL_EXITFUNCTION_LINKER,
  };

  RelocationHeader Header {};

  NamedSymbol Symbol;

  // NOTE(review): presumably padding so this record matches the size of the other union members — confirm.
  uint32_t Pad[8];
};

// Relocation record for a constant move whose value is the address of a thunk identified by hash.
struct RelocNamedThunkMove final {
  RelocationHeader Header {};

  // GPR index the constant is being moved to
  uint32_t RegisterIndex;

  // The thunk SHA256 hash
  IR::SHA256Sum Symbol;
};

// Relocation record for a guest RIP, used by both the literal and the move relocation types.
struct RelocGuestRIP final {
  RelocationHeader Header {};

  // GPR index the constant is being moved to (for non-literal relocations)
  uint8_t RegisterIndex;

  char Pad[3];

  // The base RIP (to be moved by the register for non-literal relocations).
  // In a serialized code cache, this is relative to the binary base address.
  uint64_t GuestRIP;

  // NOTE(review): presumably padding to match the size of the other union members — confirm.
  uint32_t pad2[6] {};
};

// Storage for any one relocation record. Every member begins with a RelocationHeader, whose Type
// field selects the active member.
union Relocation {
  // Clang 16 Can't default-initialize this union
  // Returns a zero-filled Relocation. When __clang_major__ is undefined (non-clang compilers) the
  // preprocessor evaluates it as 0, so the memset path is taken there as well.
  static Relocation Default() {
#if __clang_major__ < 17
    Relocation Ret {.Header {}};
    memset(&Ret, 0, sizeof(Ret));
    return Ret;
#else
    return {};
#endif
  }

  RelocationHeader Header {};

  RelocNamedSymbolLiteral NamedSymbolLiteral;
  // This makes our union of relocations at least 48 bytes
  // It might be more efficient to not use a union
  RelocNamedThunkMove NamedThunkMove;

  RelocGuestRIP GuestRIP;
};

// Resolves a NamedSymbol to its 64-bit value for the given context (defined elsewhere).
uint64_t GetNamedSymbolLiteral(FEXCore::Context::ContextImpl&, RelocNamedSymbolLiteral::NamedSymbol);
} // namespace FEXCore::CPU
// DEF_BINOP(FEXOp, ARMOp): defines the JIT handler for the vector IR op FEXOp as a two-source
// element-wise instruction ARMOp.
//  - The element size passed to the emitter comes from ConvertSubRegSize8(IROp).
//  - 256-bit operations are emitted on the SVE Z registers and assert SVE256 support.
//  - Every other size is emitted on the full 128-bit Q registers.
// Comments are kept outside the macro body because a `//` comment on a continued line would
// swallow the lines that follow it.
#define DEF_BINOP(FEXOp, ARMOp)                                                                                                     \
  DEF_OP(FEXOp) {                                                                                                                   \
    const auto Op = IROp->C();                                                                                                      \
    const auto OpSize = IROp->Size;                                                                                                 \
                                                                                                                                    \
    const auto Is256Bit = OpSize == IR::OpSize::i256Bit;                                                                            \
    LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); \
    const auto SubRegSize = ConvertSubRegSize8(IROp);                                                                               \
                                                                                                                                    \
    const auto Dst = GetVReg(Node);                                                                                                 \
    const auto Vector1 = GetVReg(Op->Vector1);                                                                                      \
    const auto Vector2 = GetVReg(Op->Vector2);                                                                                      \
                                                                                                                                    \
    if (HostSupportsSVE256 && Is256Bit) {                                                                                           \
      ARMOp(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());                                                                         \
    } else {                                                                                                                        \
      ARMOp(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());                                                                         \
    }                                                                                                                               \
  }
// DEF_FBINOP(FEXOp, ARMOp): defines the JIT handler for the floating-point vector IR op FEXOp as a
// two-source instruction ARMOp.
//  - 256-bit operations are emitted on the SVE Z registers and assert SVE256 support.
//  - When the element size equals the operation size the op is scalar and is emitted on the
//    H/S/D register view matching the element size.
//  - Otherwise the op is emitted on the full 128-bit Q registers with the element size from
//    ConvertSubRegSize248(IROp).
// NOTE(review): in the scalar case an element size other than 16/32/64-bit falls into `default`
// and emits nothing — confirm that this cannot be reached.
#define DEF_FBINOP(FEXOp, ARMOp)                                                                                                    \
  DEF_OP(FEXOp) {                                                                                                                   \
    const auto Op = IROp->C();                                                                                                      \
    const auto OpSize = IROp->Size;                                                                                                 \
                                                                                                                                    \
    const auto ElementSize = Op->Header.ElementSize;                                                                                \
    const auto SubRegSize = ConvertSubRegSize248(IROp);                                                                             \
    const auto Is256Bit = OpSize == IR::OpSize::i256Bit;                                                                            \
    LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); \
    const auto IsScalar = ElementSize == OpSize;                                                                                    \
                                                                                                                                    \
    const auto Dst = GetVReg(Node);                                                                                                 \
    const auto Vector1 = GetVReg(Op->Vector1);                                                                                      \
    const auto Vector2 = GetVReg(Op->Vector2);                                                                                      \
                                                                                                                                    \
    if (HostSupportsSVE256 && Is256Bit) {                                                                                           \
      ARMOp(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());                                                                         \
    } else {                                                                                                                        \
      if (IsScalar) {                                                                                                               \
        switch (ElementSize) {                                                                                                      \
        case IR::OpSize::i16Bit: {                                                                                                  \
          ARMOp(Dst.H(), Vector1.H(), Vector2.H());                                                                                 \
          break;                                                                                                                    \
        }                                                                                                                           \
        case IR::OpSize::i32Bit: {                                                                                                  \
          ARMOp(Dst.S(), Vector1.S(), Vector2.S());                                                                                 \
          break;                                                                                                                    \
        }                                                                                                                           \
        case IR::OpSize::i64Bit: {                                                                                                  \
          ARMOp(Dst.D(), Vector1.D(), Vector2.D());                                                                                 \
          break;                                                                                                                    \
        }                                                                                                                           \
        default: break;                                                                                                             \
        }                                                                                                                           \
      } else {                                                                                                                      \
        ARMOp(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());                                                                       \
      }                                                                                                                             \
    }                                                                                                                               \
  }
// DEF_FMAOP_SCALAR_INSERT(FEXOp, ARMOp): defines the JIT handler for a scalar fused multiply-add IR
// op whose result is inserted into the lowest element of a vector.
//  - ScalarEmit wraps the four-register scalar instruction ARMOp on the H/S/D register view that
//    matches the element size; other element sizes emit nothing.
//  - The operand registers and the emitter are handed to VFScalarFMAOperation, which decides how
//    the result and the upper bits from Op->Upper end up in the destination.
#define DEF_FMAOP_SCALAR_INSERT(FEXOp, ARMOp)                                                                             \
  DEF_OP(FEXOp) {                                                                                                         \
    const auto Op = IROp->C();                                                                                            \
    const auto ElementSize = Op->Header.ElementSize;                                                                      \
                                                                                                                          \
    auto ScalarEmit = [this, ElementSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2, \
                                          ARMEmitter::VRegister Src3) {                                                   \
      if (ElementSize == IR::OpSize::i16Bit) {                                                                            \
        ARMOp(Dst.H(), Src1.H(), Src2.H(), Src3.H());                                                                     \
      } else if (ElementSize == IR::OpSize::i32Bit) {                                                                     \
        ARMOp(Dst.S(), Src1.S(), Src2.S(), Src3.S());                                                                     \
      } else if (ElementSize == IR::OpSize::i64Bit) {                                                                     \
        ARMOp(Dst.D(), Src1.D(), Src2.D(), Src3.D());                                                                     \
      }                                                                                                                   \
    };                                                                                                                    \
                                                                                                                          \
    const auto Dst = GetVReg(Node);                                                                                       \
    const auto Upper = GetVReg(Op->Upper);                                                                                \
    const auto Vector1 = GetVReg(Op->Vector1);                                                                            \
    const auto Vector2 = GetVReg(Op->Vector2);                                                                            \
    const auto Addend = GetVReg(Op->Addend);                                                                              \
                                                                                                                          \
    VFScalarFMAOperation(IROp->Size, ElementSize, ScalarEmit, Dst, Upper, Vector1, Vector2, Addend);                      \
  }
DEF_FBINOP(VFAdd, fadd) DEF_FBINOP(VFSub, fsub) DEF_FBINOP(VFMul, fmul) DEF_FBINOP_SCALAR_INSERT(VFAddScalarInsert, fadd) DEF_FBINOP_SCALAR_INSERT(VFSubScalarInsert, fsub) DEF_FBINOP_SCALAR_INSERT(VFMulScalarInsert, fmul) DEF_FBINOP_SCALAR_INSERT(VFDivScalarInsert, fdiv) DEF_FMAOP_SCALAR_INSERT(VFMLAScalarInsert, fmadd) DEF_FMAOP_SCALAR_INSERT(VFMLSScalarInsert, fnmsub) DEF_FMAOP_SCALAR_INSERT(VFNMLAScalarInsert, fmsub) DEF_FMAOP_SCALAR_INSERT(VFNMLSScalarInsert, fnmadd) void Arm64JITCore::VFScalarFMAOperation(IR::OpSize OpSize, IR::OpSize ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Upper, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend) { LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit, "256-bit unsupported", __func__); LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid " "size"); const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : ARMEmitter::SubRegSize::i64Bit); if (Dst != Upper) { // If destination is not tied, move the upper bits to the destination first. mov(Dst.Q(), Upper.Q()); } if (HostSupportsAFP && Dst == Addend) { ///< Exactly matches ARM scalar FMA semantics // If the host CPU supports AFP then scalar does an insert without modifying upper bits. ScalarEmit(Dst, Vector1, Vector2, Addend); } else { // Host doesn't support AFP, need to emit in to a temporary then insert. ScalarEmit(VTMP1, Vector1, Vector2, Addend); ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } // VFScalarOperation performs the operation described through ScalarEmit between Vector1 and Vector2, // storing it into Dst. This is a scalar operation, so the only lowest element of each vector is used for the operation. // The result is stored into the destination. 
// VFScalarOperation performs the operation described through ScalarEmit between Vector1 and Vector2,
// storing it into Dst. This is a scalar operation, so only the lowest element of each vector is used.
// The untouched bits of the destination come from Vector1, unless it's a 256-bit vector and
// ZeroUpperBits is true, in which case the bits above the low 128 are zero.
//
// OpSize:        operation width; 256-bit requires SVE256 support.
// ElementSize:   16, 32 or 64-bit scalar element.
// ZeroUpperBits: only permitted together with a 256-bit OpSize.
// ScalarEmit:    callback that emits the scalar instruction as (destination, source1, source2).
//
// Three register-aliasing cases are handled separately so that Vector2 is never overwritten before
// the scalar operation has read it.
void Arm64JITCore::VFScalarOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
                                     ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2) {
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__);

  // Bit of a tricky detail.
  // The upper bits of the destination comes from Vector1.
  LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid "
                                                                                                                                  "size");
  const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit :
                                                       ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit :
                                                                                           ARMEmitter::SubRegSize::i64Bit);

  // Predicate register used for the single-element merging move in the 256-bit paths.
  // It is overwritten by the ptrue below.
  constexpr auto Predicate = ARMEmitter::PReg::p0;

  if (Dst == Vector1) {
    // Case 1: destination already holds Vector1, so its upper bits are already in place.
    if (ZeroUpperBits) {
      // When zeroing the upper 128-bits we just use an ASIMD move.
      mov(Dst.Q(), Vector1.Q());
    }

    if (HostSupportsAFP) {
      // If the host CPU supports AFP then scalar does an insert without modifying upper bits.
      ScalarEmit(Dst, Vector1, Vector2);
    } else {
      // If AFP is unsupported then the operation result goes in to a temporary.
      // and then it gets inserted.
      ScalarEmit(VTMP1, Vector1, Vector2);
      if (!ZeroUpperBits && Is256Bit) {
        // Merge only the first element (SVE_VL1 predicate) so the full 256-bit destination is preserved.
        ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
        mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
      } else {
        ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
      }
    }
  } else if (Dst != Vector2) { // Dst different from both Vector1 and Vector2
    // Case 2: destination aliases neither source, so Vector1 can be copied in first.
    if (Is256Bit && !ZeroUpperBits) {
      mov(Dst.Z(), Vector1.Z());
    } else {
      mov(Dst.Q(), Vector1.Q());
    }

    if (HostSupportsAFP) {
      ScalarEmit(Dst, Vector1, Vector2);
    } else {
      ScalarEmit(VTMP1, Vector1, Vector2);
      if (!ZeroUpperBits && Is256Bit) {
        ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
        mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
      } else {
        ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
      }
    }
  } else { // Dst same as Vector2
    // Case 3: copying Vector1 into Dst would destroy Vector2, so compute into the temporary first.
    ScalarEmit(VTMP1, Vector1, Vector2);

    if (!ZeroUpperBits && Is256Bit) {
      mov(Dst.Z(), Vector1.Z());
      ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1);
      mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z());
    } else {
      mov(Dst.Q(), Vector1.Q());
      ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
    }
  }
}
void Arm64JITCore::VFScalarUnaryOperation(IR::OpSize OpSize, IR::OpSize ElementSize, bool ZeroUpperBits, ScalarUnaryOpCaller ScalarEmit, ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, std::variant Vector2) { const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); LOGMAN_THROW_A_FMT(Is256Bit || !ZeroUpperBits, "128-bit operation doesn't support ZeroUpperBits in {}", __func__); LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i16Bit || ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid " "size"); const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i16Bit : ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i32Bit : ARMEmitter::SubRegSize::i64Bit); constexpr auto Predicate = ARMEmitter::PReg::p0; bool DstOverlapsVector2 = false; if (const auto* Vector2Reg = std::get_if(&Vector2)) { DstOverlapsVector2 = Dst == *Vector2Reg; } if (Dst == Vector1) { if (ZeroUpperBits) { // When zeroing the upper 128-bits we just use an ASIMD move. mov(Dst.Q(), Vector1.Q()); } if (HostSupportsAFP) { // or Dst (here Dst == Vector1) // If the host CPU supports AFP then scalar does an insert without modifying upper bits. ScalarEmit(Dst, Vector2); } else { // If AFP is unsupported then the operation result goes in to a temporary. // and then it gets inserted. 
ScalarEmit(VTMP1, Vector2); if (!ZeroUpperBits && Is256Bit) { ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } } else if (!DstOverlapsVector2) { if (!ZeroUpperBits && Is256Bit) { mov(Dst.Z(), Vector1.Z()); } else { mov(Dst.Q(), Vector1.Q()); } if (HostSupportsAFP) { ScalarEmit(Dst, Vector2); } else { ScalarEmit(VTMP1, Vector2); if (!ZeroUpperBits && Is256Bit) { ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } } else { // Destination intersects Vector2, can't do anything optimal in this case. // Do the scalar operation first and then move and insert. ScalarEmit(VTMP1, Vector2); if (!ZeroUpperBits && Is256Bit) { mov(Dst.Z(), Vector1.Z()); ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { mov(Dst.Q(), Vector1.Q()); ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } } DEF_OP(VFMinScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); auto ScalarEmit = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { if (HostSupportsAFP) { // AFP.AH lets fmin behave like x86 min fmin(SubRegSize.Scalar, Dst, Src1, Src2); } else { // Only take the first operand if it is strictly less. Otherwise take // the second. This emulates all the weird x86 rules for signed zero and // NaNs. No, they're not IEEE-754 semantics. fcmp(SubRegSize.Scalar, Src1, Src2); fcsel(SubRegSize.Scalar, Dst, Src1, Src2, ARMEmitter::Condition::CC_MI); } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); } DEF_OP(VFMaxScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); // AFP can make this more optimal. auto ScalarEmit = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { if (HostSupportsAFP) { // AFP.AH lets fmax behave like x86 max fmax(SubRegSize.Scalar, Dst, Src1, Src2); } else { // Only take the first operand if it is strictly greater. See fmin. fcmp(SubRegSize.Scalar, Src1, Src2); fcsel(SubRegSize.Scalar, Dst, Src1, Src2, ARMEmitter::Condition::CC_GT); } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); } DEF_OP(VFSqrtScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); auto ScalarEmit = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); fsqrt(SubRegSize.Scalar, Dst, Src); }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); } DEF_OP(VFRSqrtScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); auto ScalarEmit = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f); fsqrt(SubRegSize.Scalar, VTMP2, Src); if (HostSupportsAFP) { fdiv(SubRegSize.Scalar, VTMP1, VTMP1, VTMP2); ins(SubRegSize.Vector, Dst, 0, VTMP1, 0); } else { fdiv(SubRegSize.Scalar, Dst, VTMP1, VTMP2); } }; auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); frsqrte(SubRegSize.Scalar, Dst.D(), Src.D()); }; std::array Handlers = { ScalarEmit, ScalarEmitRPRES, }; const auto HandlerIndex = ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES ? 1 : 0; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, Handlers[HandlerIndex], Dst, Vector1, Vector2); } DEF_OP(VFRecpScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); auto ScalarEmit = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f); if (HostSupportsAFP) { fdiv(SubRegSize.Scalar, VTMP1, VTMP1, Src); ins(SubRegSize.Vector, Dst, 0, VTMP1, 0); } else { fdiv(SubRegSize.Scalar, Dst, VTMP1, Src); } }; auto ScalarEmitRPRES = [this, SubRegSize](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); frecpe(SubRegSize.Scalar, Dst, Src); }; std::array Handlers = { ScalarEmit, ScalarEmitRPRES, }; const auto HandlerIndex = ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES ? 1 : 0; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, Handlers[HandlerIndex], Dst, Vector1, Vector2); } DEF_OP(VFToFScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const uint16_t Conv = (IR::OpSizeToSize(Op->Header.ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto ScalarEmit = [this, Conv](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); switch (Conv) { case 0x0204: { // Half <- Float fcvt(Dst.H(), Src.S()); break; } case 0x0208: { // Half <- Double fcvt(Dst.H(), Src.D()); break; } case 0x0402: { // Float <- Half fcvt(Dst.S(), Src.H()); break; } case 0x0802: { // Double <- Half fcvt(Dst.D(), Src.H()); break; } case 0x0804: { // Double <- Float fcvt(Dst.D(), Src.S()); break; } case 0x0408: { // Float <- Double fcvt(Dst.S(), Src.D()); break; } default: LOGMAN_MSG_A_FMT("Unknown FCVT sizes: 0x{:x}", Conv); } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); } DEF_OP(VSToFVectorInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto HasTwoElements = Op->HasTwoElements; LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i32Bit || ElementSize == IR::OpSize::i64Bit, "Invalid size"); if (HasTwoElements) { LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i32Bit, "Can't have two elements for 8-byte size"); } auto ScalarEmit = [this, ElementSize, HasTwoElements](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); if (ElementSize == IR::OpSize::i32Bit) { if (HasTwoElements) { scvtf(ARMEmitter::SubRegSize::i32Bit, Dst.D(), Src.D()); } else { scvtf(ARMEmitter::ScalarRegSize::i32Bit, Dst.S(), Src.S()); } } else { scvtf(ARMEmitter::ScalarRegSize::i64Bit, Dst.D(), Src.D()); } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); // Claim the element size is 8-bytes. // Might be scalar 8-byte (cvtsi2ss xmm0, rax) // Might be vector i32v2 (cvtpi2ps xmm0, mm0) if (!HasTwoElements) { VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); return; } // Dealing with the odd case of this being actually a vector operation rather than scalar. 
const auto Is256Bit = IROp->Size == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); constexpr auto Predicate = ARMEmitter::PReg::p0; ScalarEmit(VTMP1, Vector2); if (!Op->ZeroUpperBits && Is256Bit) { if (Dst != Vector1) { mov(Dst.Z(), Vector1.Z()); } ptrue(ARMEmitter::SubRegSize::i64Bit, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { if (Dst != Vector1) { mov(Dst.Q(), Vector1.Q()); } ins(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0, VTMP1.Q(), 0); } } DEF_OP(VSToFGPRInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const uint16_t Conv = (IR::OpSizeToSize(ElementSize) << 8) | IR::OpSizeToSize(Op->SrcElementSize); auto ScalarEmit = [this, Conv](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); switch (Conv) { case 0x0204: { // Half <- int32_t scvtf(ARMEmitter::Size::i32Bit, Dst.H(), Src); break; } case 0x0208: { // Half <- int64_t scvtf(ARMEmitter::Size::i64Bit, Dst.H(), Src); break; } case 0x0404: { // Float <- int32_t scvtf(ARMEmitter::Size::i32Bit, Dst.S(), Src); break; } case 0x0408: { // Float <- int64_t scvtf(ARMEmitter::Size::i64Bit, Dst.S(), Src); break; } case 0x0804: { // Double <- int32_t scvtf(ARMEmitter::Size::i32Bit, Dst.D(), Src); break; } case 0x0808: { // Double <- int64_t scvtf(ARMEmitter::Size::i64Bit, Dst.D(), Src); break; } default: LOGMAN_MSG_A_FMT("Unhandled conversion mask: Mask=0x{:04x}", Conv); break; } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); const auto GPR = GetReg(Op->Src); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector, GPR); } DEF_OP(VFToIScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto RoundMode = Op->Round; auto ScalarEmit = [this, SubRegSize, RoundMode](ARMEmitter::VRegister Dst, std::variant SrcVar) { auto Src = *std::get_if(&SrcVar); switch (RoundMode) { case IR::RoundMode::Nearest: frintn(SubRegSize.Scalar, Dst, Src); break; case IR::RoundMode::NegInfinity: frintm(SubRegSize.Scalar, Dst, Src); break; case IR::RoundMode::PosInfinity: frintp(SubRegSize.Scalar, Dst, Src); break; case IR::RoundMode::TowardsZero: frintz(SubRegSize.Scalar, Dst, Src); break; case IR::RoundMode::Host: frinti(SubRegSize.Scalar, Dst, Src); break; } }; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarUnaryOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); } DEF_OP(VFCMPScalarInsert) { const auto Op = IROp->C(); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto ZeroUpperBits = Op->ZeroUpperBits; const auto Is256Bit = IROp->Size == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); auto ScalarEmitEQ = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case ARMEmitter::ScalarRegSize::i16Bit: { fcmeq(Dst.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Src2, Src1); break; default: break; } }; auto ScalarEmitLT = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case ARMEmitter::ScalarRegSize::i16Bit: { fcmgt(Dst.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmgt(SubRegSize.Scalar, Dst, Src2, Src1); break; default: break; } }; auto ScalarEmitLE = [this, SubRegSize](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case ARMEmitter::ScalarRegSize::i16Bit: { fcmge(Dst.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmge(SubRegSize.Scalar, Dst, Src2, Src1); break; default: break; } }; auto ScalarEmitUNO = [this, SubRegSize, ZeroUpperBits, Is256Bit](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case 
ARMEmitter::ScalarRegSize::i16Bit: { fcmge(VTMP1.H(), Src1.H(), Src2.H()); fcmgt(VTMP2.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Src1, Src2); fcmgt(SubRegSize.Scalar, VTMP2, Src2, Src1); break; default: break; } // If the destination is a temporary then it is going to do an insert after the operation. // This means this operation can avoid a redundant insert in this case. const bool DstIsTemp = Dst == VTMP1; // Combine results and invert directly in VTMP1. orr(VTMP1.D(), VTMP1.D(), VTMP2.D()); mvn(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); if (!DstIsTemp) { // If the destination doesn't overlap VTMP1, then we need to insert the final result. // This only happens in the case that the host supports AFP. if (!ZeroUpperBits && Is256Bit) { constexpr auto Predicate = ARMEmitter::PReg::p0; ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } }; auto ScalarEmitNEQ = [this, SubRegSize, ZeroUpperBits, Is256Bit](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case ARMEmitter::ScalarRegSize::i16Bit: { fcmeq(VTMP1.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmeq(SubRegSize.Scalar, VTMP1, Src2, Src1); break; default: break; } // If the destination is a temporary then it is going to do an insert after the operation. // This means this operation can avoid a redundant insert in this case. const bool DstIsTemp = Dst == VTMP1; // Invert directly in VTMP1. mvn(ARMEmitter::SubRegSize::i8Bit, VTMP1.D(), VTMP1.D()); if (!DstIsTemp) { // If the destination doesn't overlap VTMP1, then we need to insert the final result. // This only happens in the case that the host supports AFP. 
if (!ZeroUpperBits && Is256Bit) { constexpr auto Predicate = ARMEmitter::PReg::p0; ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } }; auto ScalarEmitORD = [this, SubRegSize, ZeroUpperBits, Is256Bit](ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2) { switch (SubRegSize.Scalar) { case ARMEmitter::ScalarRegSize::i16Bit: { fcmge(VTMP1.H(), Src1.H(), Src2.H()); fcmgt(VTMP2.H(), Src2.H(), Src1.H()); break; } case ARMEmitter::ScalarRegSize::i32Bit: case ARMEmitter::ScalarRegSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Src1, Src2); fcmgt(SubRegSize.Scalar, VTMP2, Src2, Src1); break; default: break; } // If the destination is a temporary then it is going to do an insert after the operation. // This means this operation can avoid a redundant insert in this case. const bool DstIsTemp = Dst == VTMP1; // Combine results directly in VTMP1. orr(VTMP1.D(), VTMP1.D(), VTMP2.D()); if (!DstIsTemp) { // If the destination doesn't overlap VTMP1, then we need to insert the final result. // This only happens in the case that the host supports AFP. if (!ZeroUpperBits && Is256Bit) { constexpr auto Predicate = ARMEmitter::PReg::p0; ptrue(SubRegSize.Vector, Predicate, ARMEmitter::PredicatePattern::SVE_VL1); mov(SubRegSize.Vector, Dst.Z(), Predicate.Merging(), VTMP1.Z()); } else { ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0); } } }; std::array Funcs = {{ ScalarEmitEQ, ScalarEmitLT, ScalarEmitLE, ScalarEmitUNO, ScalarEmitNEQ, ScalarEmitORD, }}; // Bit of a tricky detail. // The upper bits of the destination comes from the first source. 
const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); VFScalarOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, Funcs[FEXCore::ToUnderlying(Op->Op)], Dst, Vector1, Vector2); } DEF_OP(VectorImm) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Dst = GetVReg(Node); if (HostSupportsSVE256 && Is256Bit) { LOGMAN_THROW_A_FMT(Op->ShiftAmount == 0, "SVE VectorImm doesn't support a shift"); if (ElementSize > IR::OpSize::i8Bit && (Op->Immediate & 0x80)) { // SVE dup uses sign extension where VectorImm wants zext LoadConstant(ARMEmitter::Size::i64Bit, TMP1, Op->Immediate); dup(SubRegSize, Dst.Z(), TMP1); } else { dup_imm(SubRegSize, Dst.Z(), static_cast(Op->Immediate)); } } else { if (ElementSize == IR::OpSize::i64Bit) { // movi with 64bit element size doesn't do what we want here LoadConstant(ARMEmitter::Size::i64Bit, TMP1, static_cast(Op->Immediate) << Op->ShiftAmount); dup(SubRegSize, Dst.Q(), TMP1.R()); } else { movi(SubRegSize, Dst.Q(), Op->Immediate, Op->ShiftAmount); } } } DEF_OP(LoadNamedVectorConstant) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Dst = GetVReg(Node); switch (Op->Constant) { case FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_ZERO: movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0); return; default: // Intentionally doing nothing. break; } if (HostSupportsSVE128) { switch (Op->Constant) { case FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_MOVMSKPS_SHIFT: index(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), 0, 1); return; default: // Intentionally doing nothing. break; } } // Load the pointer. 
auto GenerateMemOperand = [this](IR::OpSize OpSize, uint32_t NamedConstant, ARMEmitter::Register Base) { const auto ConstantOffset = ARRAY_OFFSETOF(FEXCore::Core::CpuStateFrame, Pointers.NamedVectorConstants, NamedConstant); if (ConstantOffset <= 255 || // Unscaled 9-bit signed ((ConstantOffset & (IR::OpSizeToSize(OpSize) - 1)) == 0 && FEXCore::DividePow2(ConstantOffset, IR::OpSizeToSize(OpSize)) <= 4095)) /* 12-bit unsigned scaled */ { return ARMEmitter::ExtendedMemOperand(Base.X(), ARMEmitter::IndexType::OFFSET, ConstantOffset); } ldr(TMP1, STATE_PTR_IDX(CpuStateFrame, Pointers.NamedVectorConstantPointers, NamedConstant)); return ARMEmitter::ExtendedMemOperand(TMP1, ARMEmitter::IndexType::OFFSET, 0); }; if (OpSize == IR::OpSize::i256Bit) { // Handle SVE 32-byte variant upfront. ldr(TMP1, STATE_PTR_IDX(CpuStateFrame, Pointers.NamedVectorConstantPointers, Op->Constant)); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, 0); return; } auto MemOperand = GenerateMemOperand(OpSize, Op->Constant, STATE); switch (OpSize) { case IR::OpSize::i8Bit: ldrb(Dst, MemOperand); break; case IR::OpSize::i16Bit: ldrh(Dst, MemOperand); break; case IR::OpSize::i32Bit: ldr(Dst.S(), MemOperand); break; case IR::OpSize::i64Bit: ldr(Dst.D(), MemOperand); break; case IR::OpSize::i128Bit: ldr(Dst.Q(), MemOperand); break; default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, OpSize); break; } } DEF_OP(LoadNamedVectorIndexedConstant) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Dst = GetVReg(Node); // Load the pointer. 
ldr(TMP1, STATE_PTR_IDX(CpuStateFrame, Pointers.IndexedNamedVectorConstantPointers, Op->Constant)); switch (OpSize) { case IR::OpSize::i8Bit: ldrb(Dst, TMP1, Op->Index); break; case IR::OpSize::i16Bit: ldrh(Dst, TMP1, Op->Index); break; case IR::OpSize::i32Bit: ldr(Dst.S(), TMP1, Op->Index); break; case IR::OpSize::i64Bit: ldr(Dst.D(), TMP1, Op->Index); break; case IR::OpSize::i128Bit: ldr(Dst.Q(), TMP1, Op->Index); break; case IR::OpSize::i256Bit: { add(ARMEmitter::Size::i64Bit, TMP1, TMP1, Op->Index); ld1b(Dst.Z(), PRED_TMP_32B.Zeroing(), TMP1, 0); break; } default: LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, OpSize); break; } } DEF_OP(VMov) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Dst = GetVReg(Node); const auto Source = GetVReg(Op->Source); switch (OpSize) { case IR::OpSize::i8Bit: { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i8Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } case IR::OpSize::i16Bit: { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i16Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } case IR::OpSize::i32Bit: { movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); ins(ARMEmitter::SubRegSize::i32Bit, VTMP1, 0, Source, 0); mov(Dst.Q(), VTMP1.Q()); break; } case IR::OpSize::i64Bit: { mov(Dst.D(), Source.D()); break; } case IR::OpSize::i128Bit: { if (HostSupportsSVE256 || Dst.Idx() != Source.Idx()) { mov(Dst.Q(), Source.Q()); } break; } case IR::OpSize::i256Bit: { // NOTE: If, in the distant future we support larger moves, or registers // (*cough* AVX-512 *cough*) make sure to change this to treat // 256-bit moves with zero extending behavior instead of doing only // a regular SVE move into a 512-bit register. 
if (Dst.Idx() != Source.Idx()) { mov(Dst.Z(), Source.Z()); } break; } default: LOGMAN_MSG_A_FMT("Unknown Op Size: {}", OpSize); break; } } DEF_OP(VAddP) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto IsScalar = OpSize == IR::OpSize::i64Bit; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Dst = GetVReg(Node); const auto VectorLower = GetVReg(Op->VectorLower); const auto VectorUpper = GetVReg(Op->VectorUpper); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); // SVE ADDP is a destructive operation, so we need a temporary movprfx(VTMP1.Z(), VectorLower.Z()); // Unlike Adv. SIMD's version of ADDP, which acts like it concats the // upper vector onto the end of the lower vector and then performs // pairwise addition, the SVE version actually interleaves the // results of the pairwise addition (gross!), so we need to undo that. addp(SubRegSize, VTMP1.Z(), Pred, VTMP1.Z(), VectorUpper.Z()); uzp1(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP1.Z()); uzp2(SubRegSize, VTMP2.Z(), VTMP1.Z(), VTMP1.Z()); // Merge upper half with lower half. 
splice(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), PRED_TMP_16B, Dst.Z(), VTMP2.Z()); } else { if (IsScalar) { addp(SubRegSize, Dst.D(), VectorLower.D(), VectorUpper.D()); } else { addp(SubRegSize, Dst.Q(), VectorLower.Q(), VectorUpper.Q()); } } } DEF_OP(VOrn) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; const auto Is128Bit = OpSize == IR::OpSize::i128Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); not_(ARMEmitter::SubRegSize::i8Bit, VTMP1.Z(), Pred, Vector2.Z()); orr(Dst.Z(), Vector1.Z(), VTMP1.Z()); } else if (Is128Bit) { orn(Dst.Q(), Vector1.Q(), Vector2.Q()); } else { orn(Dst.D(), Vector1.D(), Vector2.D()); } } DEF_OP(VFAddV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); LOGMAN_THROW_A_FMT(OpSize == IR::OpSize::i128Bit || OpSize == IR::OpSize::i256Bit, "Only AVX and SSE size supported"); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); faddv(SubRegSize.Vector, Dst, Pred, Vector.Z()); } if (HostSupportsSVE128) { const auto Pred = PRED_TMP_16B.Merging(); faddv(SubRegSize.Vector, Dst, Pred, Vector.Z()); } else { // ASIMD doesn't support faddv, need to use multiple faddp to match behaviour. 
if (ElementSize == IR::OpSize::i32Bit) { faddp(SubRegSize.Vector, Dst.Q(), Vector.Q(), Vector.Q()); faddp(SubRegSize.Scalar, Dst, Dst); } else { faddp(SubRegSize.Scalar, Dst, Vector); } } } DEF_OP(VAddV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { // SVE doesn't have an equivalent ADDV instruction, so we make do // by performing two Adv. SIMD ADDV operations on the high and low // 128-bit lanes and then sum them up. const auto Mask = PRED_TMP_32B.Zeroing(); const auto CompactPred = ARMEmitter::PReg::p0; // Select all our upper elements to run ADDV over them. not_(CompactPred, Mask, PRED_TMP_16B); compact(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), CompactPred, Vector.Z()); addv(SubRegSize.Vector, VTMP2.Q(), Vector.Q()); addv(SubRegSize.Vector, VTMP1.Q(), VTMP1.Q()); add(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q()); } else { if (ElementSize == IR::OpSize::i64Bit) { addp(SubRegSize.Scalar, Dst, Vector); } else { addv(SubRegSize.Vector, Dst.Q(), Vector.Q()); } } } DEF_OP(VUMinV) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B; uminv(SubRegSize, Dst, Pred, Vector.Z()); } else { // Vector uminv(SubRegSize, Dst.Q(), Vector.Q()); } } DEF_OP(VUMaxV) { const auto Op = IROp->C(); const 
auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize8(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    // Predicated SVE reduction (unsigned maximum across the vector).
    const auto Pred = PRED_TMP_32B;
    umaxv(SubRegSize, Dst, Pred, Vector.Z());
  } else {
    // Vector
    umaxv(SubRegSize, Dst.Q(), Vector.Q());
  }
}

// Unsigned rounding halving add (URHADD) of Vector1 and Vector2.
// The SVE form is destructive (first source == destination), so the
// 256-bit path distinguishes the possible aliasing cases.
DEF_OP(VURAvg) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize8(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // Trivial cases where we already have source data to be averaged in
    // the destination register. We can just do the operation in place.
    if (Dst == Vector1) {
      urhadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
    } else if (Dst == Vector2) {
      urhadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector1.Z());
    } else {
      // SVE URHADD is a destructive operation, but we know that
      // we don't have any source/destination aliasing happening here
      // so we can safely move one of the source operands into the destination.
movprfx(Dst.Z(), Vector1.Z());
      urhadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
    }
  } else {
    // The Adv. SIMD form takes separate destination and source operands,
    // so no aliasing handling is required here.
    urhadd(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
  }
}

// Floating-point pairwise add of VectorLower and VectorUpper.
DEF_OP(VFAddP) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize248(IROp);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  const auto VectorUpper = GetVReg(Op->VectorUpper);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();

    // SVE FADDP is a destructive operation, so we need a temporary
    movprfx(VTMP1.Z(), VectorLower.Z());

    // Unlike Adv. SIMD's version of FADDP, which acts like it concats the
    // upper vector onto the end of the lower vector and then performs
    // pairwise addition, the SVE version actually interleaves the
    // results of the pairwise addition (gross!), so we need to undo that.
    faddp(SubRegSize, VTMP1.Z(), Pred, VTMP1.Z(), VectorUpper.Z());
    // De-interleave: even-indexed results into Dst, odd-indexed into VTMP2.
    uzp1(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP1.Z());
    uzp2(SubRegSize, VTMP2.Z(), VTMP1.Z(), VTMP1.Z());

    // Merge upper half with lower half.
splice(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), PRED_TMP_16B, Dst.Z(), VTMP2.Z());
  } else {
    faddp(SubRegSize, Dst.Q(), VectorLower.Q(), VectorUpper.Q());
  }
}

// Floating-point division: Dst = Vector1 / Vector2.
// An operation whose element size equals its operation size is treated as scalar.
DEF_OP(VFDiv) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // The SVE FDIV form used here is destructive (dividend == destination)
    // and division is not commutative, so each aliasing case differs.
    if (Dst == Vector1) {
      // Trivial case where we already have source data to be divided in the
      // destination register. We can just divide by Vector2 and be done with it.
      fdiv(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
    } else if (Dst == Vector2) {
      // If the destination aliases the second vector, then we need
      // to use a temp.
      movprfx(VTMP1.Z(), Vector1.Z());
      fdiv(SubRegSize, VTMP1.Z(), Mask, VTMP1.Z(), Vector2.Z());
      mov(Dst.Z(), VTMP1.Z());
    } else {
      // If no registers alias the destination, then we can move directly
      // into the destination and then divide.
movprfx(Dst.Z(), Vector1.Z());
      fdiv(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
    }
  } else {
    if (IsScalar) {
      // Scalar division on the H/S/D view that matches the element size.
      // Any other element size emits nothing.
      switch (ElementSize) {
      case IR::OpSize::i16Bit: {
        fdiv(Dst.H(), Vector1.H(), Vector2.H());
        break;
      }
      case IR::OpSize::i32Bit: {
        fdiv(Dst.S(), Vector1.S(), Vector2.S());
        break;
      }
      case IR::OpSize::i64Bit: {
        fdiv(Dst.D(), Vector1.D(), Vector2.D());
        break;
      }
      default: break;
      }
    } else {
      fdiv(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
    }
  }
}

// Floating-point minimum of Vector1 and Vector2, built from a compare and
// a select instead of FMIN (except when AFP is available; see the note below).
DEF_OP(VFMin) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  // NOTE: We don't directly use FMIN** here for any of the implementations,
  //       because it has undesirable NaN handling behavior (it sets
  //       entries either to the incoming NaN value*, or the default NaN
  //       depending on FPCR flags set). We want behavior that sets NaN
  //       entries to zero for the comparison result.
  //
  // * - Not exactly (differs slightly with SNaNs), but close enough for the explanation
  // ** - Unless the host supports AFP.AH, which allows FMIN/FMAX to select the second source element as expected of x86.
  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B;
    const auto ComparePred = ARMEmitter::PReg::p0;

    // General idea:
    // 1. Compare greater than against the two vectors
    // 2. Invert the resulting values in the predicate register.
    // 3. Move the first vector into a temporary
    // 4. Merge all the elements that correspond to the inverted
    //    predicate bits from the second vector into the
    //    same temporary.
    // 5. Move temporary into the destination register and we're done.
fcmgt(SubRegSize, ComparePred, Mask.Zeroing(), Vector2.Z(), Vector1.Z());
    not_(ComparePred, Mask.Zeroing(), ComparePred);

    if (Dst == Vector1) {
      // Trivial case where Vector1 is also the destination.
      // We don't need to move any data around in this case (aside from the merge).
      mov(SubRegSize, Dst.Z(), ComparePred.Merging(), Vector2.Z());
    } else {
      // Build the result in VTMP1 so that Dst is only written once both
      // sources have been read (Dst may alias Vector2 here).
      mov(VTMP1.Z(), Vector1.Z());
      mov(SubRegSize, VTMP1.Z(), ComparePred.Merging(), Vector2.Z());
      mov(Dst.Z(), VTMP1.Z());
    }
  } else {
    LOGMAN_THROW_A_FMT(!IsScalar, "should use VFMinScalarInsert instead");

    if (HostSupportsAFP) {
      // AFP.AH lets fmin behave like x86 min
      fmin(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
      return;
    }

    // VTMP1 receives an all-ones/all-zeros mask per element from FCMGT;
    // BIF then inserts the other source wherever that mask is zero.
    if (Dst == Vector1) {
      // Destination is already Vector1, need to insert Vector2 on false.
      fcmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      bif(Dst.Q(), Vector2.Q(), VTMP1.Q());
    } else if (Dst == Vector2) {
      // Destination is already Vector2, Invert arguments and insert Vector1 on false.
      fcmgt(SubRegSize, VTMP1.Q(), Vector1.Q(), Vector2.Q());
      bif(Dst.Q(), Vector1.Q(), VTMP1.Q());
    } else {
      // Dst is not either source, need a move.
      fcmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      mov(Dst.Q(), Vector1.Q());
      bif(Dst.Q(), Vector2.Q(), VTMP1.Q());
    }
  }
}

// Floating-point maximum of Vector1 and Vector2, built from a compare and
// a select instead of FMAX (except when AFP is available).
DEF_OP(VFMax) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  // NOTE: See VFMin implementation for reasons why we
  //       don't just use FMAX/FMIN for these implementations.
if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B;
    const auto ComparePred = ARMEmitter::PReg::p0;

    // ComparePred is set for every element where Vector2 > Vector1; those
    // elements are then taken from Vector2 by the predicated moves below.
    fcmgt(SubRegSize, ComparePred, Mask.Zeroing(), Vector2.Z(), Vector1.Z());

    if (Dst == Vector1) {
      // Trivial case where Vector1 is also the destination.
      // We don't need to move any data around in this case (aside from the merge).
      mov(SubRegSize, Dst.Z(), ComparePred.Merging(), Vector2.Z());
    } else {
      mov(VTMP1.Z(), Vector1.Z());
      mov(SubRegSize, VTMP1.Z(), ComparePred.Merging(), Vector2.Z());
      mov(Dst.Z(), VTMP1.Z());
    }
  } else {
    LOGMAN_THROW_A_FMT(!IsScalar, "should use VFMaxScalarInsert instead");

    if (HostSupportsAFP) {
      // AFP.AH lets fmax behave like x86 max
      fmax(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
      return;
    }

    // VTMP1 receives an all-ones/all-zeros mask per element from FCMGT;
    // BIT then inserts the other source wherever that mask is set.
    if (Dst == Vector1) {
      // Destination is already Vector1, need to insert Vector2 on true.
      fcmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      bit(Dst.Q(), Vector2.Q(), VTMP1.Q());
    } else if (Dst == Vector2) {
      // Destination is already Vector2, Invert arguments and insert Vector1 on true.
      fcmgt(SubRegSize, VTMP1.Q(), Vector1.Q(), Vector2.Q());
      bit(Dst.Q(), Vector1.Q(), VTMP1.Q());
    } else {
      // Dst is not either source, need a move.
      fcmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      mov(Dst.Q(), Vector1.Q());
      bit(Dst.Q(), Vector2.Q(), VTMP1.Q());
    }
  }
}

// Floating-point reciprocal of Vector. Uses the FRECPE estimate for 32-bit
// elements when the host has RPRES, otherwise divides 1.0 by the source.
DEF_OP(VFRecp) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = Op->Header.ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();

    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
      // RPRES gives enough precision for this.
frecpe(SubRegSize.Vector, Dst.Z(), Vector.Z());
      return;
    }

    // Compute 1.0 / Vector in a temporary: the SVE FDIV form is destructive
    // and Dst may alias Vector.
    fmov(SubRegSize.Vector, VTMP1.Z(), 1.0);
    fdiv(SubRegSize.Vector, VTMP1.Z(), Pred, VTMP1.Z(), Vector.Z());
    mov(Dst.Z(), VTMP1.Z());
  } else {
    if (IsScalar) {
      if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
        // RPRES gives enough precision for this.
        frecpe(SubRegSize.Scalar, Dst.S(), Vector.S());
        return;
      }

      // Materialise 1.0 in VTMP1, then divide on the matching scalar view.
      fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
      switch (ElementSize) {
      case IR::OpSize::i16Bit: {
        fdiv(Dst.H(), VTMP1.H(), Vector.H());
        break;
      }
      case IR::OpSize::i32Bit: {
        fdiv(Dst.S(), VTMP1.S(), Vector.S());
        break;
      }
      case IR::OpSize::i64Bit: {
        fdiv(Dst.D(), VTMP1.D(), Vector.D());
        break;
      }
      default: {
        LOGMAN_MSG_A_FMT("Unexpected ElementSize for {}", __func__);
        FEX_UNREACHABLE;
      }
      }
    } else {
      if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
        // RPRES gives enough precision for this.
        if (OpSize == IR::OpSize::i64Bit) {
          frecpe(SubRegSize.Vector, Dst.D(), Vector.D());
        } else {
          frecpe(SubRegSize.Vector, Dst.Q(), Vector.Q());
        }
        return;
      }

      fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
      fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
    }
  }
}

// Higher-precision reciprocal for 32-bit elements in a 32-bit (scalar) or
// 64-bit (two element) operation. With RPRES the FRECPE estimate is refined
// by one FRECPS step; without it a real division is emitted.
DEF_OP(VFRecpPrecision) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  // NOTE(review): the message below has no "{}" placeholder, so the
  // __func__ argument is never printed - confirm whether that is intended.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = OpSize == ElementSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (IsScalar) {
    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
      // Not enough precision so we need to improve it with frecps
      frecpe(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      frecps(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }

    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0f);
    // Element size is known to be 32bits
fdiv(Dst.S(), VTMP1.S(), Vector.S());
  } else {
    // Vector operation - Opsize 64bits, elementsize 32bits
    if (HostSupportsRPRES) {
      // Estimate, one refinement step, then multiply the two together.
      frecpe(SubRegSize.Vector, VTMP1.D(), Vector.D());
      frecps(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), Vector.D());
      fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D());
      return;
    }

    // No RPRES, so normal division
    // NOTE(review): this divides across the full Q registers although the
    // operation is 64 bits wide - confirm the upper half of Dst is don't-care.
    fmov(SubRegSize.Vector, VTMP1.Q(), 1.0f);
    fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), Vector.Q());
  }
}

// Floating-point reciprocal square root of Vector. Uses the FRSQRTE estimate
// for 32-bit elements when the host has RPRES, otherwise computes
// 1.0 / sqrt(Vector) explicitly.
DEF_OP(VFRSqrt) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();
    if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
      // RPRES gives enough precision for this.
      frsqrte(SubRegSize.Vector, Dst.Z(), Vector.Z());
      return;
    }

    // The square root is taken into VTMP1 before Dst is overwritten with
    // 1.0, so this sequence is safe even when Dst aliases Vector.
    fsqrt(SubRegSize.Vector, VTMP1.Z(), Pred, Vector.Z());
    fmov(SubRegSize.Vector, Dst.Z(), 1.0);
    fdiv(SubRegSize.Vector, Dst.Z(), Pred, Dst.Z(), VTMP1.Z());
  } else {
    if (IsScalar) {
      if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
        // RPRES gives enough precision for this.
frsqrte(SubRegSize.Scalar, Dst.S(), Vector.S());
        return;
      }

      // Materialise 1.0 in VTMP1, take the square root into VTMP2 and
      // divide, all on the scalar view matching the element size.
      // Any other element size emits only the fmov.
      fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
      switch (ElementSize) {
      case IR::OpSize::i16Bit: {
        fsqrt(VTMP2.H(), Vector.H());
        fdiv(Dst.H(), VTMP1.H(), VTMP2.H());
        break;
      }
      case IR::OpSize::i32Bit: {
        fsqrt(VTMP2.S(), Vector.S());
        fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
        break;
      }
      case IR::OpSize::i64Bit: {
        fsqrt(VTMP2.D(), Vector.D());
        fdiv(Dst.D(), VTMP1.D(), VTMP2.D());
        break;
      }
      default: break;
      }
    } else {
      if (ElementSize == IR::OpSize::i32Bit && HostSupportsRPRES) {
        // RPRES gives enough precision for this.
        if (OpSize == IR::OpSize::i64Bit) {
          frsqrte(SubRegSize.Vector, Dst.D(), Vector.D());
        } else {
          frsqrte(SubRegSize.Vector, Dst.Q(), Vector.Q());
        }
        return;
      }
      fmov(SubRegSize.Vector, VTMP1.Q(), 1.0);
      fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q());
      fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q());
    }
  }
}

// Higher-precision reciprocal square root for 32-bit elements in a 32-bit
// (scalar) or 64-bit (two element) operation. With RPRES the FRSQRTE
// estimate is refined by one FRSQRTS step; without it sqrt + divide is used.
DEF_OP(VFRSqrtPrecision) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  // NOTE(review): the message below has no "{}" placeholder, so the
  // __func__ argument is never printed - confirm whether that is intended.
  LOGMAN_THROW_A_FMT((OpSize == IR::OpSize::i64Bit || OpSize == IR::OpSize::i32Bit) && ElementSize == IR::OpSize::i32Bit,
                     "Unexpected sizes for operation.", __func__);

  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (IsScalar) {
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Scalar, VTMP1.S(), Vector.S());
      // Improve initial estimate which is not good enough.
      fmul(SubRegSize.Scalar, VTMP2.S(), VTMP1.S(), VTMP1.S());
      frsqrts(SubRegSize.Scalar, VTMP2.S(), VTMP2.S(), Vector.S());
      fmul(SubRegSize.Scalar, Dst.S(), VTMP1.S(), VTMP2.S());
      return;
    }
    fmov(SubRegSize.Scalar, VTMP1.Q(), 1.0);
    // element size is known to be 32bits
    fsqrt(VTMP2.S(), Vector.S());
    fdiv(Dst.S(), VTMP1.S(), VTMP2.S());
  } else {
    if (HostSupportsRPRES) {
      frsqrte(SubRegSize.Vector, VTMP1.D(), Vector.D());
      // Improve initial estimate which is not good enough.
fmul(SubRegSize.Vector, VTMP2.D(), VTMP1.D(), VTMP1.D()); frsqrts(SubRegSize.Vector, VTMP2.D(), VTMP2.D(), Vector.D()); fmul(SubRegSize.Vector, Dst.D(), VTMP1.D(), VTMP2.D()); return; } fmov(SubRegSize.Vector, VTMP1.Q(), 1.0); fsqrt(SubRegSize.Vector, VTMP2.Q(), Vector.Q()); fdiv(SubRegSize.Vector, Dst.Q(), VTMP1.Q(), VTMP2.Q()); } } DEF_OP(VNot) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { not_(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), PRED_TMP_32B.Merging(), Vector.Z()); } else { mvn(ARMEmitter::SubRegSize::i8Bit, Dst.Q(), Vector.Q()); } } DEF_OP(VUMin) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Pred = PRED_TMP_32B.Merging(); // In any case where the destination aliases one of the source vectors // then we can just perform the UMIN in place. if (Dst == Vector1) { umin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z()); } else if (Dst == Vector2) { umin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z()); } else { // SVE UMIN is a destructive operation, but we know nothing is // aliasing the destination by this point, so we can move into // the destination without needing a temporary. 
movprfx(Dst.Z(), Vector1.Z());
      umin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    }
  } else {
    switch (ElementSize) {
    case IR::OpSize::i8Bit:
    case IR::OpSize::i16Bit:
    case IR::OpSize::i32Bit: {
      umin(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
      break;
    }
    case IR::OpSize::i64Bit: {
      // 64-bit elements are handled with a compare + select instead of UMIN:
      //   VTMP1 = (Vector2 > Vector1) per element (unsigned)
      //   VTMP2 = Vector1, then BIF inserts Vector2 wherever VTMP1 is zero,
      // i.e. Vector1 is kept where it is the smaller value.
      // The result is built in VTMP2 so Dst may alias either source.
      cmhi(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      mov(VTMP2.Q(), Vector1.Q());
      bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q());
      mov(Dst.Q(), VTMP2.Q());
      break;
    }
    default: break;
    }
  }
}

// Signed per-element minimum of Vector1 and Vector2.
DEF_OP(VSMin) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize16(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();

    // In any case where the destination aliases one of the source vectors
    // then we can just perform the SMIN in place.
    if (Dst == Vector1) {
      smin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    } else if (Dst == Vector2) {
      smin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z());
    } else {
      // SVE SMIN is a destructive operation, but we know nothing is
      // aliasing the destination by this point, so we can move into
      // the destination without needing a temporary.
movprfx(Dst.Z(), Vector1.Z());
      smin(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    }
  } else {
    switch (ElementSize) {
    case IR::OpSize::i8Bit:
    case IR::OpSize::i16Bit:
    case IR::OpSize::i32Bit: {
      smin(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
      break;
    }
    case IR::OpSize::i64Bit: {
      // 64-bit elements are handled with a compare + select instead of SMIN:
      //   VTMP1 = (Vector1 > Vector2) per element (signed)
      //   VTMP2 = Vector1, then BIT inserts Vector2 wherever VTMP1 is set,
      // i.e. Vector1 is replaced exactly where it is the larger value.
      // (BIF with this mask would keep the larger value and yield a maximum.)
      // The result is built in VTMP2 so Dst may alias either source.
      cmgt(SubRegSize, VTMP1.Q(), Vector1.Q(), Vector2.Q());
      mov(VTMP2.Q(), Vector1.Q());
      bit(VTMP2.Q(), Vector2.Q(), VTMP1.Q());
      mov(Dst.Q(), VTMP2.Q());
      break;
    }
    default: break;
    }
  }
}

// Unsigned per-element maximum of Vector1 and Vector2.
DEF_OP(VUMax) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize16(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();

    // In any case where the destination aliases one of the source vectors
    // then we can just perform the UMAX in place.
    if (Dst == Vector1) {
      umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    } else if (Dst == Vector2) {
      umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z());
    } else {
      // SVE UMAX is a destructive operation, but we know nothing is
      // aliasing the destination by this point, so we can move into
      // the destination without needing a temporary.
movprfx(Dst.Z(), Vector1.Z());
      umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    }
  } else {
    switch (ElementSize) {
    case IR::OpSize::i8Bit:
    case IR::OpSize::i16Bit:
    case IR::OpSize::i32Bit: {
      umax(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
      break;
    }
    case IR::OpSize::i64Bit: {
      // 64-bit elements are handled with a compare + select instead of UMAX:
      //   VTMP1 = (Vector2 > Vector1) per element (unsigned)
      //   VTMP2 = Vector1, then BIT inserts Vector2 wherever VTMP1 is set,
      // i.e. Vector1 is replaced exactly where Vector2 is the larger value.
      // (BIF with this mask would keep the smaller value and yield a minimum.)
      // The result is built in VTMP2 so Dst may alias either source.
      cmhi(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q());
      mov(VTMP2.Q(), Vector1.Q());
      bit(VTMP2.Q(), Vector2.Q(), VTMP1.Q());
      mov(Dst.Q(), VTMP2.Q());
      break;
    }
    default: break;
    }
  }
}

// Signed per-element maximum of Vector1 and Vector2.
DEF_OP(VSMax) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize16(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Pred = PRED_TMP_32B.Merging();

    // In any case where the destination aliases one of the source vectors
    // then we can just perform the SMAX in place.
    if (Dst == Vector1) {
      smax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
    } else if (Dst == Vector2) {
      smax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z());
    } else {
      // SVE SMAX is a destructive operation, but we know nothing is
      // aliasing the destination by this point, so we can move into
      // the destination without needing a temporary.
movprfx(Dst.Z(), Vector1.Z()); smax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z()); } } else { switch (ElementSize) { case IR::OpSize::i8Bit: case IR::OpSize::i16Bit: case IR::OpSize::i32Bit: { smax(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q()); break; } case IR::OpSize::i64Bit: { cmgt(SubRegSize, VTMP1.Q(), Vector2.Q(), Vector1.Q()); mov(VTMP2.Q(), Vector1.Q()); bif(VTMP2.Q(), Vector2.Q(), VTMP1.Q()); mov(Dst.Q(), VTMP2.Q()); break; } default: break; } } } DEF_OP(VBSL) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto VectorFalse = GetVReg(Op->VectorFalse); const auto VectorTrue = GetVReg(Op->VectorTrue); const auto VectorMask = GetVReg(Op->VectorMask); if (HostSupportsSVE256 && Is256Bit) { // NOTE: Slight parameter difference from ASIMD // ASIMD -> BSL Mask, True, False // SVE -> BSL True, True, False, Mask // ASIMD -> BIT True, False, Mask // ASIMD -> BIF False, True, Mask if (Dst == VectorTrue) { // Trivial case where we can perform the operation in place. bsl(Dst.Z(), Dst.Z(), VectorFalse.Z(), VectorMask.Z()); } else { movprfx(VTMP1.Z(), VectorTrue.Z()); bsl(VTMP1.Z(), VTMP1.Z(), VectorFalse.Z(), VectorMask.Z()); mov(Dst.Z(), VTMP1.Z()); } } else if (!HostSupportsSVE256 && HostSupportsSVE128 && Is128Bit && Dst != VectorFalse && Dst != VectorTrue && Dst != VectorMask) { // Needs to move but SVE movprfx+bsl is slightly more efficient than ASIMD mov+bsl on CPUs that support // movprfx fusion and NOT zero-cycle vector register moves. movprfx(Dst.Z(), VectorTrue.Z()); bsl(Dst.Z(), Dst.Z(), VectorFalse.Z(), VectorMask.Z()); } else { if (VectorMask == Dst) { // Can use BSL without any moves. 
if (OpSize == IR::OpSize::i64Bit) {
        bsl(Dst.D(), VectorTrue.D(), VectorFalse.D());
      } else {
        bsl(Dst.Q(), VectorTrue.Q(), VectorFalse.Q());
      }
    } else if (VectorTrue == Dst) {
      // Can use BIF without any moves.
      // Dst already holds the "true" value; insert "false" where the mask is clear.
      if (OpSize == IR::OpSize::i64Bit) {
        bif(Dst.D(), VectorFalse.D(), VectorMask.D());
      } else {
        bif(Dst.Q(), VectorFalse.Q(), VectorMask.Q());
      }
    } else if (VectorFalse == Dst) {
      // Can use BIT without any moves.
      // Dst already holds the "false" value; insert "true" where the mask is set.
      if (OpSize == IR::OpSize::i64Bit) {
        bit(Dst.D(), VectorTrue.D(), VectorMask.D());
      } else {
        bit(Dst.Q(), VectorTrue.Q(), VectorMask.Q());
      }
    } else {
      // Needs moves.
      // Copy the mask into Dst first, then select through it with BSL.
      if (OpSize == IR::OpSize::i64Bit) {
        mov(Dst.D(), VectorMask.D());
        bsl(Dst.D(), VectorTrue.D(), VectorFalse.D());
      } else {
        mov(Dst.Q(), VectorMask.Q());
        bsl(Dst.Q(), VectorTrue.Q(), VectorFalse.Q());
      }
    }
  }
}

// Integer per-element equality compare of Vector1 and Vector2; matching
// elements become all ones in Dst.
DEF_OP(VCMPEQ) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // FIXME: We should rework this op to avoid the NZCV spill/fill dance.
    mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

    const auto Mask = PRED_TMP_32B.Zeroing();
    const auto ComparePred = ARMEmitter::PReg::p0;

    // General idea is to compare for equality, not the equal vals
    // from one of the registers, then or both together to make the
    // relevant equal entries all 1s.
cmpeq(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z());
    // VTMP1 = ~Vector1 for the matching elements only.
    not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z());
    // Dst = Vector1 for matching elements, zero elsewhere; then
    // Vector1 | ~Vector1 turns the matching elements into all ones.
    movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z());
    orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z());

    // Restore NZCV
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    if (IsScalar) {
      cmeq(SubRegSize.Scalar, Dst, Vector1, Vector2);
    } else {
      cmeq(SubRegSize.Vector, Dst.Q(), Vector1.Q(), Vector2.Q());
    }
  }
}

// Integer per-element compare of Vector against zero; elements equal to
// zero become all ones in Dst.
DEF_OP(VCMPEQZ) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Zeroing();
    const auto ComparePred = ARMEmitter::PReg::p0;

    // FIXME: We should rework this op to avoid the NZCV spill/fill dance.
    mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

    // Ensure no junk is in the temp (important for ensuring
    // non-equal entries remain as zero).
    mov_imm(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), 0);

    // Unlike with VCMPEQ, we can skip needing to bitwise OR the
    // final results, since if our elements are equal to zero,
    // we just need to bitwise NOT them and they're already set
    // to all 1s.
cmpeq(SubRegSize.Vector, ComparePred, Mask, Vector.Z(), 0);
    not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector.Z());
    // The result is assembled in VTMP1 and only then copied to Dst.
    mov(Dst.Z(), VTMP1.Z());

    // Restore NZCV
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    if (IsScalar) {
      cmeq(SubRegSize.Scalar, Dst, Vector);
    } else {
      cmeq(SubRegSize.Vector, Dst.Q(), Vector.Q());
    }
  }
}

// Signed integer per-element compare; elements where Vector1 > Vector2
// become all ones in Dst.
DEF_OP(VCMPGT) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Zeroing();
    const auto ComparePred = ARMEmitter::PReg::p0;

    // FIXME: We should rework this op to avoid the NZCV spill/fill dance.
    mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

    // General idea is to compare for greater-than, bitwise NOT
    // the valid values, then ORR the NOTed values with the original
    // values to form entries that are all 1s.
cmpgt(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z());
    // VTMP1 = ~Vector1 for the matching elements only.
    not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z());
    // Dst = Vector1 for matching elements, zero elsewhere; then
    // Vector1 | ~Vector1 turns the matching elements into all ones.
    movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z());
    orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z());

    // Restore NZCV
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    if (IsScalar) {
      cmgt(SubRegSize.Scalar, Dst, Vector1, Vector2);
    } else {
      cmgt(SubRegSize.Vector, Dst.Q(), Vector1.Q(), Vector2.Q());
    }
  }
}

// Signed integer per-element compare of Vector against zero; elements
// greater than zero become all ones in Dst.
DEF_OP(VCMPGTZ) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Zeroing();
    const auto ComparePred = ARMEmitter::PReg::p0;

    // FIXME: We should rework this op to avoid the NZCV spill/fill dance.
    mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

    // Ensure no junk is in the temp (important for ensuring
    // non greater-than values remain as zero).
mov_imm(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), 0);
    cmpgt(SubRegSize.Vector, ComparePred, Mask, Vector.Z(), 0);
    // For matching elements: VTMP1 = ~Vector, then VTMP1 |= Vector, which
    // leaves them as all ones. Non-matching elements keep the zero from above.
    not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector.Z());
    orr(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), VTMP1.Z(), Vector.Z());
    mov(Dst.Z(), VTMP1.Z());

    // Restore NZCV
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    if (IsScalar) {
      cmgt(SubRegSize.Scalar, Dst, Vector);
    } else {
      cmgt(SubRegSize.Vector, Dst.Q(), Vector.Q());
    }
  }
}

// Signed integer per-element compare of Vector against zero; elements
// less than zero become all ones in Dst.
DEF_OP(VCMPLTZ) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSizePair16(IROp);
  const auto IsScalar = ElementSize == OpSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Zeroing();
    const auto ComparePred = ARMEmitter::PReg::p0;

    // FIXME: We should rework this op to avoid the NZCV spill/fill dance.
    mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

    // Ensure no junk is in the temp (important for ensuring
    // non less-than values remain as zero).
mov_imm(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), 0); cmplt(SubRegSize.Vector, ComparePred, Mask, Vector.Z(), 0); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector.Z()); orr(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), VTMP1.Z(), Vector.Z()); mov(Dst.Z(), VTMP1.Z()); // Restore NZCV msr(ARMEmitter::SystemRegister::NZCV, TMP1); } else { if (IsScalar) { cmlt(SubRegSize.Scalar, Dst, Vector); } else { cmlt(SubRegSize.Vector, Dst.Q(), Vector.Q()); } } } DEF_OP(VFCMPEQ) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmeq(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmeq(Dst.H(), Vector1.H(), Vector2.H()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } } else { fcmeq(SubRegSize.Vector, Dst.Q(), Vector1.Q(), Vector2.Q()); } } } DEF_OP(VFCMPNEQ) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = 
OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmne(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmeq(Dst.H(), Vector1.H(), Vector2.H()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmeq(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } mvn(ARMEmitter::SubRegSize::i8Bit, Dst.D(), Dst.D()); } else { fcmeq(SubRegSize.Vector, Dst.Q(), Vector1.Q(), Vector2.Q()); mvn(ARMEmitter::SubRegSize::i8Bit, Dst.Q(), Dst.Q()); } } } DEF_OP(VFCMPLT) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmgt(SubRegSize.Vector, ComparePred, Mask, Vector2.Z(), Vector1.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector2.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector2.Z()); 
orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmgt(Dst.H(), Vector2.H(), Vector1.H()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmgt(SubRegSize.Scalar, Dst, Vector2, Vector1); break; default: break; } } else { fcmgt(SubRegSize.Vector, Dst.Q(), Vector2.Q(), Vector1.Q()); } } } DEF_OP(VFCMPGT) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmgt(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmgt(Dst.H(), Vector1.H(), Vector2.H()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmgt(SubRegSize.Scalar, Dst, Vector1, Vector2); break; default: break; } } else { fcmgt(SubRegSize.Vector, Dst.Q(), Vector1.Q(), Vector2.Q()); } } } DEF_OP(VFCMPLE) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || 
HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmge(SubRegSize.Vector, ComparePred, Mask, Vector2.Z(), Vector1.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector2.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector2.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmge(Dst.H(), Vector2.H(), Vector1.H()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, Dst, Vector2, Vector1); break; default: break; } } else { fcmge(SubRegSize.Vector, Dst.Q(), Vector2.Q(), Vector1.Q()); } } } DEF_OP(VFCMPORD) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; // The idea is like comparing for unordered, but we just // invert the predicate from the comparison to instead // select all ordered elements in the vector. 
fcmuo(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z()); not_(ComparePred, Mask, ComparePred); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmge(VTMP1.H(), Vector1.H(), Vector2.H()); fcmgt(VTMP2.H(), Vector2.H(), Vector1.H()); orr(Dst.D(), VTMP1.D(), VTMP2.D()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Vector1, Vector2); fcmgt(SubRegSize.Scalar, VTMP2, Vector2, Vector1); orr(Dst.D(), VTMP1.D(), VTMP2.D()); break; default: break; } } else { fcmge(SubRegSize.Vector, VTMP1.Q(), Vector1.Q(), Vector2.Q()); fcmgt(SubRegSize.Vector, VTMP2.Q(), Vector2.Q(), Vector1.Q()); orr(Dst.Q(), VTMP1.Q(), VTMP2.Q()); } } } DEF_OP(VFCMPUNO) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSizePair248(IROp); const auto IsScalar = ElementSize == OpSize; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Zeroing(); const auto ComparePred = ARMEmitter::PReg::p0; fcmuo(SubRegSize.Vector, ComparePred, Mask, Vector1.Z(), Vector2.Z()); not_(SubRegSize.Vector, VTMP1.Z(), ComparePred.Merging(), Vector1.Z()); movprfx(SubRegSize.Vector, Dst.Z(), ComparePred.Zeroing(), Vector1.Z()); orr(SubRegSize.Vector, Dst.Z(), ComparePred.Merging(), Dst.Z(), VTMP1.Z()); } else { if (IsScalar) { switch (ElementSize) { case IR::OpSize::i16Bit: { fcmge(VTMP1.H(), Vector1.H(), Vector2.H()); 
fcmgt(VTMP2.H(), Vector2.H(), Vector1.H()); orr(Dst.D(), VTMP1.D(), VTMP2.D()); mvn(ARMEmitter::SubRegSize::i8Bit, Dst.D(), Dst.D()); break; } case IR::OpSize::i32Bit: case IR::OpSize::i64Bit: fcmge(SubRegSize.Scalar, VTMP1, Vector1, Vector2); fcmgt(SubRegSize.Scalar, VTMP2, Vector2, Vector1); orr(Dst.D(), VTMP1.D(), VTMP2.D()); mvn(ARMEmitter::SubRegSize::i8Bit, Dst.D(), Dst.D()); break; default: break; } } else { fcmge(SubRegSize.Vector, VTMP1.Q(), Vector1.Q(), Vector2.Q()); fcmgt(SubRegSize.Vector, VTMP2.Q(), Vector2.Q(), Vector1.Q()); orr(Dst.Q(), VTMP1.Q(), VTMP2.Q()); mvn(ARMEmitter::SubRegSize::i8Bit, Dst.Q(), Dst.Q()); } } } DEF_OP(VUShl) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MaxShift = IR::OpSizeAsBits(ElementSize); const auto Dst = GetVReg(Node); auto ShiftVector = GetVReg(Op->ShiftVector); const auto Vector = GetVReg(Op->Vector); const auto RangeCheck = Op->RangeCheck; if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (RangeCheck) { dup_imm(SubRegSize, VTMP2.Z(), MaxShift); umin(SubRegSize, VTMP2.Z(), Mask, VTMP2.Z(), ShiftVector.Z()); ShiftVector = VTMP2; } if (Dst == ShiftVector) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP2.Z(), ShiftVector.Z()); ShiftVector = VTMP2; } // If Dst aliases Vector, then we can skip the move. 
if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { if (ElementSize < IR::OpSize::i64Bit) { movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, MaxShift); dup(SubRegSize, VTMP1.Q(), TMP1.R()); // UMIN is silly on Adv.SIMD and doesn't have a variant that handles 64-bit elements cmhi(SubRegSize, VTMP2.Q(), ShiftVector.Q(), VTMP1.Q()); bif(VTMP1.Q(), ShiftVector.Q(), VTMP2.Q()); } ShiftVector = VTMP1; } ushl(SubRegSize, Dst.Q(), Vector.Q(), ShiftVector.Q()); } } DEF_OP(VUShr) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MaxShift = IR::OpSizeAsBits(ElementSize); const auto Dst = GetVReg(Node); auto ShiftVector = GetVReg(Op->ShiftVector); const auto Vector = GetVReg(Op->Vector); const auto RangeCheck = Op->RangeCheck; if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (RangeCheck) { dup_imm(SubRegSize, VTMP2.Z(), MaxShift); umin(SubRegSize, VTMP2.Z(), Mask, VTMP2.Z(), ShiftVector.Z()); ShiftVector = VTMP2; } if (Dst == ShiftVector) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP2.Z(), ShiftVector.Z()); ShiftVector = VTMP2; } // If Dst aliases Vector, then we can skip the move. 
if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { if (ElementSize < IR::OpSize::i64Bit) { movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, MaxShift); dup(SubRegSize, VTMP1.Q(), TMP1.R()); // UMIN is silly on Adv.SIMD and doesn't have a variant that handles 64-bit elements cmhi(SubRegSize, VTMP2.Q(), ShiftVector.Q(), VTMP1.Q()); bif(VTMP1.Q(), ShiftVector.Q(), VTMP2.Q()); } ShiftVector = VTMP1; } // Need to invert shift values to perform a right shift with USHL // (USHR only has an immediate variant). neg(SubRegSize, VTMP1.Q(), ShiftVector.Q()); ushl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VSShr) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = IROp->ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto MaxShift = IR::OpSizeAsBits(ElementSize) - 1; const auto RangeCheck = Op->RangeCheck; const auto Dst = GetVReg(Node); auto ShiftVector = GetVReg(Op->ShiftVector); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (RangeCheck) { dup_imm(SubRegSize, VTMP1.Z(), MaxShift); umin(SubRegSize, VTMP1.Z(), Mask, VTMP1.Z(), ShiftVector.Z()); ShiftVector = VTMP1; } if (Dst == ShiftVector) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP1.Z(), ShiftVector.Z()); ShiftVector = VTMP1; } // If Dst aliases Vector, then we can skip the move. 
if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftVector.Z()); } else { if (RangeCheck) { if (ElementSize < IR::OpSize::i64Bit) { movi(SubRegSize, VTMP1.Q(), MaxShift); umin(SubRegSize, VTMP1.Q(), VTMP1.Q(), ShiftVector.Q()); } else { LoadConstant(ARMEmitter::Size::i64Bit, TMP1, MaxShift); dup(SubRegSize, VTMP1.Q(), TMP1.R()); // UMIN is silly on Adv.SIMD and doesn't have a variant that handles 64-bit elements cmhi(SubRegSize, VTMP2.Q(), ShiftVector.Q(), VTMP1.Q()); bif(VTMP1.Q(), ShiftVector.Q(), VTMP2.Q()); } ShiftVector = VTMP1; } // Need to invert shift values to perform a right shift with SSHL // (SSHR only has an immediate variant). neg(SubRegSize, VTMP1.Q(), ShiftVector.Q()); sshl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VUShlS) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); // NOTE: SVE LSL is a destructive operation, so we need to // move the vector into the destination if they don't // already alias. 
dup(SubRegSize, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { dup(SubRegSize, VTMP1.Q(), ShiftScalar.Q(), 0); ushl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VUShrS) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); // NOTE: SVE LSR is a destructive operation, so we need to // move the vector into the destination if they don't // already alias. dup(SubRegSize, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { dup(SubRegSize, VTMP1.Q(), ShiftScalar.Q(), 0); neg(SubRegSize, VTMP1.Q(), VTMP1.Q()); ushl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VUShrSWide) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. 
movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { lsr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } } else if (HostSupportsSVE128) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); ShiftRegister = VTMP1; } if (Dst == ShiftRegister) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP1.Z(), ShiftRegister.Z()); ShiftRegister = VTMP1; } if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { lsr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } } else { // uqshl + ushr of 57-bits leaves 7-bits remaining. // This saturates the 64-bit shift value from an arbitrary 64-bit length // variable to maximum of 0x7F. // This allows the shift to fit within the width of the signed 8-bits // that ASIMD's vector shift requires. 
uqshl(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, ShiftScalar, 57); ushr(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, VTMP1, 57); dup(SubRegSize, VTMP1.Q(), VTMP1.Q(), 0); neg(SubRegSize, VTMP1.Q(), VTMP1.Q()); ushl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VSShrSWide) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { asr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } } else if (HostSupportsSVE128) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); ShiftRegister = VTMP1; } if (Dst == ShiftRegister) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP1.Z(), ShiftRegister.Z()); ShiftRegister = VTMP1; } if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. 
movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { asr_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } } else { // uqshl + ushr of 57-bits leaves 7-bits remaining. // This saturates the 64-bit shift value from an arbitrary 64-bit length // variable to maximum of 0x7F. // This allows the shift to fit within the width of the signed 8-bits // that ASIMD's vector shift requires. uqshl(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, ShiftScalar, 57); ushr(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, VTMP1, 57); dup(SubRegSize, VTMP1.Q(), VTMP1.Q(), 0); neg(SubRegSize, VTMP1.Q(), VTMP1.Q()); sshl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VUShlSWide) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { lsl_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } } else if (HostSupportsSVE128) { const auto Mask = PRED_TMP_16B.Merging(); auto ShiftRegister = ShiftScalar; if (OpSize > IR::OpSize::i64Bit) { // SVE wide shifts don't need to duplicate the low bits unless the OpSize is 16-bytes // Slightly more optimal for 8-byte opsize. 
dup(ARMEmitter::SubRegSize::i64Bit, VTMP1.Z(), ShiftScalar.Z(), 0); ShiftRegister = VTMP1; } if (Dst == ShiftRegister) { // If destination aliases the shift vector then we need to move it temporarily. mov(VTMP1.Z(), ShiftRegister.Z()); ShiftRegister = VTMP1; } if (Dst != Vector) { // NOTE: SVE LSR is a destructive operation. movprfx(Dst.Z(), Vector.Z()); } if (ElementSize == IR::OpSize::i64Bit) { lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } else { lsl_wide(SubRegSize, Dst.Z(), Mask, Dst.Z(), ShiftRegister.Z()); } } else { // uqshl + ushr of 57-bits leaves 7-bits remaining. // This saturates the 64-bit shift value from an arbitrary 64-bit length // variable to maximum of 0x7F. // This allows the shift to fit within the width of the signed 8-bits // that ASIMD's vector shift requires. uqshl(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, ShiftScalar, 57); ushr(ARMEmitter::ScalarRegSize::i64Bit, VTMP1, VTMP1, 57); dup(SubRegSize, VTMP1.Q(), VTMP1.Q(), 0); ushl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VSShrS) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize16(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto ShiftScalar = GetVReg(Op->ShiftScalar); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); // NOTE: SVE ASR is a destructive operation, so we need to // move the vector into the destination if they don't // already alias. 
dup(SubRegSize, VTMP1.Z(), ShiftScalar.Z(), 0); if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP1.Z()); } else { dup(SubRegSize, VTMP1.Q(), ShiftScalar.Q(), 0); neg(SubRegSize, VTMP1.Q(), VTMP1.Q()); sshl(SubRegSize, Dst.Q(), Vector.Q(), VTMP1.Q()); } } DEF_OP(VInsElement) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize16(IROp); const uint32_t DestIdx = Op->DestIdx; const uint32_t SrcIdx = Op->SrcIdx; const auto Dst = GetVReg(Node); const auto SrcVector = GetVReg(Op->SrcVector); auto Reg = GetVReg(Op->DestVector); if (HostSupportsSVE256 && Is256Bit) { // Broadcast our source value across a temporary, // then combine with the destination. dup(SubRegSize, VTMP2.Z(), SrcVector.Z(), SrcIdx); // We don't need to move the data unnecessarily if // DestVector just so happens to also be the IR op // destination. if (Dst != Reg) { mov(Dst.Z(), Reg.Z()); } constexpr auto Predicate = ARMEmitter::PReg::p0; if (ElementSize == IR::OpSize::i128Bit) { if (DestIdx == 0) { mov(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), PRED_TMP_16B.Merging(), VTMP2.Z()); } else { not_(Predicate, PRED_TMP_32B.Zeroing(), PRED_TMP_16B); mov(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), Predicate.Merging(), VTMP2.Z()); } } else { const auto UpperBound = 16 >> FEXCore::ilog2(IR::OpSizeToSize(ElementSize)); const auto TargetElement = static_cast(DestIdx) - UpperBound; // FIXME: We should rework this op to avoid the NZCV spill/fill dance. 
mrs(TMP1, ARMEmitter::SystemRegister::NZCV); index(SubRegSize, VTMP1.Z(), -UpperBound, 1); cmpeq(SubRegSize, Predicate, PRED_TMP_32B.Zeroing(), VTMP1.Z(), TargetElement); mov(SubRegSize, Dst.Z(), Predicate.Merging(), VTMP2.Z()); // Restore NZCV msr(ARMEmitter::SystemRegister::NZCV, TMP1); } } else { // If nothing aliases the destination, then we can just // move the DestVector over and directly insert. if (Dst != Reg && Dst != SrcVector) { mov(Dst.Q(), Reg.Q()); ins(SubRegSize, Dst.Q(), DestIdx, SrcVector.Q(), SrcIdx); return; } // If our vector data to insert into is within a register // that aliases the destination, then we can avoid using a // temporary and just perform the insert. // // Otherwise, if the source vector to select from aliases // the destination, then we hit the worst case where we // need to use a temporary to avoid clobbering data. if (Dst != Reg) { mov(VTMP1.Q(), Reg.Q()); Reg = VTMP1; } ins(SubRegSize, Reg.Q(), DestIdx, SrcVector.Q(), SrcIdx); if (Dst != Reg) { mov(Dst.Q(), Reg.Q()); } } } DEF_OP(VDupElement) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Index = Op->Index; const auto SubRegSize = ConvertSubRegSize16(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { dup(SubRegSize, Dst.Z(), Vector.Z(), Index); } else { if (Is128Bit) { dup(SubRegSize, Dst.Q(), Vector.Q(), Index); } else { dup(SubRegSize, Dst.D(), Vector.D(), Index); } } } DEF_OP(VExtr) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); // AArch64 ext op has bit 
arrangement as [Vm:Vn] so arguments need to be swapped const auto Dst = GetVReg(Node); auto UpperBits = GetVReg(Op->VectorLower); auto LowerBits = GetVReg(Op->VectorUpper); const auto ElementSize = Op->Header.ElementSize; auto Index = Op->Index; if (Index >= IR::OpSizeToSize(OpSize)) { // Upper bits have moved in to the lower bits LowerBits = UpperBits; // Upper bits are all now zero UpperBits = VTMP1; movi(ARMEmitter::SubRegSize::i64Bit, VTMP1.Q(), 0); Index -= IR::OpSizeToSize(OpSize); } const auto CopyFromByte = Index * IR::OpSizeToSize(ElementSize); if (HostSupportsSVE256 && Is256Bit) { if (Dst == LowerBits) { // Trivial case where we don't need to do any moves ext(Dst.Z(), Dst.Z(), UpperBits.Z(), CopyFromByte); } else if (Dst == UpperBits) { movprfx(VTMP2.Z(), LowerBits.Z()); ext(VTMP2.Z(), VTMP2.Z(), UpperBits.Z(), CopyFromByte); mov(Dst.Z(), VTMP2.Z()); } else { // No registers alias the destination, so we can safely move into it. movprfx(Dst.Z(), LowerBits.Z()); ext(Dst.Z(), Dst.Z(), UpperBits.Z(), CopyFromByte); } } else { if (OpSize == IR::OpSize::i64Bit) { ext(Dst.D(), LowerBits.D(), UpperBits.D(), CopyFromByte); } else { ext(Dst.Q(), LowerBits.Q(), UpperBits.Q(), CopyFromByte); } } } DEF_OP(VUShrI) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto BitShift = Op->BitShift; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (BitShift >= IR::OpSizeAsBits(ElementSize)) { movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0); } else { if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (BitShift == 0) { if (Dst != Vector) { mov(Dst.Z(), Vector.Z()); } } else { // SVE LSR is destructive, so lets set up the 
destination if // Vector doesn't already alias it. if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsr(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift); } } else { if (BitShift == 0) { if (Dst != Vector) { mov(Dst.Q(), Vector.Q()); } } else { ushr(SubRegSize, Dst.Q(), Vector.Q(), BitShift); } } } } DEF_OP(VUShraI) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto DestVector = GetVReg(Op->DestVector); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { if (Dst == DestVector) { usra(SubRegSize, Dst.Z(), Vector.Z(), BitShift); } else { if (Dst != Vector) { mov(Dst.Z(), DestVector.Z()); usra(SubRegSize, Dst.Z(), Vector.Z(), BitShift); } else { mov(VTMP1.Z(), DestVector.Z()); usra(SubRegSize, Dst.Z(), Vector.Z(), BitShift); mov(Dst.Z(), VTMP1.Z()); } } } else { if (Dst == DestVector) { usra(SubRegSize, Dst.Q(), Vector.Q(), BitShift); } else { if (Dst != Vector) { mov(Dst.Q(), DestVector.Q()); usra(SubRegSize, Dst.Q(), Vector.Q(), BitShift); } else { mov(VTMP1.Q(), DestVector.Q()); usra(SubRegSize, VTMP1.Q(), Vector.Q(), BitShift); mov(Dst.Q(), VTMP1.Q()); } } } } DEF_OP(VSShrI) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto ElementSize = Op->Header.ElementSize; LOGMAN_THROW_A_FMT(ElementSize >= IR::OpSize::i8Bit && ElementSize <= IR::OpSize::i64Bit, "Invalid element size"); const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Shift = std::min(IR::OpSizeAsBits(ElementSize) - 1, Op->BitShift); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = 
GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (Shift == 0) { if (Dst != Vector) { mov(Dst.Z(), Vector.Z()); } } else { // SVE ASR is destructive, so lets set up the destination if // Vector doesn't already alias it. if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } asr(SubRegSize, Dst.Z(), Mask, Dst.Z(), Shift); } } else { if (Shift == 0) { if (Dst != Vector) { mov(Dst.Q(), Vector.Q()); } } else { sshr(SubRegSize, Dst.Q(), Vector.Q(), Shift); } } } DEF_OP(VShlI) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto BitShift = Op->BitShift; const auto ElementSize = Op->Header.ElementSize; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (BitShift >= IR::OpSizeAsBits(ElementSize)) { movi(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), 0); } else { if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (BitShift == 0) { if (Dst != Vector) { mov(Dst.Z(), Vector.Z()); } } else { // SVE LSL is destructive, so lets set up the destination if // Vector doesn't already alias it. 
if (Dst != Vector) { movprfx(Dst.Z(), Vector.Z()); } lsl(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift); } } else { if (BitShift == 0) { if (Dst != Vector) { mov(Dst.Q(), Vector.Q()); } } else { shl(SubRegSize, Dst.Q(), Vector.Q(), BitShift); } } } } DEF_OP(VUShrNI) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize4(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if (HostSupportsSVE256 && Is256Bit) { shrnb(SubRegSize, Dst.Z(), Vector.Z(), BitShift); uzp1(SubRegSize, Dst.Z(), Dst.Z(), Dst.Z()); } else { if (BitShift == 0) { xtn(SubRegSize, Dst.D(), Vector.D()); } else { shrn(SubRegSize, Dst.D(), Vector.D(), BitShift); } } } DEF_OP(VUShrNI2) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto BitShift = Op->BitShift; const auto SubRegSize = ConvertSubRegSize8(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto VectorLower = GetVReg(Op->VectorLower); const auto VectorUpper = GetVReg(Op->VectorUpper); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_16B; shrnb(SubRegSize, VTMP2.Z(), VectorUpper.Z(), BitShift); uzp1(SubRegSize, VTMP2.Z(), VTMP2.Z(), VTMP2.Z()); if (Dst != VectorLower) { movprfx(Dst.Z(), VectorLower.Z()); } splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z()); } else { auto Lower = VectorLower; if (Dst != VectorLower) { mov(VTMP1.Q(), VectorLower.Q()); Lower = VTMP1; } shrn2(SubRegSize, Lower.Q(), VectorUpper.Q(), BitShift); if (Dst != VectorLower) { mov(Dst.Q(), Lower.Q()); } } } DEF_OP(VSXTL) { const auto Op = IROp->C(); const auto OpSize = 
IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if ((HostSupportsSVE128 && !Is256Bit && !HostSupportsSVE256) || (HostSupportsSVE256 && Is256Bit)) { sunpklo(SubRegSize, Dst.Z(), Vector.Z()); } else { sxtl(SubRegSize, Dst.D(), Vector.D()); } } DEF_OP(VSXTL2) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if ((HostSupportsSVE128 && !Is256Bit && !HostSupportsSVE256) || (HostSupportsSVE256 && Is256Bit)) { sunpkhi(SubRegSize, Dst.Z(), Vector.Z()); } else { sxtl2(SubRegSize, Dst.Q(), Vector.Q()); } } DEF_OP(VSSHLL) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); const auto BitShift = Op->BitShift; LOGMAN_THROW_A_FMT(BitShift < IR::OpSizeAsBits(IROp->ElementSize / 2), "Bitshift size too large for source element size: {} < {}", BitShift, IR::OpSizeAsBits(IROp->ElementSize / 2)); if (Is256Bit) { sunpklo(SubRegSize, Dst.Z(), Vector.Z()); lsl(SubRegSize, Dst.Z(), Dst.Z(), BitShift); } else { sshll(SubRegSize, Dst.D(), Vector.D(), BitShift); } } DEF_OP(VSSHLL2) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto 
Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); const auto BitShift = Op->BitShift; LOGMAN_THROW_A_FMT(BitShift < IR::OpSizeAsBits(IROp->ElementSize / 2), "Bitshift size too large for source element size: {} < {}", BitShift, IR::OpSizeAsBits(IROp->ElementSize / 2)); if (Is256Bit) { sunpkhi(SubRegSize, Dst.Z(), Vector.Z()); lsl(SubRegSize, Dst.Z(), Dst.Z(), BitShift); } else { sshll2(SubRegSize, Dst.Q(), Vector.Q(), BitShift); } } DEF_OP(VUXTL) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if ((HostSupportsSVE128 && !Is256Bit && !HostSupportsSVE256) || (HostSupportsSVE256 && Is256Bit)) { uunpklo(SubRegSize, Dst.Z(), Vector.Z()); } else { uxtl(SubRegSize, Dst.D(), Vector.D()); } } DEF_OP(VUXTL2) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector = GetVReg(Op->Vector); if ((HostSupportsSVE128 && !Is256Bit && !HostSupportsSVE256) || (HostSupportsSVE256 && Is256Bit)) { uunpkhi(SubRegSize, Dst.Z(), Vector.Z()); } else { uxtl2(SubRegSize, Dst.Q(), Vector.Q()); } } DEF_OP(VSQXTN) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize4(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || 
HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    // Note that SVE SQXTNB and SQXTNT are a tad different
    // in behavior compared to most other [name]B and [name]T
    // instructions.
    //
    // Most other bottom and top instructions operate
    // on even (bottom) or odd (top) elements and store each
    // result into the next subsequent element in the destination
    // vector.
    //
    // SQXTNB and SQXTNT will operate on the same elements regardless
    // of which one is chosen, but will instead place results from
    // the operation into either each subsequent even (bottom) element
    // or odd (top) element. However the bottom instruction will zero the
    // odd elements out in the destination vector, while the top instruction
    // will leave the even elements alone (in a behavior similar to Adv.SIMD's
    // SQXTN/SQXTN2 instructions).
    //
    // e.g. consider this 64-bit (for brevity) vector with four 16-bit elements:
    //
    // ╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗
    // ║  Value 3  ║║  Value 2  ║║  Value 1  ║║  Value 0  ║
    // ╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝
    //
    // SQXTNB Dst.VnB, Src.VnH will result in:
    //
    // ╔═════╗╔═════╗╔═════╗╔═════╗╔═════╗╔═════╗╔═════╗╔═════╗
    // ║  0  ║║ V3  ║║  0  ║║ V2  ║║  0  ║║ V1  ║║  0  ║║ V0  ║
    // ╚═════╝╚═════╝╚═════╝╚═════╝╚═════╝╚═════╝╚═════╝╚═════╝
    //
    // This is kind of convenient, considering we only need
    // to use the bottom variant and then concatenate all the
    // even elements with SVE UZP1.
    sqxtnb(SubRegSize, Dst.Z(), Vector.Z());
    uzp1(SubRegSize, Dst.Z(), Dst.Z(), Dst.Z());
  } else {
    sqxtn(SubRegSize, Dst, Vector);
  }
}

// Signed saturating narrow of VectorUpper, combining the narrowed result
// with VectorLower in the destination.
DEF_OP(VSQXTN2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize4(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  const auto VectorUpper = GetVReg(Op->VectorUpper);

  if (HostSupportsSVE256 && Is256Bit) {
    // We use the 16 byte mask due to how SPLICE works. We only
    // want to get at the first 16 bytes in the lower vector, so
    // that SPLICE will then begin copying the first 16 bytes
    // from the upper vector and begin placing them after the
    // previously copied lower 16 bytes.
    const auto Mask = PRED_TMP_16B;

    sqxtnb(SubRegSize, VTMP2.Z(), VectorUpper.Z());
    uzp1(SubRegSize, VTMP2.Z(), VTMP2.Z(), VTMP2.Z());

    // Need to use the destructive variant of SPLICE, since
    // the constructive variant requires a register list, and
    // we can't guarantee VectorLower and VectorUpper will always
    // have consecutive indexes with one another.
if (Dst != VectorLower) {
      movprfx(Dst.Z(), VectorLower.Z());
    }
    splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z());
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      // 64-bit operation: narrow the upper input into a temporary and insert
      // its low 32 bits as 32-bit lane 1 of the copied lower vector.
      sqxtn(SubRegSize, VTMP2, VectorUpper);
      mov(Dst.Q(), VectorLower.Q());
      ins(ARMEmitter::SubRegSize::i32Bit, Dst, 1, VTMP2, 0);
    } else {
      // SQXTN2 writes into its destination in place, so operate on a copy
      // of VectorLower and move the result into Dst afterwards.
      mov(VTMP1.Q(), VectorLower.Q());
      sqxtn2(SubRegSize, VTMP1, VectorUpper);
      mov(Dst.Q(), VTMP1.Q());
    }
  }
}

// Signed saturating narrow of both VectorLower and VectorUpper into a
// single destination (lower results first, upper results after them).
DEF_OP(VSQXTNPair) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize4(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  auto VectorUpper = GetVReg(Op->VectorUpper);

  if (HostSupportsSVE256 && Is256Bit) {
    // This combines the SVE versions of VSQXTN/VSQXTN2.
    // Upper VSQXTN2 handling.
    // Doing upper first to ensure it doesn't get overwritten by lower calculation.
    const auto Mask = PRED_TMP_16B;

    sqxtnb(SubRegSize, VTMP2.Z(), VectorUpper.Z());
    uzp1(SubRegSize, VTMP2.Z(), VTMP2.Z(), VTMP2.Z());

    // Look at those implementations for details about this.
    // Lower VSQXTN handling.
    sqxtnb(SubRegSize, Dst.Z(), VectorLower.Z());
    uzp1(SubRegSize, Dst.Z(), Dst.Z(), Dst.Z());

    // Merge.
    splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z());
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      // Pack the low 64 bits of both inputs into one 128-bit register and
      // narrow that in a single instruction.
      zip1(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), VectorLower.Q(), VectorUpper.Q());
      sqxtn(SubRegSize, Dst, Dst);
    } else {
      if (Dst == VectorUpper) {
        // If the destination overlaps the upper then we need to move it temporarily.
        mov(VTMP1.Q(), VectorUpper.Q());
        VectorUpper = VTMP1;
      }
      sqxtn(SubRegSize, Dst, VectorLower);
      sqxtn2(SubRegSize, Dst, VectorUpper);
    }
  }
}

// Signed-to-unsigned saturating narrow of each element of Vector.
DEF_OP(VSQXTUN) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize8(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    // Bottom-variant narrow, then UZP1 to compact the even elements.
    sqxtunb(SubRegSize, Dst.Z(), Vector.Z());
    uzp1(SubRegSize, Dst.Z(), Dst.Z(), Dst.Z());
  } else {
    sqxtun(SubRegSize, Dst, Vector);
  }
}

// Signed-to-unsigned saturating narrow of VectorUpper, combining the
// narrowed result with VectorLower in the destination.
DEF_OP(VSQXTUN2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize8(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  const auto VectorUpper = GetVReg(Op->VectorUpper);

  if (HostSupportsSVE256 && Is256Bit) {
    // NOTE: See VSQXTN2 implementation for an in-depth explanation
    //       of everything going on here.
const auto Mask = PRED_TMP_16B;

    sqxtunb(SubRegSize, VTMP2.Z(), VectorUpper.Z());
    uzp1(SubRegSize, VTMP2.Z(), VTMP2.Z(), VTMP2.Z());

    // SPLICE is used in its destructive form, so Dst must hold VectorLower.
    if (Dst != VectorLower) {
      movprfx(Dst.Z(), VectorLower.Z());
    }
    splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z());
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      // 64-bit operation: narrow the upper input into a temporary and insert
      // its low 32 bits as 32-bit lane 1 of the copied lower vector.
      sqxtun(SubRegSize, VTMP2, VectorUpper);
      mov(Dst.Q(), VectorLower.Q());
      ins(ARMEmitter::SubRegSize::i32Bit, Dst, 1, VTMP2, 0);
    } else {
      // SQXTUN2 writes into its destination in place, so work on a copy of
      // VectorLower when Dst doesn't already alias it.
      auto Lower = VectorLower;
      if (Dst != VectorLower) {
        mov(VTMP1.Q(), VectorLower.Q());
        Lower = VTMP1;
      }

      sqxtun2(SubRegSize, Lower, VectorUpper);

      if (Dst != VectorLower) {
        mov(Dst.Q(), Lower.Q());
      }
    }
  }
}

// Signed-to-unsigned saturating narrow of both VectorLower and VectorUpper
// into a single destination (lower results first, upper results after them).
DEF_OP(VSQXTUNPair) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize4(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto VectorLower = GetVReg(Op->VectorLower);
  auto VectorUpper = GetVReg(Op->VectorUpper);

  if (HostSupportsSVE256 && Is256Bit) {
    // This combines the SVE versions of VSQXTUN/VSQXTUN2.
    // Upper VSQXTUN2 handling.
    // Doing upper first to ensure it doesn't get overwritten by lower calculation.
    const auto Mask = PRED_TMP_16B;

    sqxtunb(SubRegSize, VTMP2.Z(), VectorUpper.Z());
    uzp1(SubRegSize, VTMP2.Z(), VTMP2.Z(), VTMP2.Z());

    // Look at those implementations for details about this.
    // Lower VSQXTUN handling.
    sqxtunb(SubRegSize, Dst.Z(), VectorLower.Z());
    uzp1(SubRegSize, Dst.Z(), Dst.Z(), Dst.Z());

    // Merge.
    splice(SubRegSize, Dst.Z(), Mask, Dst.Z(), VTMP2.Z());
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      // Pack the low 64 bits of both inputs into one 128-bit register and
      // narrow that in a single instruction.
      zip1(ARMEmitter::SubRegSize::i64Bit, Dst.Q(), VectorLower.Q(), VectorUpper.Q());
      sqxtun(SubRegSize, Dst, Dst);
    } else {
      if (Dst == VectorUpper) {
        // If the destination overlaps the upper then we need to move it temporarily.
        mov(VTMP1.Q(), VectorUpper.Q());
        VectorUpper = VTMP1;
      }
      sqxtun(SubRegSize, Dst, VectorLower);
      sqxtun2(SubRegSize, Dst, VectorUpper);
    }
  }
}

// Signed rounding shift right of each element by the immediate BitShift.
DEF_OP(VSRSHR) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize16(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);
  const auto BitShift = Op->BitShift;

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // SVE SRSHR is destructive, so let's set up the destination
    // in the event Dst and Vector don't alias.
    if (Dst != Vector) {
      movprfx(Dst.Z(), Vector.Z());
    }
    srshr(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      srshr(SubRegSize, Dst.D(), Vector.D(), BitShift);
    } else {
      srshr(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
    }
  }
}

// Signed saturating shift left of each element by the immediate BitShift.
DEF_OP(VSQSHL) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize8(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);
  const auto BitShift = Op->BitShift;

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // SVE SQSHL is destructive, so let's set up the destination
    // in the event Dst and Vector don't alias.
    if (Dst != Vector) {
      movprfx(Dst.Z(), Vector.Z());
    }
    sqshl(SubRegSize, Dst.Z(), Mask, Dst.Z(), BitShift);
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      sqshl(SubRegSize, Dst.D(), Vector.D(), BitShift);
    } else {
      sqshl(SubRegSize, Dst.Q(), Vector.Q(), BitShift);
    }
  }
}

// Element-wise multiply of Vector1 and Vector2.
DEF_OP(VMul) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto Is256Bit = OpSize ==
IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto SubRegSize = ConvertSubRegSize16(IROp);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    mul(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());
  } else {
    mul(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
  }
}

// Unsigned widening multiply of the low-half elements of Vector1 and Vector2.
DEF_OP(VUMull) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // SVE multiplies the even (bottom) and odd (top) elements separately;
    // ZIP1 interleaves the low halves of both results back into element order.
    umullb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    umullt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip1(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    umull(SubRegSize, Dst.D(), Vector1.D(), Vector2.D());
  }
}

// Signed widening multiply of the low-half elements of Vector1 and Vector2.
DEF_OP(VSMull) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // Bottom/top widening multiplies, then ZIP1 to interleave the low halves.
    smullb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    smullt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip1(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    smull(SubRegSize, Dst.D(), Vector1.D(), Vector2.D());
  }
}

// Unsigned widening multiply of the high-half elements of Vector1 and Vector2.
DEF_OP(VUMull2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // Bottom/top widening multiplies, then ZIP2 to interleave the high halves.
    umullb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    umullt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip2(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    umull2(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
  }
}

// Signed widening multiply of the high-half elements of Vector1 and Vector2.
DEF_OP(VSMull2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // Bottom/top widening multiplies, then ZIP2 to interleave the high halves.
    smullb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    smullt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip2(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    smull2(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
  }
}

// Unsigned multiply returning the high half of each element-wise product.
DEF_OP(VUMulH) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize8(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto Is128Bit = OpSize == IR::OpSize::i128Bit;

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  // Element size of the widened product used by the ASIMD emulation below.
  // NOTE(review): 64-bit elements fall through to i8Bit, which looks like a
  // placeholder - the ASIMD paths presumably never see 64-bit elements. Confirm.
  const auto SubRegSizeLarger = ElementSize == IR::OpSize::i8Bit  ? ARMEmitter::SubRegSize::i16Bit :
                                ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i32Bit :
                                ElementSize == IR::OpSize::i32Bit ?
ARMEmitter::SubRegSize::i64Bit :
                                                                    ARMEmitter::SubRegSize::i8Bit;

  if (HostSupportsSVE256 && Is256Bit) {
    umulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());
  } else if (HostSupportsSVE128 && Is128Bit) {
    if (HostSupportsSVE256) {
      // Do predicated to ensure upper-bits get zero as expected
      const auto Mask = PRED_TMP_16B.Merging();

      if (Dst == Vector1) {
        umulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
      } else if (Dst == Vector2) {
        // Operands can be swapped freely when Dst aliases Vector2.
        umulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector1.Z());
      } else {
        // Destination register doesn't overlap either source.
        // NOTE: SVE umulh (predicated) is a destructive operation.
        movprfx(Dst.Z(), Vector1.Z());
        umulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
      }
    } else {
      umulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());
    }
  } else if (OpSize == IR::OpSize::i64Bit) {
    // Widening multiply, then shift-right-narrow by the element width to
    // keep only the high half of each product.
    umull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D());
    shrn(SubRegSize, Dst.D(), Dst.D(), IR::OpSizeAsBits(ElementSize));
  } else {
    // ASIMD doesn't have a umulh. Need to emulate.
    // Widen-multiply both halves, then UZP2 picks the odd (high-half)
    // narrow elements out of the two product vectors.
    umull2(SubRegSizeLarger, VTMP1.Q(), Vector1.Q(), Vector2.Q());
    umull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D());
    uzp2(SubRegSize, Dst.Q(), Dst.Q(), VTMP1.Q());
  }
}

// Signed multiply returning the high half of each element-wise product.
DEF_OP(VSMulH) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize8(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto Is128Bit = OpSize == IR::OpSize::i128Bit;

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  // Element size of the widened product used by the ASIMD emulation below.
  // NOTE(review): 64-bit elements fall through to i8Bit, which looks like a
  // placeholder - the ASIMD paths presumably never see 64-bit elements. Confirm.
  const auto SubRegSizeLarger = ElementSize == IR::OpSize::i8Bit  ? ARMEmitter::SubRegSize::i16Bit :
                                ElementSize == IR::OpSize::i16Bit ? ARMEmitter::SubRegSize::i32Bit :
                                ElementSize == IR::OpSize::i32Bit ? ARMEmitter::SubRegSize::i64Bit :
                                                                    ARMEmitter::SubRegSize::i8Bit;

  if (HostSupportsSVE256 && Is256Bit) {
    smulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());
  } else if (HostSupportsSVE128 && Is128Bit) {
    if (HostSupportsSVE256) {
      // Do predicated to ensure upper-bits get zero as expected
      const auto Mask = PRED_TMP_16B.Merging();

      if (Dst == Vector1) {
        smulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
      } else if (Dst == Vector2) {
        // Operands can be swapped freely when Dst aliases Vector2.
        smulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector1.Z());
      } else {
        // Destination register doesn't overlap either source.
        // NOTE: SVE smulh (predicated) is a destructive operation.
        movprfx(Dst.Z(), Vector1.Z());
        smulh(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z());
      }
    } else {
      smulh(SubRegSize, Dst.Z(), Vector1.Z(), Vector2.Z());
    }
  } else if (OpSize == IR::OpSize::i64Bit) {
    // Widening multiply, then shift-right-narrow by the element width to
    // keep only the high half of each product.
    smull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D());
    shrn(SubRegSize, Dst.D(), Dst.D(), IR::OpSizeAsBits(ElementSize));
  } else {
    // ASIMD doesn't have a smulh. Need to emulate.
    // Widen-multiply both halves, then UZP2 picks the odd (high-half)
    // narrow elements out of the two product vectors.
    smull2(SubRegSizeLarger, VTMP1.Q(), Vector1.Q(), Vector2.Q());
    smull(SubRegSizeLarger, Dst.D(), Vector1.D(), Vector2.D());
    uzp2(SubRegSize, Dst.Q(), Dst.Q(), VTMP1.Q());
  }
}

// Unsigned widening absolute difference of the low-half elements of
// Vector1 and Vector2.
DEF_OP(VUABDL) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // To mimic the behavior of AdvSIMD UABDL, we need to get the
    // absolute difference of the even elements (UABDLB), get the
    // absolute difference of the odd elements (UABDLT), then
    // interleave the results in both vectors together.
uabdlb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    uabdlt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip1(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    uabdl(SubRegSize, Dst.D(), Vector1.D(), Vector2.D());
  }
}

// Unsigned widening absolute difference of the high-half elements of
// Vector1 and Vector2.
DEF_OP(VUABDL2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);

  if (HostSupportsSVE256 && Is256Bit) {
    // To mimic the behavior of AdvSIMD UABDL2, we need to get the
    // absolute difference of the even elements (UABDLB), get the
    // absolute difference of the odd elements (UABDLT), then
    // interleave the upper halves of both result vectors together (ZIP2).
    uabdlb(SubRegSize, VTMP1.Z(), Vector1.Z(), Vector2.Z());
    uabdlt(SubRegSize, VTMP2.Z(), Vector1.Z(), Vector2.Z());
    zip2(SubRegSize, Dst.Z(), VTMP1.Z(), VTMP2.Z());
  } else {
    uabdl2(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q());
  }
}

// Byte table lookup: selects bytes from a single table register
// (VectorTable) using the byte indices in VectorIndices.
DEF_OP(VTBL1) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto VectorIndices = GetVReg(Op->VectorIndices);
  const auto VectorTable = GetVReg(Op->VectorTable);

  switch (OpSize) {
  case IR::OpSize::i64Bit: {
    // The table is always a full 128-bit register; only the index/result
    // width follows the operation size.
    tbl(Dst.D(), VectorTable.Q(), VectorIndices.D());
    break;
  }
  case IR::OpSize::i128Bit: {
    tbl(Dst.Q(), VectorTable.Q(), VectorIndices.Q());
    break;
  }
  case IR::OpSize::i256Bit: {
    LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup");

    tbl(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), VectorTable.Z(), VectorIndices.Z());
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unknown OpSize: {}", OpSize); break;
  }
}

// Byte table lookup across two table registers (VectorTable1, VectorTable2)
// using the byte indices in VectorIndices.
DEF_OP(VTBL2) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto VectorIndices = GetVReg(Op->VectorIndices);
  auto VectorTable1 = GetVReg(Op->VectorTable1);
  auto VectorTable2 = GetVReg(Op->VectorTable2);

  if (!ARMEmitter::AreVectorsSequential(VectorTable1, VectorTable2)) {
    // Vector registers aren't sequential, need to move to temporaries.
    if (OpSize == IR::OpSize::i256Bit) {
      mov(VTMP1.Z(), VectorTable1.Z());
      mov(VTMP2.Z(), VectorTable2.Z());
    } else {
      mov(VTMP1.Q(), VectorTable1.Q());
      mov(VTMP2.Q(), VectorTable2.Q());
    }

    static_assert(ARMEmitter::AreVectorsSequential(VTMP1, VTMP2), "VTMP1 and VTMP2 must be sequential in order to use double-table "
                                                                  "TBL");

    VectorTable1 = VTMP1;
    VectorTable2 = VTMP2;
  }

  switch (OpSize) {
  case IR::OpSize::i64Bit: {
    tbl(Dst.D(), VectorTable1.Q(), VectorTable2.Q(), VectorIndices.D());
    break;
  }
  case IR::OpSize::i128Bit: {
    tbl(Dst.Q(), VectorTable1.Q(), VectorTable2.Q(), VectorIndices.Q());
    break;
  }
  case IR::OpSize::i256Bit: {
    LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup");

    tbl(ARMEmitter::SubRegSize::i8Bit, Dst.Z(), VectorTable1.Z(), VectorTable2.Z(), VectorIndices.Z());
    break;
  }
  default: LOGMAN_MSG_A_FMT("Unknown OpSize: {}", OpSize); break;
  }
}

// Byte table lookup extension (TBX) on a single table register:
// VectorSrcDst is both an input and the in-place destination of the lookup.
DEF_OP(VTBX1) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto Dst = GetVReg(Node);
  const auto VectorSrcDst = GetVReg(Op->VectorSrcDst);
  const auto VectorIndices = GetVReg(Op->VectorIndices);
  const auto VectorTable = GetVReg(Op->VectorTable);

  if (Dst != VectorSrcDst) {
    // TBX operates in place, so perform it on a temporary copy of
    // VectorSrcDst and move the result into Dst afterwards.
    switch (OpSize) {
    case IR::OpSize::i64Bit: {
      mov(VTMP1.D(), VectorSrcDst.D());
      tbx(VTMP1.D(), VectorTable.Q(), VectorIndices.D());
      mov(Dst.D(), VTMP1.D());
      break;
    }
    case IR::OpSize::i128Bit: {
      mov(VTMP1.Q(), VectorSrcDst.Q());
      tbx(VTMP1.Q(), VectorTable.Q(), VectorIndices.Q());
      mov(Dst.Q(), VTMP1.Q());
      break;
    }
    case IR::OpSize::i256Bit: {
      LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup");
      mov(VTMP1.Z(), VectorSrcDst.Z());
      tbx(ARMEmitter::SubRegSize::i8Bit, VTMP1.Z(), VectorTable.Z(), VectorIndices.Z());
      mov(Dst.Z(), VTMP1.Z());
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unknown OpSize: {}", OpSize); break;
    }
  } else {
    // Dst already aliases VectorSrcDst, so the lookup can run in place.
    switch (OpSize) {
    case IR::OpSize::i64Bit: {
      tbx(VectorSrcDst.D(), VectorTable.Q(), VectorIndices.D());
      break;
    }
    case IR::OpSize::i128Bit: {
      tbx(VectorSrcDst.Q(), VectorTable.Q(), VectorIndices.Q());
      break;
    }
    case IR::OpSize::i256Bit: {
      LOGMAN_THROW_A_FMT(HostSupportsSVE256, "Host does not support SVE. Cannot perform 256-bit table lookup");

      tbx(ARMEmitter::SubRegSize::i8Bit, VectorSrcDst.Z(), VectorTable.Z(), VectorIndices.Z());
      break;
    }
    default: LOGMAN_MSG_A_FMT("Unknown OpSize: {}", OpSize); break;
    }
  }
}

// Reverse the order of the 8-bit or 16-bit elements inside every 32-bit
// container of Vector.
DEF_OP(VRev32) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  LOGMAN_THROW_A_FMT(ElementSize == IR::OpSize::i8Bit || ElementSize == IR::OpSize::i16Bit, "Invalid size");
  const auto SubRegSize = ElementSize == IR::OpSize::i8Bit ? ARMEmitter::SubRegSize::i8Bit : ARMEmitter::SubRegSize::i16Bit;

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // SVE encodes the reversed element size in the mnemonic (REVB/REVH)
    // and the container size as the instruction's element size.
    switch (ElementSize) {
    case IR::OpSize::i8Bit: {
      revb(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Vector.Z());
      break;
    }
    case IR::OpSize::i16Bit: {
      revh(ARMEmitter::SubRegSize::i32Bit, Dst.Z(), Mask, Vector.Z());
      break;
    }
    default: LOGMAN_MSG_A_FMT("Invalid Element Size: {}", ElementSize); break;
    }
  } else {
    if (OpSize == IR::OpSize::i64Bit) {
      rev32(SubRegSize, Dst.D(), Vector.D());
    } else {
      rev32(SubRegSize, Dst.Q(), Vector.Q());
    }
  }
}

// Reverse the order of the 8-, 16- or 32-bit elements inside every 64-bit
// container of Vector.
DEF_OP(VRev64) {
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;

  const auto ElementSize = Op->Header.ElementSize;
  const auto SubRegSize = ConvertSubRegSize4(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector = GetVReg(Op->Vector);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    switch (ElementSize) {
    case IR::OpSize::i8Bit: {
      revb(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z());
      break;
    }
    case IR::OpSize::i16Bit: {
revh(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break; } case IR::OpSize::i32Bit: { revw(ARMEmitter::SubRegSize::i64Bit, Dst.Z(), Mask, Vector.Z()); break; } default: LOGMAN_MSG_A_FMT("Invalid Element Size: {}", ElementSize); break; } } else { if (OpSize == IR::OpSize::i64Bit) { rev64(SubRegSize, Dst.D(), Vector.D()); } else { rev64(SubRegSize, Dst.Q(), Vector.Q()); } } } DEF_OP(VFCADD) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); LOGMAN_THROW_A_FMT(Op->Rotate == 90 || Op->Rotate == 270, "Invalidate Rotate"); const auto Rotate = Op->Rotate == 90 ? ARMEmitter::Rotation::ROTATE_90 : ARMEmitter::Rotation::ROTATE_270; if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); if (Dst == Vector1) { // Trivial case where we already have first vector in the destination // register. We can just do the operation in place. fcadd(SubRegSize, Dst.Z(), Mask, Vector1.Z(), Vector2.Z(), Rotate); } else if (Dst == Vector2) { // SVE FCADD is a destructive operation, so we need // a temporary for performing operations. movprfx(VTMP1.Z(), Vector1.Z()); fcadd(SubRegSize, VTMP1.Z(), Mask, VTMP1.Z(), Vector2.Z(), Rotate); mov(Dst.Z(), VTMP1.Z()); } else { // We have no source/dest aliasing, so we can move into the destination. 
movprfx(Dst.Z(), Vector1.Z()); fcadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z(), Rotate); } } else { if (OpSize == IR::OpSize::i64Bit) { fcadd(SubRegSize, Dst.D(), Vector1.D(), Vector2.D(), Rotate); } else { fcadd(SubRegSize, Dst.Q(), Vector1.Q(), Vector2.Q(), Rotate); } } } DEF_OP(VFMLA) { ///< Dest = (Vector1 * Vector2) + Addend // Matches: // - SVE - FMLA // - ASIMD - FMLA // - Scalar - FMADD const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); const auto VectorAddend = GetVReg(Op->Addend); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); ARMEmitter::VRegister DestTmp = Dst; if (Dst != VectorAddend) { if (Dst != Vector1 && Dst != Vector2) { DestTmp = Dst; } else { DestTmp = VTMP1; } mov(DestTmp.Z(), VectorAddend.Z()); } fmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z()); if (Dst != DestTmp) { mov(Dst.Z(), DestTmp.Z()); } } else { if (IROp->ElementSize == OpSize) { if (IROp->ElementSize == IR::OpSize::i16Bit) { fmadd(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fmadd(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fmadd(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; } ARMEmitter::VRegister DestTmp = Dst; if (Dst != VectorAddend) { if (Dst != Vector1 && Dst != Vector2) { DestTmp = Dst; } else { DestTmp = VTMP1; } mov(DestTmp.Q(), VectorAddend.Q()); } if (OpSize == IR::OpSize::i128Bit) { fmla(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q()); } else { fmla(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D()); } if (Dst != DestTmp) { 
mov(Dst.Q(), DestTmp.Q()); } } } DEF_OP(VFMLS) { ///< Dest = (Vector1 * Vector2) - Addend // Matches: // - SVE - FNMLS // - ASIMD - FMLA (With negated addend) // - Scalar - FNMSUB const auto Op = IROp->C(); const auto OpSize = IROp->Size; const auto SubRegSize = ConvertSubRegSize248(IROp); const auto Is128Bit = OpSize == IR::OpSize::i128Bit; const auto Is256Bit = OpSize == IR::OpSize::i256Bit; LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__); const auto Dst = GetVReg(Node); const auto Vector1 = GetVReg(Op->Vector1); const auto Vector2 = GetVReg(Op->Vector2); const auto VectorAddend = GetVReg(Op->Addend); if (HostSupportsSVE256 && Is256Bit) { const auto Mask = PRED_TMP_32B.Merging(); ARMEmitter::VRegister DestTmp = Dst; if (Dst != VectorAddend) { if (Dst != Vector1 && Dst != Vector2) { DestTmp = Dst; } else { DestTmp = VTMP1; } mov(DestTmp.Z(), VectorAddend.Z()); } fnmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z()); if (Dst != DestTmp) { mov(Dst.Z(), DestTmp.Z()); } } else if (HostSupportsSVE128 && Is128Bit) { const auto Mask = PRED_TMP_16B.Merging(); ARMEmitter::VRegister DestTmp = Dst; if (Dst != VectorAddend) { if (Dst != Vector1 && Dst != Vector2) { DestTmp = Dst; } else { DestTmp = VTMP1; } mov(DestTmp.Z(), VectorAddend.Z()); } fnmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z()); if (Dst != DestTmp) { mov(Dst.Z(), DestTmp.Z()); } } else { if (IROp->ElementSize == OpSize) { if (IROp->ElementSize == IR::OpSize::i16Bit) { fnmsub(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H()); } else if (IROp->ElementSize == IR::OpSize::i32Bit) { fnmsub(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S()); } else if (IROp->ElementSize == IR::OpSize::i64Bit) { fnmsub(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D()); } return; } // Addend needs to get negated to match correct behaviour here. 
ARMEmitter::VRegister DestTmp = Dst;
    // The negated addend is written by FNEG below, so Dst may alias the addend.
    // A temporary is only needed when Dst aliases a multiplicand.
    if (Dst == Vector1 || Dst == Vector2) {
      DestTmp = VTMP1;
    }

    // DestTmp = -Addend
    if (Is128Bit) {
      fneg(SubRegSize, DestTmp.Q(), VectorAddend.Q());
    } else {
      fneg(SubRegSize, DestTmp.D(), VectorAddend.D());
    }

    // DestTmp += Vector1 * Vector2
    if (Is128Bit) {
      fmla(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
    } else {
      fmla(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
    }

    if (DestTmp != Dst) {
      if (Is128Bit) {
        mov(Dst.Q(), DestTmp.Q());
      } else {
        mov(Dst.D(), DestTmp.D());
      }
    }
  }
}

DEF_OP(VFNMLA) {
  ///< Dest = (-Vector1 * Vector2) + Addend
  // Matches:
  // - SVE - FMLS
  // - ASIMD - FMLS
  // - Scalar - FMSUB
  //
  // Same structure as VFMLA: predicated SVE for 256-bit, scalar when ElementSize == OpSize, ASIMD otherwise.
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);
  const auto VectorAddend = GetVReg(Op->Addend);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // The addend has to sit in the register FMLS writes to.
    // VTMP1 is used when Dst aliases one of the multiplicands.
    ARMEmitter::VRegister DestTmp = Dst;
    if (Dst != VectorAddend) {
      if (Dst != Vector1 && Dst != Vector2) {
        DestTmp = Dst;
      } else {
        DestTmp = VTMP1;
      }
      mov(DestTmp.Z(), VectorAddend.Z());
    }

    fmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());

    if (Dst != DestTmp) {
      mov(Dst.Z(), DestTmp.Z());
    }
  } else {
    if (IROp->ElementSize == OpSize) {
      // Scalar: a single element covers the whole operation.
      if (IROp->ElementSize == IR::OpSize::i16Bit) {
        fmsub(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H());
      } else if (IROp->ElementSize == IR::OpSize::i32Bit) {
        fmsub(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S());
      } else if (IROp->ElementSize == IR::OpSize::i64Bit) {
        fmsub(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D());
      }
      return;
    }

    // Same destination selection as the SVE path; the copy uses the Q view regardless of OpSize.
    ARMEmitter::VRegister DestTmp = Dst;
    if (Dst != VectorAddend) {
      if (Dst != Vector1 && Dst != Vector2) {
        DestTmp = Dst;
      } else {
        DestTmp = VTMP1;
      }
      mov(DestTmp.Q(),
VectorAddend.Q());
    }

    if (OpSize == IR::OpSize::i128Bit) {
      fmls(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
    } else {
      fmls(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
    }

    // Only needed when the result was built in VTMP1.
    if (Dst != DestTmp) {
      mov(Dst.Q(), DestTmp.Q());
    }
  }
}

DEF_OP(VFNMLS) {
  ///< Dest = (-Vector1 * Vector2) - Addend
  // Matches:
  // - SVE - FNMLA
  // - ASIMD - FMLS (With Negated addend)
  // - Scalar - FNMADD
  //
  // Same structure as VFMLS: SVE for 256-bit and (with HostSupportsSVE128) 128-bit operations,
  // scalar when ElementSize == OpSize, ASIMD otherwise.
  const auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize248(IROp);
  const auto Is256Bit = OpSize == IR::OpSize::i256Bit;
  LOGMAN_THROW_A_FMT(!Is256Bit || HostSupportsSVE256, "Need SVE256 support in order to use {} with 256-bit operation", __func__);
  const auto Is128Bit = OpSize == IR::OpSize::i128Bit;

  const auto Dst = GetVReg(Node);
  const auto Vector1 = GetVReg(Op->Vector1);
  const auto Vector2 = GetVReg(Op->Vector2);
  const auto VectorAddend = GetVReg(Op->Addend);

  if (HostSupportsSVE256 && Is256Bit) {
    const auto Mask = PRED_TMP_32B.Merging();

    // The addend has to sit in the register FNMLA writes to.
    // VTMP1 is used when Dst aliases one of the multiplicands.
    ARMEmitter::VRegister DestTmp = Dst;
    if (Dst != VectorAddend) {
      if (Dst != Vector1 && Dst != Vector2) {
        DestTmp = Dst;
      } else {
        DestTmp = VTMP1;
      }
      mov(DestTmp.Z(), VectorAddend.Z());
    }

    fnmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());

    if (Dst != DestTmp) {
      mov(Dst.Z(), DestTmp.Z());
    }
  } else if (HostSupportsSVE128 && Is128Bit) {
    // Same sequence as the 256-bit path, but with the 16-byte predicate.
    const auto Mask = PRED_TMP_16B.Merging();

    ARMEmitter::VRegister DestTmp = Dst;
    if (Dst != VectorAddend) {
      if (Dst != Vector1 && Dst != Vector2) {
        DestTmp = Dst;
      } else {
        DestTmp = VTMP1;
      }
      mov(DestTmp.Z(), VectorAddend.Z());
    }

    fnmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());

    if (Dst != DestTmp) {
      mov(Dst.Z(), DestTmp.Z());
    }
  } else {
    if (IROp->ElementSize == OpSize) {
      // Scalar: a single element covers the whole operation.
      if (IROp->ElementSize == IR::OpSize::i16Bit) {
        fnmadd(Dst.H(), Vector1.H(), Vector2.H(), VectorAddend.H());
      } else if (IROp->ElementSize == IR::OpSize::i32Bit) {
        fnmadd(Dst.S(), Vector1.S(), Vector2.S(), VectorAddend.S());
      } else if (IROp->ElementSize == IR::OpSize::i64Bit) {
fnmadd(Dst.D(), Vector1.D(), Vector2.D(), VectorAddend.D());
      }
      return;
    }

    // Addend needs to get negated to match correct behaviour here.
    // A temporary is only needed when Dst aliases a multiplicand; FNEG writes the destination itself.
    ARMEmitter::VRegister DestTmp = Dst;
    if (Dst == Vector1 || Dst == Vector2) {
      DestTmp = VTMP1;
    }

    // DestTmp = -Addend
    if (Is128Bit) {
      fneg(SubRegSize, DestTmp.Q(), VectorAddend.Q());
    } else {
      fneg(SubRegSize, DestTmp.D(), VectorAddend.D());
    }

    // DestTmp -= Vector1 * Vector2
    if (Is128Bit) {
      fmls(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
    } else {
      fmls(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
    }

    if (DestTmp != Dst) {
      if (Is128Bit) {
        mov(Dst.Q(), DestTmp.Q());
      } else {
        mov(Dst.D(), DestTmp.D());
      }
    }
  }
}

DEF_OP(VFCopySign) {
  // Copies the bits selected by a mask in VTMP1 from Sign (Vector2) into Magnitude (Vector1), in place.
  auto Op = IROp->C();
  const auto OpSize = IROp->Size;
  const auto SubRegSize = ConvertSubRegSize248(IROp);

  ARMEmitter::VRegister Magnitude = GetVReg(Op->Vector1);
  ARMEmitter::VRegister Sign = GetVReg(Op->Vector2);

  // We don't assign explicitly to Dst but Dst and Magnitude are tied to the same register.
  // Similar in semantics to C's copysignf.
  //
  // NOTE(review): the mask immediate is 0x80 with a shift of 24, which looks like the sign bit of a 32-bit lane.
  // Confirm this op is only used with 32-bit elements, or that other sizes are handled by the caller.
  switch (OpSize) {
  case IR::OpSize::i64Bit:
    movi(SubRegSize, VTMP1.D(), 0x80, 24);
    bit(Magnitude.D(), Sign.D(), VTMP1.D());
    break;
  case IR::OpSize::i128Bit:
    movi(SubRegSize, VTMP1.Q(), 0x80, 24);
    bit(Magnitude.Q(), Sign.Q(), VTMP1.Q());
    break;
  // The switch is on the operation size, although the message below says "element size".
  default: LOGMAN_MSG_A_FMT("Unsupported element size for operation {}", __func__); FEX_UNREACHABLE;
  }
}

DEF_OP(F64SIN) {
  // Calls the handler stored at Pointers.F64SinHandler in the CPU state frame.
  // The argument is passed in VTMP1 and the result is read back from VTMP1.
  const auto Op = IROp->C();
  const auto Src = GetVReg(Op->Src);
  const auto Dst = GetVReg(Node);

  fmov(VTMP1.D(), Src.D());
  ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.F64SinHandler));

  // LR is saved to the stack around the call because blr overwrites it.
  // NOTE(review): the -16 / +16 offsets presumably belong to pre-/post-indexed forms — confirm against the emitter.
  str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);
  blr(TMP1);
  ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

  fmov(Dst.D(), VTMP1.D());
}

DEF_OP(F64COS) {
  // Same calling sequence as F64SIN, using Pointers.F64CosHandler.
  const auto Op = IROp->C();
  const auto Src = GetVReg(Op->Src);
  const auto Dst = GetVReg(Node);

  fmov(VTMP1.D(), Src.D());
  ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.F64CosHandler));

  str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);
  blr(TMP1);
  ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

  fmov(Dst.D(),
VTMP1.D());
}

DEF_OP(F64TAN) {
  // Calls the handler stored at Pointers.F64TanHandler in the CPU state frame.
  // The argument is passed in VTMP1 and the result is read back from VTMP1; LR is saved around the call.
  const auto Op = IROp->C();
  const auto Src = GetVReg(Op->Src);
  const auto Dst = GetVReg(Node);

  fmov(VTMP1.D(), Src.D());
  ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.F64TanHandler));

  str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16);
  blr(TMP1);
  ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16);

  fmov(Dst.D(), VTMP1.D());
}

} // namespace FEXCore::CPU

================================================
FILE: FEXCore/Source/Interface/Core/LookupCache.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: glue|block-database
desc: Stores information about blocks, and provides C++ implementations to lookup the blocks
$end_info$
*/

#include
#include
#include

#include "Interface/Context/Context.h"
#include "Interface/Core/LookupCache.h"

namespace FEXCore {
// Names the monotonic buffer resource, builds the polymorphic allocator on top of it,
// and allocates the initial BlockLinks map from that allocator.
GuestToHostMap::GuestToHostMap()
  : BlockLinks_mbr {"FEXMem_BlockLinks"} {
  BlockLinks_pma = fextl::make_unique>(&BlockLinks_mbr);

  // Setup our PMR map.
  BlockLinks = BlockLinks_pma->new_object();
}

// Reserves one contiguous virtual range for this cache, sized as the sum of:
// - 8 bytes per guest page of the configured virtual memory size (the page pointer table),
// - CODE_SIZE bytes (backing for the per-page tables),
// - MAX_L1_SIZE bytes (the L1 cache).
LookupCache::LookupCache(FEXCore::Context::ContextImpl* CTX)
  : ctx {CTX} {

  TotalCacheSize = ctx->Config.VirtualMemSize / FEXCore::Utils::FEX_PAGE_SIZE * 8 + CODE_SIZE + MAX_L1_SIZE;

  // Block cache ends up looking like this
  // PageMemoryMap[VirtualMemoryRegion >> 12]
  //       |
  //       v
  // PageMemory[Memory & (VIRTUAL_PAGE_SIZE - 1)]
  //       |
  //       v
  // Pointer to Code
  //
  // Allocate a region of memory that we can use to back our block pointers
  // We need one pointer per page of virtual memory
  // At 64GB of virtual memory this will allocate 128MB of virtual memory space
  PagePointer = reinterpret_cast(FEXCore::Allocator::VirtualAlloc(TotalCacheSize, false, false));
  // The allocation is treated as failed when it comes back as all-ones (-1).
  LOGMAN_THROW_A_FMT(PagePointer != -1ULL, "Failed to allocate PagePointer");

  // Disable THP on the Lookup cache.
FEXCore::Allocator::VirtualTHPControl(reinterpret_cast(PagePointer), TotalCacheSize, FEXCore::Allocator::THPControl::Disable);

  // Only the page pointer table and the page backing are named here; L1 gets its own name below.
  FEXCore::Allocator::VirtualName("FEXMem_Lookup", reinterpret_cast(PagePointer),
                                  ctx->Config.VirtualMemSize / FEXCore::Utils::FEX_PAGE_SIZE * 8 + CODE_SIZE);

  CTX->SyscallHandler->MarkOvercommitRange(PagePointer, TotalCacheSize);

  // Allocate our memory backing our pages
  // We need 32KB per guest page (One pointer per byte)
  // XXX: We can drop down to 16KB if we store 4byte offsets from the code base
  // We currently limit to 128MB of real memory for caching for the total cache size.
  // Can end up being inefficient if we compile a small number of blocks per page
  //
  // NOTE(review): SIZE_PER_PAGE in LookupCache.h is FEX_PAGE_SIZE * sizeof(LookupCacheEntry), and an entry holds
  // two pointer-sized fields, so the 32KB/16KB figures above look stale — confirm.
  PageMemory = PagePointer + ctx->Config.VirtualMemSize / FEXCore::Utils::FEX_PAGE_SIZE * 8;

  // L1 Cache
  // Placed directly after the CODE_SIZE bytes of page backing.
  L1Pointer = PageMemory + CODE_SIZE;
  FEXCore::Allocator::VirtualName("FEXMem_Lookup_L1", reinterpret_cast(L1Pointer), MAX_L1_SIZE);

  VirtualMemSize = ctx->Config.VirtualMemSize;

  if (DynamicL1Cache()) {
    // Start at minimum size when dynamic.
    L1PointerMask = MIN_L1_ENTRIES - 1;
  } else {
    // Start at maximum instead.
    L1PointerMask = MAX_L1_ENTRIES - 1;
  }
}

// Releases the whole range reserved by the constructor and removes its overcommit marking.
LookupCache::~LookupCache() {
  FEXCore::Allocator::VirtualFree(reinterpret_cast(PagePointer), TotalCacheSize);
  ctx->SyscallHandler->UnmarkOvercommitRange(PagePointer, TotalCacheSize);

  // No need to free BlockLinks map.
  // These will get freed when their memory allocators are deallocated.
}

// Drops the L2 contents (page pointer table + page backing) and restarts page-backing allocation from offset 0.
// The L1 region, which follows the page backing, is not part of the range passed to VirtualDontNeed.
void LookupCache::ClearL2Cache(const FEXCore::LookupCacheBaseLockToken& lk) {
  // Clear out the page memory
  // PagePointer and PageMemory are sequential with each other. Clear both at once.
  FEXCore::Allocator::VirtualDontNeed(reinterpret_cast(PagePointer), ctx->Config.VirtualMemSize / FEXCore::Utils::FEX_PAGE_SIZE * 8 + CODE_SIZE, false);
  AllocateOffset = 0;
}

void LookupCache::ClearThreadLocalCaches(const LookupCacheWriteLockToken&) {
  // Clear L1 and L2 by clearing the full cache.
FEXCore::Allocator::VirtualDontNeed(reinterpret_cast(PagePointer), TotalCacheSize, false);
  // NOTE(review): unlike ClearL2Cache, AllocateOffset is not reset here, so page backing keeps being handed out
  // from the old offset until AllocateBackingForPage runs out and ClearL2Cache is called — confirm this is intended.
  CachedCodePages.clear();
}

// Clears this thread's L1/L2 and then the shared GuestToHostMap, under the caller's write lock.
void LookupCache::ClearCache(const LookupCacheWriteLockToken& lk) {
  // Clear L1 and L2 by clearing the full cache.
  ClearThreadLocalCaches(lk);
  Shared->ClearCache(lk);
}

// Replaces the block-link map with a fresh one and empties the block list.
void GuestToHostMap::ClearCache(const LookupCacheWriteLockToken&) {
  // Allocate a new pointer from the BlockLinks pma again.
  // NOTE(review): the previous map is abandoned and BlockLinks_mbr is not released here, so its memory presumably
  // stays held by the monotonic resource — confirm it is released elsewhere.
  BlockLinks = BlockLinks_pma->new_object();
  // All code is gone, clear the block list
  BlockList.clear();
}
} // namespace FEXCore

================================================
FILE: FEXCore/Source/Interface/Core/LookupCache.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Interface/Context/Context.h"
#include
#include
#include "Utils/WritePriorityMutex.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace FEXCore {

// Common base of the two lock tokens, so an API can accept either a read or a write token.
struct LookupCacheBaseLockToken {
protected:
  // Protected constructor - only derived classes can construct
  LookupCacheBaseLockToken() = default;
};

// Holds the GuestToHostMap mutex exclusively for the token's lifetime.
// Passed to functions as proof that the write lock is held.
struct LookupCacheWriteLockToken : public LookupCacheBaseLockToken {
private:
  // Only constructible by GuestToHostMap
  friend struct GuestToHostMap;
  LookupCacheWriteLockToken(FEXCore::Utils::WritePriorityMutex::Mutex& Mutex)
    : Lock {Mutex} {}

  std::lock_guard Lock;
};

// Holds the GuestToHostMap mutex in shared mode for the token's lifetime.
struct LookupCacheReadLockToken : public LookupCacheBaseLockToken {
private:
  // Only constructible by GuestToHostMap
  friend struct GuestToHostMap;
  LookupCacheReadLockToken(FEXCore::Utils::WritePriorityMutex::Mutex& Mutex)
    : Lock {Mutex} {}

  std::shared_lock Lock;
};

// Guest -> host block database (the "L3" level consulted by LookupCache::FindBlock), plus the
// block-link and code-page bookkeeping needed for invalidation. Guarded by `Lock`.
struct GuestToHostMap {
  FEXCore::Utils::WritePriorityMutex::Mutex Lock {};

  [[nodiscard]]
  LookupCacheWriteLockToken AcquireWriteLock() {
    return LookupCacheWriteLockToken {Lock};
  }

  [[nodiscard]]
  LookupCacheReadLockToken AcquireReadLock() {
    return LookupCacheReadLockToken {Lock};
  }

  // Key of the block-link map: a guest destination address and the host-side link record that jumps to it.
  struct BlockLinkTag {
    uint64_t GuestDestination;
    FEXCore::Context::ExitFunctionLinkData* HostLink;
bool operator<(const BlockLinkTag& other) const {
      // Order by GuestDestination first, then by HostLink pointer value.
      // This keeps all links to one destination contiguous in the map, which Erase() relies on.
      if (GuestDestination < other.GuestDestination) {
        return true;
      } else if (GuestDestination == other.GuestDestination) {
        return HostLink < other.HostLink;
      } else {
        return false;
      }
    }
  };

  // Use a monotonic buffer resource to allocate both the std::pmr::map and its members.
  // This allows us to quickly clear the block link map by clearing the monotonic allocator.
  // If we had allocated the block link map without the MBR, then clearing the map would require slowly
  // walking each block member and destructing objects.
  //
  // This makes `BlockLinks` look like a raw pointer that could memory leak, but since it is backed by the MBR, it won't.
  fextl::pmr::named_monotonic_page_buffer_resource BlockLinks_mbr;
  using BlockLinksMapType = std::pmr::map;
  fextl::unique_ptr> BlockLinks_pma;
  BlockLinksMapType* BlockLinks;

  // One compiled block: its host code address and the guest code pages recorded for it.
  struct BlockEntry {
    uint64_t HostCode;
    fextl::vector CodePages;
  };

  // Guest block address -> BlockEntry.
  fextl::robin_map BlockList;
  // Guest page index (address >> 12) -> guest block addresses registered for that page.
  fextl::map> CodePages;

  GuestToHostMap();

  // Adds to Guest -> Host code mapping
  // Returns a reference to the entry stored in BlockList for Address.
  const BlockEntry& AddBlockMapping(uint64_t Address, const fextl::vector& CodePages, void* HostCode, const LookupCacheWriteLockToken&) {
    // This may replace an existing mapping
    // NOTE: Generally no previous entry should exist, however there is one exception:
    // If the backend updates the active thread's CodeBuffer, the new associated LookupCache
    // may already contain the block address. Since this is comparatively rare, we'll just leak
    // one of the two blocks in this case.
return BlockList.insert_or_assign(Address, BlockEntry {(uintptr_t)HostCode, CodePages}).first->second;
  }

  // Looks up the block starting at Address.
  // Returns nullptr when no block is registered; otherwise a pointer into BlockList.
  const BlockEntry* FindBlock(uint64_t Address, const LookupCacheReadLockToken&) {
    auto HostCode = BlockList.find(Address);

    if (HostCode == BlockList.end()) {
      return nullptr;
    }

    return &HostCode->second;
  }

  // Removes the block at Address and undoes every link that targets it.
  // Returns true if a block was actually removed from BlockList.
  bool Erase(uint64_t Address, const LookupCacheWriteLockToken&) {
    // Sever any links to this block
    // The two bounds bracket every BlockLinkTag whose GuestDestination == Address (smallest to largest HostLink).
    auto lower = BlockLinks->lower_bound({Address, nullptr});
    auto upper = BlockLinks->upper_bound({Address, reinterpret_cast(UINTPTR_MAX)});
    for (auto it = lower; it != upper; it = BlockLinks->erase(it)) {
      // Call the stored delinker on the host link record, then drop the entry.
      it->second(it->first.HostLink);
    }

    // Remove from BlockList
    return BlockList.erase(Address) != 0;
  }

  // Erases every block registered on a code page overlapping [Start, Start + Length), then forgets those pages.
  // Takes the write lock itself.
  // NOTE(review): with Length == 0, `Start + Length - 1` wraps or lands on the previous page, so `upper` can precede
  // `lower` — confirm callers never pass a zero length.
  void InvalidateRange(uint64_t Start, uint64_t Length) {
    auto lk = AcquireWriteLock();
    auto lower = CodePages.lower_bound(Start >> 12);
    auto upper = CodePages.upper_bound((Start + Length - 1) >> 12);

    for (auto it = lower; it != upper; it++) {
      for (const auto& Entry : it->second) {
        Erase(Entry, lk);
      }
    }

    CodePages.erase(lower, upper);
  }

  // Records that HostLink jumps to GuestDestination, and how to undo the link when that block is erased.
  void AddBlockLink(uint64_t GuestDestination, FEXCore::Context::ExitFunctionLinkData* HostLink,
                    const FEXCore::Context::BlockDelinkerFunc& delinker, const LookupCacheWriteLockToken&) {
    BlockLinks->insert({{GuestDestination, HostLink}, delinker});
  }

  // Appends Addresses to every code page overlapping [Start, Start + Length).
  // Returns true if at least one of those pages had no blocks registered before.
  bool AddBlockExecutableRange(const std::ranges::input_range auto& Addresses, uint64_t Start, uint64_t Length, const LookupCacheWriteLockToken&) {
    bool rv = false;

    for (auto CurrentPage = Start >> 12, EndPage = (Start + Length - 1) >> 12; CurrentPage <= EndPage; CurrentPage++) {
      // operator[] creates the page entry on first use.
      auto& CodePage = CodePages[CurrentPage];
      rv |= CodePage.empty();
      CodePage.insert(CodePage.end(), Addresses.begin(), Addresses.end());
    }

    return rv;
  }

  void ClearCache(const LookupCacheWriteLockToken&);
};

// Per-thread block lookup cache: an L1 and an L2 table in front of the shared GuestToHostMap ("L3").
class LookupCache {
public:
  // One cache slot: the host code address and the guest address it belongs to.
  struct LookupCacheEntry {
    uintptr_t HostCode;
    uintptr_t GuestCode;
  };

  LookupCache(FEXCore::Context::ContextImpl* CTX);
  ~LookupCache();

  // Swaps out the underlying GuestToHostMap and clears all
// associated caches.
  // This interface requires the previous CodeBuffer to be provided despite not using it. This ensures the shared write lock is still valid.
  void ChangeGuestToHostMapping([[maybe_unused]] CPU::CodeBuffer& Prev, GuestToHostMap& NewMap, const LookupCacheWriteLockToken& lk) {
    ClearThreadLocalCaches(lk);
    Shared = &NewMap;
  }

  // Returns the host code address for the guest block at Address, or 0 if no level knows it.
  // Lookup order: L1 (no lock), then L2 and L3 under the shared read lock.
  // An L2 hit is copied into L1; an L3 hit is cached through CacheBlockMapping.
  uintptr_t FindBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address) {
    // Try L1, no lock needed
    auto& L1Entry = reinterpret_cast(L1Pointer)[Address & L1PointerMask];
    if (L1Entry.GuestCode == Address) {
      return L1Entry.HostCode;
    }

    // L2 and L3 need to be locked
    uintptr_t HostPtr {};
    {
      // LockTime is only engaged while the lock is being acquired; it is reset right after.
      std::optional> LockTime(
        Thread->ThreadStats ? &Thread->ThreadStats->AccumulatedCacheReadLockTime : nullptr);
      auto lk = Shared->AcquireReadLock();
      LockTime.reset();

      if (!DisableL2Cache()) {
        // Try L2
        // The page index is taken modulo the configured virtual memory size; the offset is the byte within the 4KB page.
        const auto PageIndex = (Address & (VirtualMemSize - 1)) >> 12;
        const auto PageOffset = Address & (0x0FFF);

        const auto Pointers = reinterpret_cast(PagePointer);
        auto LocalPagePointer = Pointers[PageIndex];

        // Do we have a page pointer for this address?
        if (LocalPagePointer) {
          // Find the pointer for the address in the blocks
          auto BlockPointers = reinterpret_cast(LocalPagePointer);

          // The full guest address is compared, so an aliasing address in the same slot doesn't produce a false hit.
          if (BlockPointers[PageOffset].GuestCode == Address) {
            L1Entry.GuestCode = Address;
            L1Entry.HostCode = BlockPointers[PageOffset].HostCode;
            HostPtr = L1Entry.HostCode;
          }
        }
      }

      if (!HostPtr) {
        // Try L3
        auto Entry = Shared->FindBlock(Address, lk);

        if (Entry) {
          CacheBlockMapping(Address, *Entry, false, lk);
          HostPtr = Entry->HostCode;
        }
      }
    }

    if (HostPtr && DynamicL1Cache()) {
      UpdateDynamicL1Stats(Thread);
    }

    // Reached on every L1 miss, whether or not L2/L3 found the block.
    FEXCORE_PROFILE_INSTANT_INCREMENT(Thread, AccumulatedCacheMissCount, 1);
    return HostPtr;
  }

  // Counts an L2/L3 hit and, once per SamplePeriod, resizes L1 based on the hit rate.
  void UpdateDynamicL1Stats(FEXCore::Core::InternalThreadState* Thread) {
    // If host pointer was found in L2 or L3, then add it to the counter.
    // Keeping track not L1 misses, but specifically L2/L3 hits.
++L2L3CacheHits;

    // NOTE(review): system_clock can jump when the wall clock is adjusted; a monotonic clock may suit interval
    // measurement better — confirm before changing, since LastPeriod's type has to match.
    const auto CurrentTime = std::chrono::system_clock::now();
    const auto Period = CurrentTime - LastPeriod;
    if (Period >= SamplePeriod) {
      // If larger than the sample period then check if we need to increase L1 cache size.
      // Hits per second: the hit count divided by the elapsed milliseconds, scaled by 1000.
      const double AveragePerSecond = static_cast(L2L3CacheHits) / static_cast(std::chrono::duration_cast(Period).count()) * 1000.0;

      if (AveragePerSecond >= DynamicL1CacheIncreaseCountHeuristic()) {
        if (CurrentL1Entries < MAX_L1_ENTRIES) {
          // Double the number of L1 entries.
          CurrentL1Entries <<= 1;
          L1PointerMask = CurrentL1Entries - 1;

          // Update the thread's L1 pointer mask to increase how much cache it uses.
          // Since we're in C-code, this is safe to update here.
          Thread->CurrentFrame->State.L1Mask = GetScaledL1PointerMask();
        }
      } else if (AveragePerSecond < DynamicL1CacheDecreaseCountHeuristic()) {
        if (CurrentL1Entries > MIN_L1_ENTRIES) {
          // Halve the number of L1 entries.
          CurrentL1Entries >>= 1;
          L1PointerMask = CurrentL1Entries - 1;

          // Madvise the entries that we are dropping. Gives the memory back to the OS.
          // The range runs from the first entry past the new size to the end of the maximum-size L1.
          LookupCacheEntry* FirstZeroL1Entry = &reinterpret_cast(L1Pointer)[CurrentL1Entries];
          size_t ZeroMemorySize = (MAX_L1_ENTRIES - CurrentL1Entries) * sizeof(LookupCacheEntry);
          FEXCore::Allocator::VirtualDontNeed(FirstZeroL1Entry, ZeroMemorySize, false);

          // Update the thread's L1 pointer mask to decrease how much cache it uses.
          // Since we're in C-code, this is safe to update here.
          Thread->CurrentFrame->State.L1Mask = GetScaledL1PointerMask();
        }
      }
      // NOTE(review): entries in the retained lower part of L1 are not touched on resize, and InvalidateCache only
      // clears the slot selected by the current mask. Confirm a grow -> invalidate -> shrink sequence cannot leave
      // a stale entry reachable.

      // Update Last period to start again.
      LastPeriod = CurrentTime;
      L2L3CacheHits = 0;
    }
  }

  // The GuestToHostMap ("L3") this cache sits in front of; replaced by ChangeGuestToHostMapping.
  GuestToHostMap* Shared = nullptr;

  // Appends a list of Block {Address} to CodePages [Start, Start + Length)
  // Returns true if new pages are marked as containing code
  bool AddBlockExecutableRange(FEXCore::Core::InternalThreadState* Thread, const fextl::set& Addresses, uint64_t Start, uint64_t Length) {
    std::optional> LockTime(
      Thread->ThreadStats ?
&Thread->ThreadStats->AccumulatedCacheWriteLockTime : nullptr);
    auto lk = Shared->AcquireWriteLock();
    // LockTime is only engaged while the lock is being acquired.
    LockTime.reset();
    return Shared->AddBlockExecutableRange(Addresses, Start, Length, lk);
  }

  // Adds to Guest -> Host code mapping
  // Takes the shared write lock, stores the block in the GuestToHostMap and seeds this thread's L1 with it.
  void AddBlockMapping(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, const fextl::vector& CodePages, void* HostCode) {
    std::optional> LockTime(
      Thread->ThreadStats ? &Thread->ThreadStats->AccumulatedCacheWriteLockTime : nullptr);
    auto lk = Shared->AcquireWriteLock();
    LockTime.reset();

    const auto& Entry = Shared->AddBlockMapping(Address, CodePages, HostCode, lk);

    // There is no need to update L1 or L2, they will get updated on first lookup
    // However, adding to L1 here increases performance
    CacheBlockMapping(Address, Entry, true, lk);
  }

  // Invalidates L1/L2 for a given guest block
  void InvalidateCache(uint64_t Address, const LookupCacheWriteLockToken& lk) {
    // Do L1
    // Only the slot selected by the current L1 mask is checked.
    auto& L1Entry = reinterpret_cast(L1Pointer)[Address & L1PointerMask];
    if (L1Entry.GuestCode == Address) {
      L1Entry.GuestCode = 0;
      // Leave L1Entry.HostCode as is, so that concurrent lookups won't read a null pointer
      // This is a soft guarantee for cross thread invalidation, as atomics are not used
      // and it hasn't been thoroughly tested
    }

    if (!DisableL2Cache()) {
      // Do full map
      // From here on `Address` is reused as the page index.
      Address = Address & (VirtualMemSize - 1);
      uint64_t PageOffset = Address & (0x0FFF);
      Address >>= 12;

      uintptr_t* Pointers = reinterpret_cast(PagePointer);
      uint64_t LocalPagePointer = Pointers[Address];
      if (!LocalPagePointer) {
        // Page for this code didn't even exist, nothing to do
        return;
      }

      // Page exists, just set the offset to zero
      // The slot is cleared without checking which guest address it currently holds.
      auto BlockPointers = reinterpret_cast(LocalPagePointer);
      BlockPointers[PageOffset].GuestCode = 0;
      BlockPointers[PageOffset].HostCode = 0;
    }
  }

  // Invalidates all L1/L2 entries for all guest block that intersect the given range
  // Returns true if any cached code page fell inside the range.
  bool InvalidateCacheRange(uint64_t Start, uint64_t Length) {
    auto lk = Shared->AcquireWriteLock();

    auto lower =
CachedCodePages.lower_bound(Start >> 12);
    // NOTE(review): with Length == 0, `Start + Length - 1` wraps or lands on the previous page, so `upper` can
    // precede `lower` — confirm callers never pass a zero length.
    auto upper = CachedCodePages.upper_bound((Start + Length - 1) >> 12);

    // Drop the L1/L2 entries of every block recorded on the affected pages.
    for (auto it = lower; it != upper; it++) {
      for (const auto& Entry : it->second) {
        InvalidateCache(Entry, lk);
      }
    }

    // Capture the result before erase() invalidates the iterators.
    bool ret = upper != lower;
    CachedCodePages.erase(lower, upper);
    return ret;
  }

  // Forwards to the shared GuestToHostMap.
  void AddBlockLink(uint64_t GuestDestination, FEXCore::Context::ExitFunctionLinkData* HostLink,
                    const FEXCore::Context::BlockDelinkerFunc& delinker, const LookupCacheWriteLockToken& lk) {
    Shared->AddBlockLink(GuestDestination, HostLink, delinker, lk);
  }

  void ClearCache(const LookupCacheWriteLockToken&);
  void ClearL2Cache(const LookupCacheBaseLockToken&);
  void ClearThreadLocalCaches(const LookupCacheWriteLockToken&);

  // Base address of the L1 table.
  uintptr_t GetL1Pointer() const {
    return L1Pointer;
  }

  // The L1 index mask shifted left by log2(sizeof(LookupCacheEntry)), so it masks a byte offset instead of an index.
  uintptr_t GetScaledL1PointerMask() const {
    return L1PointerMask << FEXCore::ilog2(sizeof(LookupCache::LookupCacheEntry));
  }

  // Base address of the L2 page pointer table.
  uintptr_t GetPagePointer() const {
    return PagePointer;
  }

  uintptr_t GetVirtualMemorySize() const {
    return VirtualMemSize;
  }

  // This needs to be taken before reads or writes to L2, L3, CodePages,
  // and before writes to L1. Concurrent access from a thread that this LookupCache doesn't belong to
  // may only happen during cross thread invalidation (::Erase).
  // All other operations must be done from the owning thread.
  // Some care is taken so that L1 lookups can be done without locks, and even tearing is unlikely to lead to a crash.
  // This approach has not been fully vetted yet.
  // Also note that L1 lookups might be inlined in the JIT Dispatcher and/or block ends.
auto AcquireWriteLock() {
    return Shared->AcquireWriteLock();
  }

private:
  // Records the block in CachedCodePages and writes it into L1.
  // Unless L1Only is set or L2 is disabled, it is also written into the L2 page table.
  // Accepts either lock token type, so it can be called from both the read-locked and write-locked paths.
  void CacheBlockMapping(uint64_t Address, const GuestToHostMap::BlockEntry& Entry, bool L1Only, const LookupCacheBaseLockToken& lk) {
    // Remember which code pages this block was cached for, so InvalidateCacheRange can find it again.
    for (const auto& CodePage : Entry.CodePages) {
      CachedCodePages[CodePage >> 12].insert(Address);
    }

    // Do L1
    auto& L1Entry = reinterpret_cast(L1Pointer)[Address & L1PointerMask];
    L1Entry.GuestCode = Address;
    L1Entry.HostCode = Entry.HostCode;

    if (!DisableL2Cache() && !L1Only) {
      // Do full map
      // Keep the unmasked address: `Address` is reused as the page index below.
      auto FullAddress = Address;
      Address = Address & (VirtualMemSize - 1);
      uint64_t PageOffset = Address & (0x0FFF);
      Address >>= 12;

      uintptr_t* Pointers = reinterpret_cast(PagePointer);
      uint64_t LocalPagePointer = Pointers[Address];
      if (!LocalPagePointer) {
        // We don't have a page pointer for this address
        // Allocate one now if we can
        uintptr_t NewPageBacking = AllocateBackingForPage();
        if (!NewPageBacking) {
          // Couldn't allocate, clear L2 and retry
          // ClearL2Cache resets AllocateOffset to 0 before the retry.
          ClearL2Cache(lk);
          CacheBlockMapping(FullAddress, Entry, false, lk);
          return;
        }
        Pointers[Address] = NewPageBacking;
        LocalPagePointer = NewPageBacking;
      }

      // Add the new pointer to the page block
      auto BlockPointers = reinterpret_cast(LocalPagePointer);

      // This silently replaces existing mappings
      BlockPointers[PageOffset].GuestCode = FullAddress;
      BlockPointers[PageOffset].HostCode = Entry.HostCode;
    }
  }

  // Bump-allocates SIZE_PER_PAGE bytes of L2 page backing from the CODE_SIZE region.
  // Returns the address of the new chunk, or 0 when the region is exhausted.
  // NOTE(review): `>=` also rejects the chunk that ends exactly at CODE_SIZE, leaving the last SIZE_PER_PAGE bytes
  // unused — confirm whether `>` was intended.
  uintptr_t AllocateBackingForPage() {
    uintptr_t NewBase = AllocateOffset;
    uintptr_t NewEnd = AllocateOffset + SIZE_PER_PAGE;

    if (NewEnd >= CODE_SIZE) {
      // We ran out of block backing space. Need to clear the block cache and tell the JIT cores to clear their caches as well
      // Tell whatever is calling this that it needs to do it.
return 0;
    }

    AllocateOffset = NewEnd;
    return PageMemory + NewBase;
  }

  // Maps from a page index to all blocks in the page that have at some point been fetched into L1/L2
  fextl::map> CachedCodePages;

  // Start of the reserved range; also the L2 page pointer table.
  uintptr_t PagePointer;
  // Start of the CODE_SIZE bytes of L2 page backing.
  uintptr_t PageMemory;
  // Start of the L1 table.
  uintptr_t L1Pointer;
  // Current number of L1 entries minus one; used as the L1 index mask.
  uintptr_t L1PointerMask;

  // Size in bytes of the whole reserved range (page pointer table + page backing + L1).
  size_t TotalCacheSize;

  // Start with 8k entries in L1 to give 128KB of L1 cache to each thread.
  // Max out at 1 million entries to give each thread 16MB of L1 cache maximum.
  constexpr static size_t MIN_L1_ENTRIES = 8 * 1024;        // Must be a power of 2
  constexpr static size_t MAX_L1_ENTRIES = 1 * 1024 * 1024; // Must be a power of 2
  // Bytes of L2 page backing available before ClearL2Cache has to be called.
  constexpr static size_t CODE_SIZE = 128 * 1024 * 1024;
  // Backing needed for one guest page: one LookupCacheEntry per byte of the page.
  constexpr static size_t SIZE_PER_PAGE = FEXCore::Utils::FEX_PAGE_SIZE * sizeof(LookupCacheEntry);
  constexpr static size_t MAX_L1_SIZE = MAX_L1_ENTRIES * sizeof(LookupCacheEntry);

  // Byte offset into the page backing of the next chunk AllocateBackingForPage will hand out.
  size_t AllocateOffset {};

  FEXCore::Context::ContextImpl* ctx;
  // Copy of the configured virtual memory size; `VirtualMemSize - 1` is used as an address mask.
  uint64_t VirtualMemSize {};

  // State for dynamic L1 sizing, see UpdateDynamicL1Stats.
  size_t CurrentL1Entries = MIN_L1_ENTRIES;
  uint64_t L2L3CacheHits {};
  std::chrono::time_point LastPeriod {};
  constexpr static std::chrono::seconds SamplePeriod {1};

  FEX_CONFIG_OPT(DynamicL1CacheIncreaseCountHeuristic, DYNAMICL1CACHEINCREASECOUNTHEURISTIC);
  FEX_CONFIG_OPT(DynamicL1CacheDecreaseCountHeuristic, DYNAMICL1CACHEDECREASECOUNTHEURISTIC);
  FEX_CONFIG_OPT(DynamicL1Cache, DYNAMICL1CACHE);
  FEX_CONFIG_OPT(DisableL2Cache, DISABLEL2CACHE);
};
} // namespace FEXCore

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-to-ir, opcodes|dispatcher-implementations
desc: Handles x86/64 AVX instructions to 128-bit IR
$end_info$
*/

#include "Interface/Core/X86Tables/X86Tables.h"
#include
#include "Interface/Core/OpcodeDispatcher.h"

#include
#include
#include
#include

namespace FEXCore::IR {
class OrderedNode;

// Parameter list shared by the opcode handlers in this file.
#define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op
// Loads an AVX source operand as a pair of 128-bit halves.
// - Register operand: Low is the XMM register; High is the upper half when NeedsHigh is set, otherwise nullptr.
// - Memory operand: one paired load when NeedsHigh is set, otherwise a single 128-bit load into Low.
// VSIB-addressed operands are rejected here; they go through AVX128_LoadVSIB.
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_LoadSource_WithOpSize(
  const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh, MemoryAccessType AccessType) {

  if (Operand.IsGPR()) {
    const auto gpr = Operand.Data.GPR.GPR;
    LOGMAN_THROW_A_FMT(gpr >= FEXCore::X86State::REG_XMM_0 && gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg");
    // Convert the register enum value to a zero-based XMM index.
    const auto gprIndex = gpr - X86State::REG_XMM_0;
    return {
      .Low = AVX128_LoadXMMRegister(gprIndex, false),
      .High = NeedsHigh ? AVX128_LoadXMMRegister(gprIndex, true) : nullptr,
    };
  } else {
    LOGMAN_THROW_A_FMT(IsOperandMem(Operand, true), "only memory sources");

    if (Operand.IsSIB()) {
      const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
      LOGMAN_THROW_A_FMT(!IsVSIB, "VSIB uses LoadVSIB instead");
    }

    const AddressMode A = DecodeAddress(Op, Operand, AccessType, true /* IsLoad */);

    if (NeedsHigh) {
      return _LoadMemPairFPRAutoTSO(OpSize::i128Bit, A, OpSize::i8Bit);
    } else {
      return {.Low = _LoadMemFPRAutoTSO(OpSize::i128Bit, A, OpSize::i8Bit)};
    }
  }
}

// Decodes a VSIB operand into its parts: the index vector (Low/High halves), an optional base address,
// the scale, and the displacement.
OpDispatchBuilder::RefVSIB OpDispatchBuilder::AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh) {
  const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0;
  LOGMAN_THROW_A_FMT((Operand.IsSIB() || Operand.IsSIBRelocation()) && IsVSIB, "Trying to load VSIB for something that isn't the correct "
                                                                                "type!");

  // VSIB is a very special case which has a ton of encoded data.
  // Get it in a format we can reason about.
const auto Index_gpr = Operand.Data.SIB.Index; const auto Base_gpr = Operand.Data.SIB.Base; LOGMAN_THROW_A_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg"); LOGMAN_THROW_A_FMT(Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15), "Base must be a GPR."); const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0; OpDispatchBuilder::RefVSIB A { .Low = AVX128_LoadXMMRegister(Index_XMM_gpr, false), .High = NeedsHigh ? AVX128_LoadXMMRegister(Index_XMM_gpr, true) : Invalid(), .BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr, .Scale = Operand.Data.SIB.Scale, }; if (Operand.IsSIBRelocation()) { auto EPOffset = _EntrypointOffset(OpSize::i64Bit, Operand.Data.SIB.Offset); if (A.BaseAddr) { A.BaseAddr = Add(OpSize::i64Bit, EPOffset, A.BaseAddr); } else { A.BaseAddr = EPOffset; } } else { A.Displacement = static_cast(Operand.Data.SIB.Offset); } return A; } void OpDispatchBuilder::AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand, const RefPair Src, MemoryAccessType AccessType) { if (Operand.IsGPR()) { const auto gpr = Operand.Data.GPR.GPR; LOGMAN_THROW_A_FMT(gpr >= FEXCore::X86State::REG_XMM_0 && gpr <= FEXCore::X86State::REG_XMM_15, "expected AVX register"); const auto gprIndex = gpr - X86State::REG_XMM_0; if (Src.Low) { AVX128_StoreXMMRegister(gprIndex, Src.Low, false); } if (Src.High) { AVX128_StoreXMMRegister(gprIndex, Src.High, true); } } else { AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */); if (Src.High) { _StoreMemPairFPRAutoTSO(OpSize::i128Bit, A, Src.Low, Src.High, OpSize::i8Bit); } else { _StoreMemFPRAutoTSO(OpSize::i128Bit, A, Src.Low, OpSize::i8Bit); } } } Ref OpDispatchBuilder::AVX128_LoadXMMRegister(uint32_t XMM, bool High) { if (High) { return 
LoadContext(AVXHigh0Index + XMM); } else { return LoadXMMRegister(XMM); } } void OpDispatchBuilder::AVX128_StoreXMMRegister(uint32_t XMM, const Ref Src, bool High) { if (High) { StoreContext(AVXHigh0Index + XMM, Src); } else { StoreXMMRegister(XMM, Src); } } void OpDispatchBuilder::AVX128_VMOVAPS(OpcodeArgs) { // Reg <- Mem or Reg <- Reg const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; if (Is128Bit) { // Zero upper 128-bits auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); ///< Zero upper bits when destination is GPR. if (Op->Dest.IsGPR()) { Src.High = LoadZeroVector(OpSize::i128Bit); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } else { // Copy or memory load auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } } void OpDispatchBuilder::AVX128_VMOVScalarImpl(OpcodeArgs, IR::OpSize ElementSize) { if (Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Src[1].IsGPR()) { // VMOVSS/SD xmm1, xmm2, xmm3 // Lower 128-bits are merged // Upper 128-bits are zero'd auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); Ref Result = _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Src1.Low, Src2.Low); auto High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result, .High = High}); } else if (Op->Dest.IsGPR()) { // VMOVSS/SD xmm1, mem32/mem64 Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[1], ElementSize, Op->Flags); auto High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Src, .High = High}); } else { // VMOVSS/SD mem32/mem64, xmm1 auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); StoreResultFPR_WithOpSize(Op, Op->Dest, Src.Low, ElementSize); } } void OpDispatchBuilder::AVX128_VMOVSD(OpcodeArgs) { 
AVX128_VMOVScalarImpl(Op, OpSize::i64Bit); } void OpDispatchBuilder::AVX128_VMOVSS(OpcodeArgs) { AVX128_VMOVScalarImpl(Op, OpSize::i32Bit); } void OpDispatchBuilder::AVX128_VectorALU(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); DeriveOp(Result_Low, IROp, _VAdd(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low)); if (Is128Bit) { auto High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = High}); } else { DeriveOp(Result_High, IROp, _VAdd(OpSize::i128Bit, ElementSize, Src1.High, Src2.High)); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = Result_High}); } } void OpDispatchBuilder::AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); DeriveOp(Result_Low, IROp, _VFSqrt(OpSize::i128Bit, ElementSize, Src.Low)); if (Is128Bit) { auto High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = High}); } else { DeriveOp(Result_High, IROp, _VFSqrt(OpSize::i128Bit, ElementSize, Src.High)); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = Result_High}); } } void OpDispatchBuilder::AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper) { const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); RefPair Result {}; Result.Low = Helper(ElementSize, Src.Low); if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } 
else { Result.High = Helper(ElementSize, Src.High); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper) { const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); RefPair Result {}; Result.Low = Helper(ElementSize, Src1.Low, Src2.Low); if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } else { Result.High = Helper(ElementSize, Src1.High, Src2.High); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3, std::function Helper) { const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); RefPair Result {}; Result.Low = Helper(ElementSize, Src1.Low, Src2.Low, Src3); if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } else { Result.High = Helper(ElementSize, Src1.High, Src2.High, Src3); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VectorShiftWideImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp) { const auto Is128Bit = GetSrcSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); // Incoming element size for the shift source is always 8-bytes in the lower register. 
DeriveOp(Low, IROp, _VUShrSWide(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low)); RefPair Result {}; Result.Low = Low; if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } else { DeriveOp(High, IROp, _VUShrSWide(OpSize::i128Bit, ElementSize, Src1.High, Src2.Low)); Result.High = High; } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VectorShiftImmImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; const uint64_t ShiftConstant = Op->Src[1].Literal(); auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); RefPair Result {}; if (ShiftConstant == 0) [[unlikely]] { Result = Src; } else { DeriveOp(Low, IROp, _VUShrI(OpSize::i128Bit, ElementSize, Src.Low, ShiftConstant)); Result.Low = Low; if (!Is128Bit) { DeriveOp(High, IROp, _VUShrI(OpSize::i128Bit, ElementSize, Src.High, ShiftConstant)); Result.High = High; } } if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VectorXOR(OpcodeArgs) { // Special case for vector xor with itself being the optimal way for x86 to zero vector registers. if (Op->Src[0].IsGPR() && Op->Src[1].IsGPR() && Op->Src[0].Data.GPR.GPR == Op->Src[1].Data.GPR.GPR) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(LoadZeroVector(OpSize::i128Bit))); return; } ///< Regular code path AVX128_VectorALU(Op, OP_VXOR, OpSize::i128Bit); } void OpDispatchBuilder::AVX128_VZERO(OpcodeArgs) { const auto DstSize = GetDstSize(Op); const auto IsVZEROALL = DstSize == Core::CPUState::XMM_AVX_REG_SIZE; const auto NumRegs = Is64BitMode ? 
16U : 8U; if (IsVZEROALL) { // NOTE: Despite the name being VZEROALL, this will still only ever // zero out up to the first 16 registers (even on AVX-512, where we have 32 registers) Ref ZeroVector {}; for (uint32_t i = 0; i < NumRegs; i++) { // Explicitly not caching named vector zero. This ensures that every register gets movi #0.0 directly. ZeroVector = LoadUncachedZeroVector(OpSize::i128Bit); AVX128_StoreXMMRegister(i, ZeroVector, false); } InvalidateHighAVXRegisters(); _ContextClear(offsetof(FEXCore::Core::CPUState, avx_high), sizeof(FEXCore::Core::CPUState::avx_high[0]) * NumRegs); } else { // Likewise, VZEROUPPER will only ever zero only up to the first 16 registers InvalidateHighAVXRegisters(); _ContextClear(offsetof(FEXCore::Core::CPUState, avx_high), sizeof(FEXCore::Core::CPUState::avx_high[0]) * NumRegs); } } void OpDispatchBuilder::AVX128_MOVVectorNT(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; if (Op->Dest.IsGPR()) { ///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2. RefPair Src {}; Ref SrcAddr = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.LoadData = false}); Src.Low = _VLoadNonTemporal(OpSize::i128Bit, SrcAddr, 0); if (Is128Bit) { Src.High = LoadZeroVector(OpSize::i128Bit); } else { Src.High = _VLoadNonTemporal(OpSize::i128Bit, SrcAddr, 16); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } else { auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit, MemoryAccessType::STREAM); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false}); if (Is128Bit) { // Single store non-temporal for 128-bit operations. 
_VStoreNonTemporal(OpSize::i128Bit, Src.Low, Dest, 0); } else { // For a 256-bit store, use a non-temporal store pair _VStoreNonTemporalPair(OpSize::i128Bit, Src.Low, Src.High, Dest, 0); } } } void OpDispatchBuilder::AVX128_MOVQ(OpcodeArgs) { RefPair Src {}; if (Op->Src[0].IsGPR()) { Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); } else { Src.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], OpSize::i64Bit, Op->Flags); } // This instruction is a bit special that if the destination is a register then it'll ZEXT the 64bit source to 256bit if (Op->Dest.IsGPR()) { // Zero bits [127:64] as well. Src.Low = VZeroExtendOperand(OpSize::i64Bit, Op->Src[0], Src.Low); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); Src.High = ZeroVector; AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } else { StoreResultFPR_WithOpSize(Op, Op->Dest, Src.Low, OpSize::i64Bit, OpSize::i64Bit); } } void OpDispatchBuilder::AVX128_VMOVLP(OpcodeArgs) { auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); if (!Op->Dest.IsGPR()) { ///< VMOVLPS/PD mem64, xmm1 StoreResultFPR_WithOpSize(Op, Op->Dest, Src1.Low, OpSize::i64Bit, OpSize::i64Bit); } else if (!Op->Src[1].IsGPR()) { ///< VMOVLPS/PD xmm1, xmm2, mem64 // Bits[63:0] come from Src2[63:0] // Bits[127:64] come from Src1[127:64] auto Src2 = MakeSegmentAddress(Op, Op->Src[1]); Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 0, Src2); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } else { ///< VMOVHLPS/PD xmm1, xmm2, xmm3 auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); Ref Result_Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Src1.Low, Src2.Low); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } } void OpDispatchBuilder::AVX128_VMOVHP(OpcodeArgs) { 
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); if (!Op->Dest.IsGPR()) { ///< VMOVHPS/PD mem64, xmm1 // Need to store Bits[127:64]. Use a vector element store. auto Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Dest); } else if (!Op->Src[1].IsGPR()) { ///< VMOVHPS/PD xmm2, xmm1, mem64 auto Src2 = MakeSegmentAddress(Op, Op->Src[1]); // Bits[63:0] come from Src1[63:0] // Bits[127:64] come from Src2[63:0] Ref Result_Low = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, 1, Src2); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } else { // VMOVLHPS xmm1, xmm2, xmm3 auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); Ref Result_Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Src1.Low, Src2.Low); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } } void OpDispatchBuilder::AVX128_VMOVDDUP(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; const auto IsSrcGPR = Op->Src[0].IsGPR(); RefPair Src {}; if (IsSrcGPR) { Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); } else { // Accesses from memory are a little weird. // 128-bit operation only loads 8-bytes. // 256-bit operation loads a full 32-bytes. 
if (Is128Bit) { Src.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], OpSize::i64Bit, Op->Flags); } else { Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true); } } if (Is128Bit) { // Duplicate Src[63:0] in to low 128-bits auto Result_Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 0); Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = ZeroVector}); } else { // Duplicate Src.Low[63:0] in to low 128-bits auto Result_Low = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 0); // Duplicate Src.High[63:0] in to high 128-bits auto Result_High = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 0); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = Result_High}); } } void OpDispatchBuilder::AVX128_VMOVSLDUP(OpcodeArgs) { AVX128_VectorUnaryImpl(Op, OpSizeFromSrc(Op), OpSize::i32Bit, [this](IR::OpSize ElementSize, Ref Src) { return _VTrn(OpSize::i128Bit, ElementSize, Src, Src); }); } void OpDispatchBuilder::AVX128_VMOVSHDUP(OpcodeArgs) { AVX128_VectorUnaryImpl(Op, OpSizeFromSrc(Op), OpSize::i32Bit, [this](IR::OpSize ElementSize, Ref Src) { return _VTrn2(OpSize::i128Bit, ElementSize, Src, Src); }); } void OpDispatchBuilder::AVX128_VBROADCAST(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; RefPair Src {}; if (Op->Src[0].IsGPR()) { Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); if (ElementSize != OpSize::i128Bit) { // Only duplicate if not VBROADCASTF128. Src.Low = _VDupElement(OpSize::i128Bit, ElementSize, Src.Low, 0); } } else { // Get the address to broadcast from into a GPR. 
Ref Address = MakeSegmentAddress(Op, Op->Src[0], GetGPROpSize()); Src.Low = _VBroadcastFromMem(OpSize::i128Bit, ElementSize, Address); } if (Is128Bit) { Src.High = LoadZeroVector(OpSize::i128Bit); } else { Src.High = Src.Low; } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } void OpDispatchBuilder::AVX128_VPUNPCKL(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPUNPCKH(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VZip2(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_MOVVectorUnaligned(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; if (!Is128Bit && Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { // Nop return; } auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); if (Is128Bit) { Src.High = LoadZeroVector(OpSize::i128Bit); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src); } void OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR(OpcodeArgs, IR::OpSize DstElementSize) { const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); RefPair Result {}; if (Op->Src[1].IsGPR()) { // If the source is a GPR then convert directly from the GPR. 
auto Src2 = LoadSourceGPR_WithOpSize(Op, Op->Src[1], GetGPROpSize(), Op->Flags); Result.Low = _VSToFGPRInsert(OpSize::i128Bit, DstElementSize, SrcSize, Src1.Low, Src2, false); } else if (SrcSize != DstElementSize) { // If the source is from memory but the Source size and destination size aren't the same, // then it is more optimal to load in to a GPR and convert between GPR->FPR. // ARM GPR->FPR conversion supports different size source and destinations while FPR->FPR doesn't. auto Src2 = LoadSourceGPR(Op, Op->Src[1], Op->Flags); Result.Low = _VSToFGPRInsert(DstSize, DstElementSize, SrcSize, Src1.Low, Src2, false); } else { // In the case of cvtsi2s{s,d} where the source and destination are the same size, // then it is more optimal to load in to the FPR register directly and convert there. auto Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); // Always signed Result.Low = _VSToFVectorInsert(DstSize, DstElementSize, DstElementSize, Src1.Low, Src2, false, false); } const auto Is128Bit = DstSize == OpSize::i128Bit; LOGMAN_THROW_A_FMT(Is128Bit, "Programming Error: This should never occur!"); Result.High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs, IR::OpSize SrcElementSize, bool HostRoundingMode) { // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. 
RefPair Src {}; if (Op->Src[0].IsGPR()) { Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); } else { Src.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcElementSize, Op->Flags); } Ref Result = CVTFPR_To_GPRImpl(Op, Src.Low, SrcElementSize, HostRoundingMode); StoreResultGPR(Op, Result); } void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VAndn(OpSize::i128Bit, _ElementSize, Src2, Src1); }); } void OpDispatchBuilder::AVX128_VPACKSS(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VSQXTNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPACKUS(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return _VSQXTUNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } Ref OpDispatchBuilder::AVX128_PSIGNImpl(IR::OpSize ElementSize, Ref Src1, Ref Src2) { Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1); Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1); return _VMul(OpSize::i128Bit, ElementSize, Src1, Control); } void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_UCOMISx(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = Op->Src[0].IsGPR() ? GetGuestVectorLength() : ElementSize; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, false); RefPair Src2 {}; // Careful here, if the source is from a GPR then we want to load the full 128-bit lower half. 
// If it is memory then we only want to load the element size. if (Op->Src[0].IsGPR()) { Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); } else { Src2.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); } Comiss(ElementSize, Src1.Low, Src2.Low); } void OpDispatchBuilder::AVX128_VectorScalarInsertALU(OpcodeArgs, FEXCore::IR::IROps IROp, IR::OpSize ElementSize) { // We load the full vector width when dealing with a source vector, // so that we don't do any unnecessary zero extension to the scalar // element that we're going to operate on. const auto SrcSize = OpSizeFromSrc(Op); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); RefPair Src2 {}; if (Op->Src[1].IsGPR()) { Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); } else { Src2.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags); } // If OpSize == ElementSize then it only does the lower scalar op DeriveOp(Result_Low, IROp, _VFAddScalarInsert(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, false)); auto High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, RefPair {.Low = Result_Low, .High = High}); } void OpDispatchBuilder::AVX128_VFCMP(OpcodeArgs, IR::OpSize ElementSize) { const uint8_t CompType = Op->Src[2].Literal(); struct { FEXCore::X86Tables::DecodedOp Op; uint32_t CompType {}; } Capture { .Op = Op, .CompType = CompType & 0b11111u, }; AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, &Capture](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return VFCMPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2, Capture.CompType); }); } void OpDispatchBuilder::AVX128_InsertScalarFCMP(OpcodeArgs, IR::OpSize ElementSize) { // We load the full vector width when dealing with a source vector, // so that we don't do any unnecessary zero extension to the scalar // element that we're going to operate on. 
const auto SrcSize = OpSizeFromSrc(Op); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); RefPair Src2 {}; if (Op->Src[1].IsGPR()) { Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); } else { Src2.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags); } const uint8_t CompType = Op->Src[2].Literal(); RefPair Result {}; Result.Low = InsertScalarFCMPOpImpl(OpSize::i128Bit, OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, CompType & 0b11111, false); Result.High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR(OpcodeArgs) { if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= FEXCore::X86State::REG_XMM_0) { ///< XMM <- Reg/Mem RefPair Result {}; if (Op->Src[0].IsGPR()) { // Loading from GPR and moving to Vector. Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], GetGPROpSize(), Op->Flags); // zext to 128bit Result.Low = _VCastFromGPR(OpSize::i128Bit, OpSizeFromSrc(Op), Src); } else { // Loading from Memory as a scalar. Zero extend Result.Low = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } Result.High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } else { ///< Reg/Mem <- XMM auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); if (Op->Dest.IsGPR()) { auto ElementSize = OpSizeFromDst(Op); // Extract element from GPR. Zero extending in the process. Src.Low = _VExtractToGPR(OpSizeFromSrc(Op), ElementSize, Src.Low, 0); StoreResultGPR(Op, Op->Dest, Src.Low); } else { // Storing first element to memory. 
Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false}); _StoreMemFPR(OpSizeFromDst(Op), Dest, Src.Low, OpSize::i8Bit); } } } void OpDispatchBuilder::AVX128_PExtr(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); uint64_t Index = Op->Src[1].Literal(); // Fixup of 32-bit element size. // When the element size is 32-bit then it can be overriden as 64-bit because the encoding of PEXTRD/PEXTRQ // is the same except that REX.W or VEX.W is set to 1. Incredibly frustrating. // Use the destination size as the element size in this case. auto OverridenElementSize = ElementSize; if (ElementSize == OpSize::i32Bit) { OverridenElementSize = DstSize; } // AVX version only operates on 128-bit. const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize); Index &= NumElements - 1; if (Op->Dest.IsGPR()) { const auto GPRSize = GetGPROpSize(); // Extract already zero extends the result. Ref Result = _VExtractToGPR(OpSize::i128Bit, OverridenElementSize, Src.Low, Index); StoreResultGPR_WithOpSize(Op, Op->Dest, Result, GPRSize); return; } // If we are storing to memory then we store the size of the element extracted Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src.Low, Index, Dest); } void OpDispatchBuilder::AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed) { const auto DstSize = OpSizeFromDst(Op); const auto GetSrc = [&] { if (Op->Src[0].IsGPR()) { return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low; } else { // For memory operands the 256-bit variant loads twice the size specified in the table. const auto Is256Bit = DstSize == OpSize::i256Bit; const auto SrcSize = OpSizeFromSrc(Op); const auto LoadSize = Is256Bit ? 
IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize; return LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags); } }; auto Transform = [=, this](Ref Src) { for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) { if (Signed) { Src = _VSXTL(OpSize::i128Bit, CurrentElementSize, Src); } else { Src = _VUXTL(OpSize::i128Bit, CurrentElementSize, Src); } } return Src; }; Ref Src = GetSrc(); RefPair Result {}; if (DstSize == OpSize::i128Bit) { // 128-bit operation is easy, it stays within the single register. Result.Low = Transform(Src); } else { // 256-bit operation is a bit special. It splits the incoming source between lower and upper registers. size_t TotalElementCount = IR::NumElements(OpSize::i256Bit, DstElementSize); size_t TotalElementsToSplitSize = (TotalElementCount / 2) * IR::OpSizeToSize(ElementSize); // Split the number of elements in half between lower and upper. Ref SrcHigh = _VDupElement(OpSize::i128Bit, IR::SizeToOpSize(TotalElementsToSplitSize), Src, 1); Result.Low = Transform(Src); Result.High = Transform(SrcHigh); } if (DstSize == OpSize::i128Bit) { // Regular zero-extending semantics. Result.High = LoadZeroVector(OpSize::i128Bit); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_MOVMSK(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = OpSizeFromSrc(Op); const auto Is128Bit = SrcSize == OpSize::i128Bit; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Mask8Byte = [this](Ref Src) { // UnZip2 the 64-bit elements as 32-bit to get the sign bits closer. // Sign bits are now in bit positions 31 and 63 after this. Src = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src); // Extract the low 64-bits to GPR in one move. Ref GPR = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 0); // BFI the sign bit in 31 in to 62. 
// Inserting the full lower 32-bits offset 31 so the sign bit ends up at offset 63. GPR = _Bfi(OpSize::i64Bit, 32, 31, GPR, GPR); // Shift right to only get the two sign bits we care about. return _Lshr(OpSize::i64Bit, GPR, Constant(62)); }; auto Mask4Byte = [this](Ref Src) { // Shift all the sign bits to the bottom of their respective elements. Src = _VUShrI(OpSize::i128Bit, OpSize::i32Bit, Src, 31); // Load the specific 128-bit movmskps shift elements operator. auto ConstantUSHL = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_MOVMSKPS_SHIFT); // Shift the sign bits in to specific locations. Src = _VUShl(OpSize::i128Bit, OpSize::i32Bit, Src, ConstantUSHL, false); // Add across the vector so the sign bits will end up in bits [3:0] Src = _VAddV(OpSize::i128Bit, OpSize::i32Bit, Src); // Extract to a GPR. return _VExtractToGPR(OpSize::i128Bit, OpSize::i32Bit, Src, 0); }; Ref GPR {}; if (Is128Bit) { if (ElementSize == OpSize::i64Bit) { GPR = Mask8Byte(Src.Low); } else { GPR = Mask4Byte(Src.Low); } } else if (ElementSize == OpSize::i32Bit) { auto GPRLow = Mask4Byte(Src.Low); auto GPRHigh = Mask4Byte(Src.High); GPR = _Orlshl(OpSize::i64Bit, GPRLow, GPRHigh, 4); } else { auto GPRLow = Mask8Byte(Src.Low); auto GPRHigh = Mask8Byte(Src.High); GPR = _Orlshl(OpSize::i64Bit, GPRLow, GPRHigh, 2); } StoreResultGPR_WithOpSize(Op, Op->Dest, GPR, GetGPROpSize()); } void OpDispatchBuilder::AVX128_MOVMSKB(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); Ref VMask = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_MOVMASKB); auto Mask1Byte = [this](Ref Src, Ref VMask) { auto VCMP = _VCMPLTZ(OpSize::i128Bit, OpSize::i8Bit, Src); auto VAnd = _VAnd(OpSize::i128Bit, OpSize::i8Bit, VCMP, VMask); auto VAdd1 = _VAddP(OpSize::i128Bit, OpSize::i8Bit, VAnd, VAnd); auto VAdd2 = _VAddP(OpSize::i128Bit, OpSize::i8Bit, VAdd1, 
VAdd1); auto VAdd3 = _VAddP(OpSize::i64Bit, OpSize::i8Bit, VAdd2, VAdd2); ///< 16-bits of data per 128-bit return _VExtractToGPR(OpSize::i128Bit, OpSize::i16Bit, VAdd3, 0); }; Ref Result = Mask1Byte(Src.Low, VMask); if (!Is128Bit) { auto ResultHigh = Mask1Byte(Src.High, VMask); Result = _Orlshl(OpSize::i64Bit, Result, ResultHigh, 16); } StoreResultGPR(Op, Result); } void OpDispatchBuilder::AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) { const auto NumElements = IR::NumElements(OpSize::i128Bit, ElementSize); const uint64_t Index = Imm.Literal() & (NumElements - 1); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Src1Op, Op->Flags, false); RefPair Result {}; if (Src2Op.IsGPR()) { // If the source is a GPR then convert directly from the GPR. auto Src2 = LoadSourceGPR_WithOpSize(Op, Src2Op, GetGPROpSize(), Op->Flags); Result.Low = _VInsGPR(OpSize::i128Bit, ElementSize, Index, Src1.Low, Src2); } else { // If loading from memory then we only load the element size auto Src2 = MakeSegmentAddress(Op, Src2Op); Result.Low = _VLoadVectorElement(OpSize::i128Bit, ElementSize, Src1.Low, Index, Src2); } Result.High = LoadZeroVector(OpSize::i128Bit); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VPINSRB(OpcodeArgs) { AVX128_PINSRImpl(Op, OpSize::i8Bit, Op->Src[0], Op->Src[1], Op->Src[2]); } void OpDispatchBuilder::AVX128_VPINSRW(OpcodeArgs) { AVX128_PINSRImpl(Op, OpSize::i16Bit, Op->Src[0], Op->Src[1], Op->Src[2]); } void OpDispatchBuilder::AVX128_VPINSRDQ(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); AVX128_PINSRImpl(Op, SrcSize, Op->Src[0], Op->Src[1], Op->Src[2]); } void OpDispatchBuilder::AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSizeFromSrc(Op), [this, IROp](IR::OpSize ElementSize, Ref Src1, Ref Src2) { DeriveOp(Shift, IROp, _VUShr(OpSize::i128Bit, 
ElementSize, Src1, Src2, true)); return Shift; }); } void OpDispatchBuilder::AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; const bool Right = Dir == ShiftDirection::RIGHT; const uint64_t Shift = Op->Src[1].Literal(); const uint64_t ExtrShift = Right ? Shift : IR::OpSizeToSize(OpSize::i128Bit) - Shift; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); RefPair Result {}; if (Shift == 0) [[unlikely]] { Result = Src; } else if (Shift >= Core::CPUState::XMM_SSE_REG_SIZE) { Result.Low = LoadZeroVector(OpSize::i128Bit); Result.High = Result.Low; } else { Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); RefPair Zero {ZeroVector, ZeroVector}; RefPair Src1 = Right ? Zero : Src; RefPair Src2 = Right ? Src : Zero; Result.Low = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1.Low, Src2.Low, ExtrShift); if (!Is128Bit) { Result.High = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1.High, Src2.High, ExtrShift); } } if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VINSERT(OpcodeArgs) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; const auto Selector = Op->Src[2].Literal() & 1; auto Result = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); if (Selector == 0) { // Insert in to Low bits Result.Low = Src2.Low; } else { // Insert in to the High bits Result.High = Src2.Low; } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VINSERTPS(OpcodeArgs) { Ref Result = InsertPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]); AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(Result)); } void OpDispatchBuilder::AVX128_VPHSUB(OpcodeArgs, IR::OpSize ElementSize) { 
AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBOpImpl(OpSize::i128Bit, Src1, Src2, _ElementSize); }); } void OpDispatchBuilder::AVX128_VPHSUBSW(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHSUBSOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VADDSUBP(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return ADDSUBPOpImpl(OpSize::i128Bit, _ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMULL(OpcodeArgs, IR::OpSize ElementSize, bool Signed) { LOGMAN_THROW_A_FMT(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [&](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULLOpImpl(OpSize::i128Bit, _ElementSize, Signed, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMULHRSW(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { return PMULHRSWOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMULHW(OpcodeArgs, bool Signed) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [&](IR::OpSize _ElementSize, Ref Src1, Ref Src2) -> Ref { if (Signed) { return _VSMulH(OpSize::i128Bit, _ElementSize, Src1, Src2); } else { return _VUMulH(OpSize::i128Bit, _ElementSize, Src1, Src2); } }); } void OpDispatchBuilder::AVX128_InsertScalar_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize) { // Gotta be careful with this operation. // It inserts in to the lowest element, retaining the remainder of the lower 128-bits. // Then zero extends the top 128-bit. const auto SrcSize = Op->Src[1].IsGPR() ? 
OpSize::i128Bit : SrcElementSize;
  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
  // Only the scalar element is consumed, so garbage above it in the loaded value is tolerated.
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  Ref Result = _VFToFScalarInsert(OpSize::i128Bit, DstElementSize, SrcElementSize, Src1.Low, Src2, false);
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(Result));
}

// Packed float <-> double conversion. The source and destination may differ in
// width (128-bit vs 256-bit), so the operation is decomposed into 128-bit
// halves according to which side is the wider one.
void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto DstSize = OpSizeFromDst(Op);

  const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit;
  auto Is128BitSrc = SrcSize == OpSize::i128Bit;
  auto Is128BitDst = DstSize == OpSize::i128Bit;

  ///< Decompose correctly.
  // A widening conversion with a 256-bit destination is treated as having a 128-bit source;
  // a narrowing conversion with a 256-bit source is treated as having a 128-bit destination.
  if (DstElementSize > SrcElementSize && !Is128BitDst) {
    Is128BitSrc = true;
  } else if (SrcElementSize > DstElementSize && !Is128BitSrc) {
    Is128BitDst = true;
  }

  // A 32-bit-element memory source is loaded at half the decoded source size.
  const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2) : SrcSize;

  RefPair Src {};
  if (Op->Src[0].IsGPR() || LoadSize >= OpSize::i128Bit) {
    Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128BitSrc);
  } else {
    // Handle 64-bit memory source.
    // In the case of cvtps2pd xmm, m64.
Src.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags);
  }

  RefPair Result {};

  // Converts the elements of one 128-bit register between the two float sizes.
  auto TransformLow = [&](Ref Src) -> Ref {
    return _Vector_FToF(OpSize::i128Bit, DstElementSize, Src, SrcElementSize);
  };

  // Produces the upper destination lane of the widening ymm <- xmm form.
  auto TransformHigh = [&](Ref Src) -> Ref {
    return _VFCVTL2(OpSize::i128Bit, SrcElementSize, Src);
  };

  Result.Low = TransformLow(Src.Low);
  if (Is128BitSrc) {
    if (Is128BitDst) {
      // cvtps2pd xmm, xmm or cvtpd2ps xmm, xmm
      // Done here
    } else {
      LOGMAN_THROW_A_FMT(DstElementSize > SrcElementSize, "cvtpd2ps ymm, xmm doesn't exist");

      // cvtps2pd ymm, xmm
      Result.High = TransformHigh(Src.Low);
    }
  } else {
    // 256-bit src
    LOGMAN_THROW_A_FMT(Is128BitDst, "Not real: cvt{ps2pd,pd2ps} ymm, ymm");
    LOGMAN_THROW_A_FMT(DstElementSize < SrcElementSize, "cvtps2pd xmm, ymm doesn't exist");

    // cvtpd2ps xmm, ymm
    // The converted upper source lane is inserted as 64-bit element 1 of the low result.
    Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 0, Result.Low, TransformLow(Src.High));
  }

  if (Is128BitDst) {
    Result = AVX128_Zext(Result.Low);
  }

  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}

// Packed float/double to 32-bit integer conversion, performed per 128-bit lane
// through Vector_CVT_Float_To_Int32Impl.
void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs, IR::OpSize SrcElementSize, bool HostRoundingMode) {
  const auto SrcSize = GetSrcSize(Op);
  const auto Is128BitSrc = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;

  // VCVTPD2DQ/VCVTTPD2DQ only use the bottom lane, even for the 256-bit version.
  auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128BitSrc);
  RefPair Result {};

  Result.Low = Vector_CVT_Float_To_Int32Impl(Op, OpSize::i128Bit, Src.Low, OpSize::i128Bit, SrcElementSize, HostRoundingMode, Is128BitSrc);

  if (Is128BitSrc) {
    // Zero the upper 128-bit lane of the result.
Result = AVX128_Zext(Result.Low); } else { Result.High = Vector_CVT_Float_To_Int32Impl(Op, OpSize::i128Bit, Src.High, OpSize::i128Bit, SrcElementSize, HostRoundingMode, false); // Also convert the upper 128-bit lane if (SrcElementSize == OpSize::i64Bit) { // Zip the two halves together in to the lower 128-bits Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High); // Zero the upper 128-bit lane of the result. Result = AVX128_Zext(Result.Low); } } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float(OpcodeArgs, IR::OpSize SrcElementSize, bool Widen) { const auto Size = OpSizeFromDst(Op); const auto Is128Bit = Size == OpSize::i128Bit; RefPair Src = [&] { if (Widen && !Op->Src[0].IsGPR()) { // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto LoadSize = IR::SizeToOpSize(8 * (IR::OpSizeToSize(Size) / 16)); return RefPair {.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags)}; } else { return AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); } }(); auto Convert = [&](Ref Src, IROps Op) -> Ref { auto ElementSize = SrcElementSize; if (Widen) { DeriveOp(Extended, Op, _VSXTL(OpSize::i128Bit, ElementSize, Src)); Src = Extended; ElementSize = ElementSize << 1; } return _Vector_SToF(OpSize::i128Bit, ElementSize, Src); }; RefPair Result {}; Result.Low = Convert(Src.Low, IROps::OP_VSXTL); if (Is128Bit) { Result = AVX128_Zext(Result.Low); } else { if (Widen) { Result.High = Convert(Src.Low, IROps::OP_VSXTL2); } else { Result.High = Convert(Src.High, IROps::OP_VSXTL); } } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VEXTRACT128(OpcodeArgs) { const auto DstIsXMM = Op->Dest.IsGPR(); const auto Selector = Op->Src[1].Literal() & 0b1; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true); 
RefPair Result {}; if (Selector == 0) { Result.Low = Src.Low; } else { Result.Low = Src.High; } if (DstIsXMM) { // Only zero the upper-half when destination is XMM, otherwise this is a memory store. Result = AVX128_Zext(Result.Low); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VAESImc(OpcodeArgs) { ///< 128-bit only. AVX128_VectorUnaryImpl(Op, OpSize::i128Bit, OpSize::i128Bit, [this](IR::OpSize, Ref Src) { return _VAESImc(Src); }); } void OpDispatchBuilder::AVX128_VAESEnc(OpcodeArgs) { AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEnc(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESEncLast(OpcodeArgs) { AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESEncLast(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESDec(OpcodeArgs) { AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDec(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESDecLast(OpcodeArgs) { AVX128_VectorTrinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, LoadZeroVector(OpSize::i128Bit), [this](IR::OpSize, Ref Src1, Ref Src2, Ref Src3) { return _VAESDecLast(OpSize::i128Bit, Src1, Src2, Src3); }); } void OpDispatchBuilder::AVX128_VAESKeyGenAssist(OpcodeArgs) { ///< 128-bit only. 
const uint64_t RCON = Op->Src[1].Literal(); auto ZeroRegister = LoadZeroVector(OpSize::i128Bit); auto KeyGenSwizzle = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_AESKEYGENASSIST_SWIZZLE); struct { Ref ZeroRegister; Ref KeyGenSwizzle; uint64_t RCON; } Capture { .ZeroRegister = ZeroRegister, .KeyGenSwizzle = KeyGenSwizzle, .RCON = RCON, }; AVX128_VectorUnaryImpl(Op, OpSize::i128Bit, OpSize::i128Bit, [this, &Capture](IR::OpSize, Ref Src) { return _VAESKeyGenAssist(Src, Capture.KeyGenSwizzle, Capture.ZeroRegister, Capture.RCON); }); } void OpDispatchBuilder::AVX128_VPCMPESTRI(OpcodeArgs) { PCMPXSTRXOpImpl(Op, true, false); ///< Does not zero anything. } void OpDispatchBuilder::AVX128_VPCMPESTRM(OpcodeArgs) { PCMPXSTRXOpImpl(Op, true, true); ///< Zero the upper 128-bits of hardcoded YMM0 AVX128_StoreXMMRegister(0, LoadZeroVector(OpSize::i128Bit), true); } void OpDispatchBuilder::AVX128_VPCMPISTRI(OpcodeArgs) { PCMPXSTRXOpImpl(Op, false, false); ///< Does not zero anything. } void OpDispatchBuilder::AVX128_VPCMPISTRM(OpcodeArgs) { PCMPXSTRXOpImpl(Op, false, true); ///< Zero the upper 128-bits of hardcoded YMM0 AVX128_StoreXMMRegister(0, LoadZeroVector(OpSize::i128Bit), true); } void OpDispatchBuilder::AVX128_PHMINPOSUW(OpcodeArgs) { Ref Result = PHMINPOSUWOpImpl(Op); AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(Result)); } void OpDispatchBuilder::AVX128_VectorRound(OpcodeArgs, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); const auto Mode = Op->Src[1].Literal(); AVX128_VectorUnaryImpl(Op, Size, ElementSize, [this, Mode](IR::OpSize ElementSize, Ref Src) { return VectorRoundImpl(OpSize::i128Bit, ElementSize, Src, Mode); }); } void OpDispatchBuilder::AVX128_InsertScalarRound(OpcodeArgs, IR::OpSize ElementSize) { // We load the full vector width when dealing with a source vector, // so that we don't do any unnecessary zero extension to the scalar // element that we're going to operate on. 
const auto SrcSize = OpSizeFromSrc(Op); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false); RefPair Src2 {}; if (Op->Src[1].IsGPR()) { Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false); } else { Src2.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags); } // If OpSize == ElementSize then it only does the lower scalar op const auto SourceMode = TranslateRoundType(Op->Src[2].Literal()); Ref Result = _VFToIScalarInsert(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, SourceMode, false); AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(Result)); } void OpDispatchBuilder::AVX128_VDPP(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t Literal = Op->Src[2].Literal(); AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this, Literal](IR::OpSize ElementSize, Ref Src1, Ref Src2) { return DPPOpImpl(OpSize::i128Bit, Src1, Src2, Literal, ElementSize); }); } void OpDispatchBuilder::AVX128_VPERMQ(OpcodeArgs) { ///< Only ever 256-bit. 
auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true);
  const auto Selector = Op->Src[1].Literal();

  RefPair Result {};

  // Crack the operation in to two halves and implement per half
  // The low nibble of the immediate describes the low 128-bit result, the high nibble the high one.
  uint8_t SelectorLow = Selector & 0b1111;
  uint8_t SelectorHigh = (Selector >> 4) & 0b1111;
  // Builds one 128-bit result from a 4-bit selector. Bits [1:0] name the source
  // qword for result element 0 and bits [3:2] the one for result element 1;
  // qwords 0-1 live in Src.Low and qwords 2-3 in Src.High. The case labels are
  // written as 0b<element 1>'<element 0>.
  auto SelectLane = [this](uint8_t Selector, RefPair Src) -> Ref {
    LOGMAN_THROW_A_FMT(Selector < 16, "Selector too large!");

    switch (Selector) {
    case 0b00'00: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 0);
    case 0b00'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.Low, 8);
    case 0b00'10: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low);
    case 0b00'11: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.Low, Src.High, 8);
    // Identity selection of the low register.
    case 0b01'00: return Src.Low;
    case 0b01'01: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.Low, 1);
    case 0b01'10: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.High, Src.Low);
    case 0b01'11: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.High, Src.Low);
    case 0b10'00: return _VZip(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High);
    case 0b10'01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.Low, 8);
    case 0b10'10: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 0);
    case 0b10'11: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src.High, Src.High, 8);
    case 0b11'00: return _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src.Low, Src.High);
    case 0b11'01: return _VTrn2(OpSize::i128Bit, OpSize::i64Bit, Src.Low, Src.High);
    // Identity selection of the high register.
    case 0b11'10: return Src.High;
    case 0b11'11: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src.High, 1);
    default: FEX_UNREACHABLE;
    }
  };

  Result.Low = SelectLane(SelectorLow, Src);
  // When both halves use the same selector the low result is reused instead of being rebuilt.
  Result.High = SelectorLow == SelectorHigh ?
Result.Low : SelectLane(SelectorHigh, Src); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VPSHUFW(OpcodeArgs, bool Low) { auto Shuffle = Op->Src[1].Literal(); struct DataPacking { OpDispatchBuilder* This; uint8_t Shuffle; bool Low; }; DataPacking Pack { .This = this, .Shuffle = static_cast(Shuffle), .Low = Low, }; AVX128_VectorUnaryImpl(Op, OpSizeFromSrc(Op), OpSize::i16Bit, [Pack](IR::OpSize, Ref Src) { const auto IndexedVectorConstant = Pack.Low ? FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW : FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW; return Pack.This->PShufWLane(OpSize::i128Bit, IndexedVectorConstant, Pack.Low, Src, Pack.Shuffle); }); } void OpDispatchBuilder::AVX128_VSHUF(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; auto Shuffle = Op->Src[2].Literal(); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); RefPair Result {}; Result.Low = SHUFOpImpl(Op, OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, Shuffle); if (Is128Bit) { Result.High = LoadZeroVector(OpSize::i128Bit); } else { const uint8_t ShiftAmount = ElementSize == OpSize::i32Bit ? 
0 : 2; Result.High = SHUFOpImpl(Op, OpSize::i128Bit, ElementSize, Src1.High, Src2.High, Shuffle >> ShiftAmount); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VPERMILImm(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; const auto Selector = Op->Src[1].Literal() & 0xFF; auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); RefPair Result = AVX128_Zext(LoadZeroVector(OpSize::i128Bit)); if (ElementSize == OpSize::i64Bit) { auto DoSwizzle64 = [this](Ref Src, uint8_t Selector) -> Ref { switch (Selector) { case 0b00: case 0b11: return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src, Selector & 1); case 0b01: return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 8); case 0b10: // No swizzle return Src; default: FEX_UNREACHABLE; } }; Result.Low = DoSwizzle64(Src.Low, Selector & 0b11); if (!Is128Bit) { Result.High = DoSwizzle64(Src.High, (Selector >> 2) & 0b11); } } else { Result.Low = Single128Bit4ByteVectorShuffle(Src.Low, Selector); if (!Is128Bit) { Result.High = Single128Bit4ByteVectorShuffle(Src.High, Selector); } } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VHADDP(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [&](IR::OpSize ElementSize, Ref Src1, Ref Src2) { DeriveOp(Res, IROp, _VFAddP(OpSize::i128Bit, ElementSize, Src1, Src2)); return Res; }); } void OpDispatchBuilder::AVX128_VPHADDSW(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i16Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PHADDSOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPMADDUBSW(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDUBSWOpImpl(OpSize::i128Bit, Src1, Src2); }); } void 
OpDispatchBuilder::AVX128_VPMADDWD(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i128Bit, [this](IR::OpSize _ElementSize, Ref Src1, Ref Src2) { return PMADDWDOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VBLEND(OpcodeArgs, IR::OpSize ElementSize) { const auto SrcSize = OpSizeFromSrc(Op); const auto Is128Bit = SrcSize == OpSize::i128Bit; const uint64_t Selector = Op->Src[2].Literal(); ///< High Selector shift depends on element size: /// i16Bit: Reuses same bits, no shift /// i32Bit: Shift by 4 /// i64Bit: Shift by 2 const uint64_t SelectorShift = ElementSize == OpSize::i64Bit ? 2 : ElementSize == OpSize::i32Bit ? 4 : 0; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); RefPair Result {}; Result.Low = VectorBlend(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low, Selector); if (Is128Bit) { Result = AVX128_Zext(Result.Low); } else { Result.High = VectorBlend(OpSize::i128Bit, ElementSize, Src1.High, Src2.High, (Selector >> SelectorShift)); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VHSUBP(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), ElementSize, [&](IR::OpSize, Ref Src1, Ref Src2) { return HSUBPOpImpl(OpSize::i128Bit, ElementSize, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VPSHUFB(OpcodeArgs) { auto MaskVector = GeneratePSHUFBMask(OpSize::i128Bit); AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this, MaskVector](IR::OpSize, Ref Src1, Ref Src2) { return PSHUFBOpImpl(OpSize::i128Bit, Src1, Src2, MaskVector); }); } void OpDispatchBuilder::AVX128_VPSADBW(OpcodeArgs) { AVX128_VectorBinaryImpl(Op, OpSizeFromDst(Op), OpSize::i8Bit, [this](IR::OpSize, Ref Src1, Ref Src2) { return PSADBWOpImpl(OpSize::i128Bit, Src1, Src2); }); } void OpDispatchBuilder::AVX128_VMPSADBW(OpcodeArgs) { const auto SrcSize = 
GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; const uint64_t Selector = Op->Src[2].Literal(); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); RefPair Result {}; auto ZeroRegister = LoadZeroVector(OpSize::i128Bit); Result.Low = MPSADBWOpImpl(OpSize::i128Bit, Src1.Low, Src2.Low, Selector); if (Is128Bit) { Result.High = ZeroRegister; } else { Result.High = MPSADBWOpImpl(OpSize::i128Bit, Src1.High, Src2.High, Selector >> 3); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VPALIGNR(OpcodeArgs) { const auto Index = Op->Src[2].Literal(); const auto Size = OpSizeFromDst(Op); const auto SanitizedDstSize = std::min(Size, OpSize::i128Bit); AVX128_VectorBinaryImpl(Op, Size, SanitizedDstSize, [this, Index](IR::OpSize SanitizedDstSize, Ref Src1, Ref Src2) -> Ref { if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) { // If the immediate is greater than both vectors combined then it zeroes the vector return LoadZeroVector(OpSize::i128Bit); } if (Index == 0) { return Src2; } if (Index == 16) { return Src1; } auto SanitizedIndex = Index; if (Index > 16) { Src2 = Src1; Src1 = LoadZeroVector(OpSize::i128Bit); SanitizedIndex -= 16; } return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src2, SanitizedIndex); }); } void OpDispatchBuilder::AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) { const auto Is128Bit = DstSize == OpSize::i128Bit; auto Mask = AVX128_LoadSource_WithOpSize(Op, MaskOp, Op->Flags, !Is128Bit); const auto MakeAddress = [this, Op](const X86Tables::DecodedOperand& Data) { return MakeSegmentAddress(Op, Data, GetGPROpSize()); }; if (IsStore) { auto Address = MakeAddress(Op->Dest); auto Data = AVX128_LoadSource_WithOpSize(Op, DataOp, Op->Flags, !Is128Bit); 
_VStoreVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Data.Low, Address, Invalid(), MemOffsetType::SXTX, 1);
    if (!Is128Bit) {
      // The upper lane is stored 16 bytes past the base address.
      _VStoreVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, Data.High, Address, _InlineConstant(16), MemOffsetType::SXTX, 1);
    }
  } else {
    auto Address = MakeAddress(DataOp);

    RefPair Result {};
    Result.Low = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.Low, Address, Invalid(), MemOffsetType::SXTX, 1);

    if (Is128Bit) {
      Result.High = LoadZeroVector(OpSize::i128Bit);
    } else {
      // The upper lane is loaded from 16 bytes past the base address.
      Result.High = _VLoadVectorMasked(OpSize::i128Bit, ElementSize, Mask.High, Address, _InlineConstant(16), MemOffsetType::SXTX, 1);
    }
    AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
  }
}

// Masked move whose element size is taken from the decoded source size.
void OpDispatchBuilder::AVX128_VPMASKMOV(OpcodeArgs, bool IsStore) {
  AVX128_VMASKMOVImpl(Op, OpSizeFromSrc(Op), OpSizeFromDst(Op), IsStore, Op->Src[0], Op->Src[1]);
}

// Masked move with a caller-supplied element size.
void OpDispatchBuilder::AVX128_VMASKMOV(OpcodeArgs, IR::OpSize ElementSize, bool IsStore) {
  AVX128_VMASKMOVImpl(Op, ElementSize, OpSizeFromDst(Op), IsStore, Op->Src[0], Op->Src[1]);
}

// Byte-granular masked store to the address held in RDI: bytes whose mask byte
// has its top bit set are replaced by the source vector's bytes, the remaining
// bytes keep the value read from memory.
void OpDispatchBuilder::AVX128_MASKMOV(OpcodeArgs) {
  ///< This instruction only supports 128-bit.
  const auto Size = OpSizeFromSrc(Op);
  const auto Is128Bit = Size == OpSize::i128Bit;

  auto MaskSrc = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);

  // Mask only cares about the top bit of each byte
  MaskSrc.Low = _VCMPLTZ(Size, OpSize::i8Bit, MaskSrc.Low);

  // Vector that will overwrite byte elements.
auto VectorSrc = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit); // RDI source (DS prefix by default) auto MemDest = MakeSegmentAddress(X86State::REG_RDI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX); Ref XMMReg = _LoadMemFPR(Size, MemDest, OpSize::i8Bit); // If the Mask element high bit is set then overwrite the element with the source, else keep the memory variant XMMReg = _VBSL(Size, MaskSrc.Low, VectorSrc.Low, XMMReg); _StoreMemFPR(Size, MemDest, XMMReg, OpSize::i8Bit); } void OpDispatchBuilder::AVX128_VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); const auto Is128Bit = Size == OpSize::i128Bit; const auto Src3Selector = Op->Src[2].Literal(); auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit); uint8_t MaskRegister = (Src3Selector >> 4) & 0b1111; RefPair Mask {.Low = AVX128_LoadXMMRegister(MaskRegister, false)}; if (!Is128Bit) { Mask.High = AVX128_LoadXMMRegister(MaskRegister, true); } auto Convert = [&](Ref Src1, Ref Src2, Ref Mask) { const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize); Ref Shifted = _VSShrI(OpSize::i128Bit, ElementSize, Mask, ElementSizeBits - 1); return _VBSL(OpSize::i128Bit, Shifted, Src2, Src1); }; RefPair Result {}; Result.Low = Convert(Src1.Low, Src2.Low, Mask.Low); if (!Is128Bit) { Result.High = Convert(Src1.High, Src2.High, Mask.High); } else { Result = AVX128_Zext(Result.Low); } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_SaveAVXState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { RefPair Pair = LoadContextPair(OpSize::i128Bit, AVXHigh0Index + i); _StoreMemPairFPR(OpSize::i128Bit, Pair.Low, Pair.High, MemBase, i * 16 + 576); } } void OpDispatchBuilder::AVX128_RestoreAVXState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 
16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { auto YMMHRegs = LoadMemPairFPR(OpSize::i128Bit, MemBase, i * 16 + 576); AVX128_StoreXMMRegister(i, YMMHRegs.Low, true); AVX128_StoreXMMRegister(i + 1, YMMHRegs.High, true); } } void OpDispatchBuilder::AVX128_DefaultAVXState() { const auto NumRegs = Is64BitMode ? 16U : 8U; auto ZeroRegister = LoadZeroVector(OpSize::i128Bit); for (uint32_t i = 0; i < NumRegs; i++) { AVX128_StoreXMMRegister(i, ZeroRegister, true); } } void OpDispatchBuilder::AVX128_VPERM2(OpcodeArgs) { auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, true); const auto Selector = Op->Src[2].Literal(); RefPair Result = AVX128_Zext(LoadZeroVector(OpSize::i128Bit)); Ref Elements[4] = {Src1.Low, Src1.High, Src2.Low, Src2.High}; if ((Selector & 0b00001000) == 0) { Result.Low = Elements[Selector & 0b11]; } if ((Selector & 0b10000000) == 0) { Result.High = Elements[(Selector >> 4) & 0b11]; } AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VTESTP(OpcodeArgs, IR::OpSize ElementSize) { const auto Size = GetSrcSize(Op); const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE; auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit); auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); // For 128-bit, we use the common path. if (Is128Bit) { VTESTOpImpl(OpSize::i128Bit, ElementSize, Src1.Low, Src2.Low); return; } // For 256-bit, we need to split up the operation. This is nontrivial. // Let's go the simple route here. Ref ZF, CFInv; const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize); { // Calculate ZF first. 
auto AndLow = _VAnd(OpSize::i128Bit, OpSize::i8Bit, Src2.Low, Src1.Low);
    auto AndHigh = _VAnd(OpSize::i128Bit, OpSize::i8Bit, Src2.High, Src1.High);

    // Logical shift by (element bits - 1) leaves only each element's sign bit, as 0 or 1.
    auto ShiftLow = _VUShrI(OpSize::i128Bit, ElementSize, AndLow, ElementSizeInBits - 1);
    auto ShiftHigh = _VUShrI(OpSize::i128Bit, ElementSize, AndHigh, ElementSizeInBits - 1);
    // Only have the signs now, add it all
    auto AddResult = _VAdd(OpSize::i128Bit, ElementSize, ShiftHigh, ShiftLow);
    Ref AddWide {};
    if (ElementSize == OpSize::i32Bit) {
      AddWide = _VAddV(OpSize::i128Bit, ElementSize, AddResult);
    } else {
      AddWide = _VAddP(OpSize::i128Bit, ElementSize, AddResult, AddResult);
    }

    // ExtGPR will either be [0, 8] or [0, 16] If 0 then set Flag.
    // NOTE(review): with two 128-bit halves the count looks bounded by 8 (32-bit elements) or
    // 4 (64-bit elements), assuming VAddV sums all elements and VAddP adds adjacent pairs - confirm.
    ZF = _VExtractToGPR(OpSize::i128Bit, ElementSize, AddWide, 0);
  }

  {
    // Calculate CF Second
    // Same reduction as above, but over the and-not of the sources instead of the and.
    auto AndLow = _VAndn(OpSize::i128Bit, OpSize::i8Bit, Src2.Low, Src1.Low);
    auto AndHigh = _VAndn(OpSize::i128Bit, OpSize::i8Bit, Src2.High, Src1.High);

    auto ShiftLow = _VUShrI(OpSize::i128Bit, ElementSize, AndLow, ElementSizeInBits - 1);
    auto ShiftHigh = _VUShrI(OpSize::i128Bit, ElementSize, AndHigh, ElementSizeInBits - 1);
    // Only have the signs now, add it all
    auto AddResult = _VAdd(OpSize::i128Bit, ElementSize, ShiftHigh, ShiftLow);
    Ref AddWide {};
    if (ElementSize == OpSize::i32Bit) {
      AddWide = _VAddV(OpSize::i128Bit, ElementSize, AddResult);
    } else {
      AddWide = _VAddP(OpSize::i128Bit, ElementSize, AddResult, AddResult);
    }

    // ExtGPR will either be [0, 8] or [0, 16] If 0 then set Flag.
    auto ExtGPR = _VExtractToGPR(OpSize::i128Bit, ElementSize, AddWide, 0);
    // Collapse the count to 0/1 because it is stored as the inverted carry below.
    CFInv = To01(OpSize::i64Bit, ExtGPR);
  }

  // As in PTest, this sets Z appropriately while zeroing the rest of NZCV.
  SetNZ_ZeroCV(OpSize::i32Bit, ZF);
  SetCFInverted(CFInv);
  ZeroPF_AF();
}

// PTEST/VPTEST on the split 128-bit register representation.
// 128-bit operands go through PTestOpImpl; 256-bit operands are handled per half below.
void OpDispatchBuilder::AVX128_PTest(OpcodeArgs) {
  const auto Size = GetSrcSize(Op);
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
  auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);

  // For 128-bit, use the common path.
  if (Is128Bit) {
    PTestOpImpl(OpSize::i128Bit, Src1.Low, Src2.Low);
    return;
  }

  // For 256-bit, we need to unroll. This is nontrivial.
  Ref Test1Low = _VAnd(OpSize::i128Bit, OpSize::i8Bit, Src1.Low, Src2.Low);
  Ref Test2Low = _VAndn(OpSize::i128Bit, OpSize::i8Bit, Src2.Low, Src1.Low);

  Ref Test1High = _VAnd(OpSize::i128Bit, OpSize::i8Bit, Src1.High, Src2.High);
  Ref Test2High = _VAndn(OpSize::i128Bit, OpSize::i8Bit, Src2.High, Src1.High);

  // Element size must be less than 32-bit for the sign bit tricks.
  // The unsigned max across both halves is nonzero exactly when any bit is set in either half.
  Ref Test1Max = _VUMax(OpSize::i128Bit, OpSize::i16Bit, Test1Low, Test1High);
  Ref Test2Max = _VUMax(OpSize::i128Bit, OpSize::i16Bit, Test2Low, Test2High);

  Ref Test1 = _VUMaxV(OpSize::i128Bit, OpSize::i16Bit, Test1Max);
  Ref Test2 = _VUMaxV(OpSize::i128Bit, OpSize::i16Bit, Test2Max);

  Test1 = _VExtractToGPR(OpSize::i128Bit, OpSize::i16Bit, Test1, 0);
  Test2 = _VExtractToGPR(OpSize::i128Bit, OpSize::i16Bit, Test2, 0);

  Test2 = To01(OpSize::i64Bit, Test2);

  // Careful, these flags are different between {V,}PTEST and VTESTP{S,D}
  // Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
  // the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
  // cleared even if the 16-bit scalars were negative.
SetNZ_ZeroCV(OpSize::i32Bit, Test1); SetCFInverted(Test2); ZeroPF_AF(); } void OpDispatchBuilder::AVX128_VPERMILReg(OpcodeArgs, IR::OpSize ElementSize) { AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), ElementSize, [this](IR::OpSize _ElementSize, Ref Src, Ref Indices) { return VPERMILRegOpImpl(OpSize::i128Bit, _ElementSize, Src, Indices); }); } void OpDispatchBuilder::AVX128_VPERMD(OpcodeArgs) { // Only 256-bit auto Indices = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, true); auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, true); auto DoPerm = [this](RefPair Src, Ref Indices, Ref IndexMask, Ref AddVector) { Ref FinalIndices = VPERMDIndices(OpSize::i128Bit, Indices, IndexMask, AddVector); return _VTBL2(OpSize::i128Bit, Src.Low, Src.High, FinalIndices); }; RefPair Result {}; Ref IndexMask = _VectorImm(OpSize::i128Bit, OpSize::i32Bit, 0b111); Ref AddConst = Constant(0x03020100); Ref Repeating3210 = _VDupFromGPR(OpSize::i128Bit, OpSize::i32Bit, AddConst); Result.Low = DoPerm(Src, Indices.Low, IndexMask, Repeating3210); Result.High = DoPerm(Src, Indices.High, IndexMask, Repeating3210); AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } void OpDispatchBuilder::AVX128_VPCLMULQDQ(OpcodeArgs) { if (!CTX->HostFeatures.SupportsPMULL_128Bit) { UnimplementedOp(Op); return; } const auto Selector = static_cast(Op->Src[2].Literal()); AVX128_VectorBinaryImpl(Op, OpSizeFromSrc(Op), OpSize::iInvalid, [this, Selector](IR::OpSize, Ref Src1, Ref Src2) { return _PCLMUL(OpSize::i128Bit, Src1, Src2, Selector & 0b1'0001); }); } // FMA differences between AArch64 and x86 make this really confusing to remember how things match. // Here's a little guide for remembering how these instructions related across the architectures. // ///< AArch64 Vector FMA behaviour // FMLA vd, vn, vm // - vd = (vn * vm) + vd // FMLS vd, vn, vm // - vd = (-vn * vm) + vd // // SVE ONLY! No FNMLA or FNMLS variants until SVE! 
// FMLA zda, pg/m, zn, zm - Ignore predicate here // - zda = (zn * zm) + zda // FMLS zda, pg/m, zn, zm - Ignore predicate here // - zda = (-zn * zm) + zda // FNMLA zda, pg/m, zn, zm - Ignore predicate here // - zda = (-zn * zm) - zda // FNMLS zda, pg/m, zn, zm - Ignore predicate here // - zda = (zn * zm) - zda // ///< AArch64 Scalar FMA behaviour (FMA4 versions!) // All variants support 16-bit, 32-bit, and 64-bit. // FMADD d, n, m, a // - d = (n * m) + a // FMSUB d, n, m, a // - d = (-n * m) + a // FNMADD d, n, m, a // - d = (-n * m) - a // FNMSUB d, n, m, a // - d = (n * m) - a // ///< x86 FMA behaviour // ## Packed variants // - VFMADD{PD,PS}suffix src1, src2, src3/mem // - 132 - src1 = (src1 * src3) + src2 // - 213 - src1 = (src2 * src1) + src3 // - 231 - src1 = (src2 * src3) + src1 // ^ Matches ARM FMLA // // - VFMSUB{PD,PS}suffix src1, src2, src3/mem // - 132 - src1 = (src1 * src3) - src2 // - 213 - src1 = (src2 * src1) - src3 // - 231 - src1 = (src2 * src3) - src1 // ^ Matches ARM FMLA with addend negated first // ^ Or just SVE FNMLS // ^ or scalar FNMSUB // // - VFNMADD{PD,PS}suffix src1, src2, src3/mem // - 132 - src1 = (-src1 * src3) + src2 // - 213 - src1 = (-src2 * src1) + src3 // - 231 - src1 = (-src2 * src3) + src1 // ^ Matches ARM FMLS behaviour! (REALLY CONFUSINGLY NAMED!) // ^ Or Scalar FMSUB // // - VFNMSUB{PD,PS}suffix src1, src2, src3/mem // - 132 - src1 = (-src1 * src3) - src2 // - 213 - src1 = (-src2 * src1) - src3 // - 231 - src1 = (-src2 * src3) - src1 // ^ Matches ARM FMLS behaviour with addend negated first! (REALLY CONFUSINGLY NAMED!) 
//     ^ Or just SVE FNMLA
//     ^ Or scalar FNMADD
//
// - VFMADDSUB{PD,PS}suffix src1, src2, src3/mem
//   - 132 - src1.odd  = (src1.odd * src3.odd) + src2.odd
//         - src1.even = (src1.even * src3.even) - src2.even
//   - 213 - src1.odd  = (src2.odd * src1.odd) + src3.odd
//         - src1.even = (src2.even * src1.even) - src3.even
//   - 231 - src1.odd  = (src2.odd * src3.odd) + src1.odd
//         - src1.even = (src2.even * src3.even) - src1.even
//     ^ Matches ARM FMLA behaviour with addend.even negated first!
//
// - VFMSUBADD{PD,PS}suffix src1, src2, src3/mem
//   - 132 - src1.odd  = (src1.odd * src3.odd) - src2.odd
//         - src1.even = (src1.even * src3.even) + src2.even
//   - 213 - src1.odd  = (src2.odd * src1.odd) - src3.odd
//         - src1.even = (src2.even * src1.even) + src3.even
//   - 231 - src1.odd  = (src2.odd * src3.odd) - src1.odd
//         - src1.even = (src2.even * src3.even) + src1.even
//     ^ Matches ARM FMLA behaviour with addend.odd negated first!
//
// As shown, only the 231 suffixed instructions match AArch64 behaviour.
// FEX will insert moves to transpose the vectors to match AArch64 behaviour for 132 and 213 variants.

// Packed FMA for the split 128-bit register representation.
// Src1Idx, Src2Idx and AddendIdx are 1-based positions into {Dest, Src[0], Src[1]} naming the two
// multiplicands and the addend. IROp selects which fused operation is emitted.
void OpDispatchBuilder::AVX128_VFMAImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
  const auto Size = GetDstSize(Op);
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
  // The W flag selects double-precision elements, otherwise single-precision.
  const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
  auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);

  // Operand table indexed by (Idx - 1).
  RefPair Sources[3] = {Dest, Src1, Src2};

  RefPair Result {};
  // NOTE(review): DeriveOp presumably emits the _VFMLA-shaped node under the opcode given by IROp - confirm against the macro.
  DeriveOp(Result_Low, IROp,
           _VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].Low, Sources[Src2Idx - 1].Low, Sources[AddendIdx - 1].Low));
  Result.Low = Result_Low;
  if (Is128Bit) {
    // 128-bit forms clear the upper half of the destination.
    Result.High = LoadZeroVector(OpSize::i128Bit);
  } else {
    DeriveOp(Result_High, IROp,
             _VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].High, Sources[Src2Idx - 1].High, Sources[AddendIdx - 1].High));
    Result.High = Result_High;
  }

  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}

// Scalar FMA: only the low halves of the operands are loaded and the upper half of the result is zero-extended.
// The index parameters follow the same 1-based {Dest, Src[0], Src[1]} convention as AVX128_VFMAImpl.
void OpDispatchBuilder::AVX128_VFMAScalarImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
  const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, false).Low;
  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false).Low;
  Ref Src2 {};
  if (Op->Src[1].IsGPR()) {
    Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false).Low;
  } else {
    // Memory operand: load only a single element's worth of data.
    Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], ElementSize, Op->Flags);
  }

  Ref Sources[3] = {Dest, Src1, Src2};

  // Dest is additionally passed as the first vector operand of the scalar-insert op.
  DeriveOp(Result_Low, IROp,
           _VFMLAScalarInsert(OpSize::i128Bit, ElementSize, Dest, Sources[Src1Idx - 1], Sources[Src2Idx - 1], Sources[AddendIdx - 1]));
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(Result_Low));
}

void OpDispatchBuilder::AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
  const auto Size = GetDstSize(Op);
  const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
  const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ?
OpSize::i64Bit : OpSize::i32Bit;

  auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
  auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);

  // Operand table indexed by (Idx - 1).
  RefPair Sources[3] = {
    Dest,
    Src1,
    Src2,
  };

  RefPair Result {};
  // Constant XORed into the addend before the fused multiply-add; which named constant is
  // used depends on the add/sub direction and the element width.
  Ref ConstantEOR {};
  if (AddSub) {
    ConstantEOR = LoadAndCacheNamedVectorConstant(
      OpSize::i128Bit, ElementSize == OpSize::i32Bit ? NAMED_VECTOR_PADDSUBPS_INVERT : NAMED_VECTOR_PADDSUBPD_INVERT);
  } else {
    ConstantEOR = LoadAndCacheNamedVectorConstant(
      OpSize::i128Bit, ElementSize == OpSize::i32Bit ? NAMED_VECTOR_PSUBADDPS_INVERT : NAMED_VECTOR_PSUBADDPD_INVERT);
  }
  auto InvertedSourceLow = _VXor(OpSize::i128Bit, ElementSize, Sources[AddendIdx - 1].Low, ConstantEOR);

  Result.Low = _VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].Low, Sources[Src2Idx - 1].Low, InvertedSourceLow);
  if (Is128Bit) {
    Result.High = LoadZeroVector(OpSize::i128Bit);
  } else {
    auto InvertedSourceHigh = _VXor(OpSize::i128Bit, ElementSize, Sources[AddendIdx - 1].High, ConstantEOR);
    Result.High = _VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].High, Sources[Src2Idx - 1].High, InvertedSourceHigh);
  }
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}

// Masked gather for the split 128-bit register representation.
//  Size            - full operation width (128-bit or 256-bit).
//  ElementLoadSize - width of each loaded data element.
//  AddrElementSize - width of each index element in the VSIB vector (32-bit or 64-bit).
//  Dest / Mask     - passed to the masked gather op as its incoming destination and mask operands.
//  VSIB            - base, displacement, scale and index vectors; taken by value and modified locally.
// Returns the gathered low/high halves.
OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpcodeArgs, OpSize Size, OpSize ElementLoadSize, OpSize AddrElementSize,
                                                                  RefPair Dest, RefPair Mask, RefVSIB VSIB) {
  LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");
  const auto Is128Bit = Size == OpSize::i128Bit;

  ///< BaseAddr doesn't need to exist, calculate that here.
  Ref BaseAddr = VSIB.BaseAddr;
  if (BaseAddr && VSIB.Displacement) {
    BaseAddr = Add(OpSize::i64Bit, BaseAddr, VSIB.Displacement);
  } else if (VSIB.Displacement) {
    BaseAddr = Constant(VSIB.Displacement);
  } else if (!BaseAddr) {
    BaseAddr = Invalid();
  }

  if (CTX->HostFeatures.SupportsSVE128) {
    if (ElementLoadSize == OpSize::i64Bit && AddrElementSize == OpSize::i32Bit) {
      // In the case that FEX is loading double the amount of data than the number of address bits then we can optimize this case.
      // For 256-bits of data we need to sign extend all four 32-bit address elements to be 64-bit.
      // For 128-bits of data we only need to sign extend the lower two 32-bit address elements.
      LOGMAN_THROW_A_FMT(VSIB.High == Invalid(), "Need to not have a high VSIB source");

      // The high half must be derived first: it reads VSIB.Low, which is overwritten right after.
      if (!Is128Bit) {
        VSIB.High = _VSSHLL2(OpSize::i128Bit, OpSize::i32Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));
      }
      VSIB.Low = _VSSHLL(OpSize::i128Bit, OpSize::i32Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));

      ///< Set the scale to one now that it has been prescaled as well.
      VSIB.Scale = 1;

      // Set the address element size to 64-bit now that the elements are extended.
      AddrElementSize = OpSize::i64Bit;
    } else if (ElementLoadSize == OpSize::i64Bit && AddrElementSize == OpSize::i64Bit && (VSIB.Scale == 2 || VSIB.Scale == 4)) {
      // SVE gather instructions don't support scaling their vector elements by anything other than 1 or the address element size.
      // Pre-scale 64-bit addresses in the case that scale doesn't match in-order to hit SVE code paths more frequently.
      // Only hit this path if the host supports SVE. Otherwise it's a degradation for the ASIMD codepath.
      VSIB.Low = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));
      if (!Is128Bit) {
        VSIB.High = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.High, FEXCore::ilog2(VSIB.Scale));
      }
      ///< Set the scale to one now that it has been prescaled.
      VSIB.Scale = 1;
    }
  }

  const auto GPRSize = GetGPROpSize();
  // The address-size override halves the effective address width.
  auto AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize;

  RefPair Result {};
  ///< Calculate the low-half.
  Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,
                                        AddrElementSize, VSIB.Scale, 0, 0, AddrSize);

  if (Is128Bit) {
    Result.High = LoadZeroVector(OpSize::i128Bit);
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
      // Only loads two 32-bit elements in to the lower 64-bits of the first destination.
      // Bits [255:64] all become zero.
      Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High);
    }
  } else {
    RefPair AddrAddressing {};

    Ref DestReg = Dest.High;
    Ref MaskReg = Mask.High;
    uint8_t IndexElementOffset {};
    uint8_t DataElementOffset {};
    if (AddrElementSize == ElementLoadSize) {
      // If the address size matches the loading element size then it will be fetching at the same rate between low and high
      AddrAddressing.Low = VSIB.High;
      AddrAddressing.High = Invalid();
    } else if (AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i64Bit) {
      // If the address element size is half the size of the Element load size then we need to start fetching half-way through the low register.
      AddrAddressing.Low = VSIB.Low;
      AddrAddressing.High = VSIB.High;
      IndexElementOffset = IR::NumElements(OpSize::i128Bit, AddrElementSize) / 2;
    } else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      AddrAddressing.Low = VSIB.High;
      AddrAddressing.High = Invalid();
      DestReg = Result.Low; ///< Start mixing with the low register.
      MaskReg = Mask.Low;   ///< Mask starts with the low mask here.
      IndexElementOffset = 0;
      DataElementOffset = IR::NumElements(OpSize::i128Bit, ElementLoadSize) / 2;
    }

    ///< Calculate the high-half.
    auto ResultHigh = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, DestReg, MaskReg, BaseAddr, AddrAddressing.Low,
                                               AddrAddressing.High, AddrElementSize, VSIB.Scale, DataElementOffset, IndexElementOffset, AddrSize);

    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // If we only fetched 128-bits worth of data then the upper-result is all zero.
      Result = AVX128_Zext(ResultHigh);
    } else {
      Result.High = ResultHigh;
    }
  }

  return Result;
}

OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherQPSImpl(OpcodeArgs, Ref Dest, Ref Mask, RefVSIB VSIB) {

  ///< BaseAddr doesn't need to exist, calculate that here.
  Ref BaseAddr = VSIB.BaseAddr;
  if (BaseAddr && VSIB.Displacement) {
    BaseAddr = Add(OpSize::i64Bit, BaseAddr, VSIB.Displacement);
  } else if (VSIB.Displacement) {
    BaseAddr = Constant(VSIB.Displacement);
  } else if (!BaseAddr) {
    BaseAddr = Invalid();
  }

  bool NeedsSVEScale = (VSIB.Scale == 2 || VSIB.Scale == 8) || (BaseAddr == Invalid() && VSIB.Scale != 1);

  if (CTX->HostFeatures.SupportsSVE128 && NeedsSVEScale) {
    // SVE gather instructions don't support scaling their vector elements by anything other than 1 or the address element size.
    // Pre-scale 64-bit addresses in the case that scale doesn't match in-order to hit SVE code paths more frequently.
    // Only hit this path if the host supports SVE. Otherwise it's a degradation for the ASIMD codepath.
    VSIB.Low = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));
    if (VSIB.High != Invalid()) {
      VSIB.High = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.High, FEXCore::ilog2(VSIB.Scale));
    }
    ///< Set the scale to one now that it has been prescaled.
    VSIB.Scale = 1;
  }

  RefPair Result {};

  const auto GPRSize = GetGPROpSize();
  auto AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize;

  ///< Calculate the low-half.
Result.Low = _VLoadVectorGatherMaskedQPS(OpSize::i128Bit, OpSize::i32Bit, Dest, Mask, BaseAddr, VSIB.Low, VSIB.High, VSIB.Scale, AddrSize);
  Result.High = LoadZeroVector(OpSize::i128Bit);
  if (VSIB.High == Invalid()) {
    // Special case for only loading two floats.
    // The upper 64-bits of the lower lane also gets zero.
    Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High);
  }

  return Result;
}

// Front end for the AVX gather instructions.
// AddrElementSize is the width of each VSIB index element; the data element width comes from the W flag.
// Dispatches to one of three strategies, stores the result, then clears the mask register.
void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs, OpSize AddrElementSize) {

  const auto Size = OpSizeFromDst(Op);
  const auto Is128Bit = Size == OpSize::i128Bit;

  ///< Element size is determined by W flag.
  const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  // We only need the high address register if the number of data elements is more than what the low half can consume.
  // But also the number of address elements is clamped by the destination size as well.
  const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
  const size_t NumAddrElementBytes = std::min(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
  const bool NeedsHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);

  auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
  auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, NeedsHighAddrBytes);
  auto Mask = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);

  // True when the scale is 2 or 8, or when there is no base register and the scale is not 1.
  bool NeedsSVEScale = (VSIB.Scale == 2 || VSIB.Scale == 8) || (VSIB.BaseAddr == Invalid() && VSIB.Scale != 1);

  const bool NeedsExplicitSVEPath =
    CTX->HostFeatures.SupportsSVE128 && AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i32Bit && NeedsSVEScale;

  RefPair Result {};
  if (NeedsExplicitSVEPath) {
    // Special case for VGATHERDPS/VPGATHERDD (32-bit addresses loading 32-bit elements) that can't use the SVE codepath.
    // The problem is due to the scale not matching SVE limitations, we need to prescale the addresses to be 64-bit.
    // Widens one 128-bit vector of 32-bit indices into two vectors of pre-scaled 64-bit indices.
    auto ScaleVSIBHalf = [this](Ref VSIB, Ref BaseAddr, int32_t Displacement, uint8_t Scale) -> RefVSIB {
      RefVSIB Result {};
      Result.High = _VSSHLL2(OpSize::i128Bit, OpSize::i32Bit, VSIB, FEXCore::ilog2(Scale));
      Result.Low = _VSSHLL(OpSize::i128Bit, OpSize::i32Bit, VSIB, FEXCore::ilog2(Scale));

      Result.Displacement = Displacement;
      Result.BaseAddr = BaseAddr;

      ///< Set the scale to one now that it has been prescaled as well.
      Result.Scale = 1;
      return Result;
    };

    RefVSIB VSIBLow = ScaleVSIBHalf(VSIB.Low, VSIB.BaseAddr, VSIB.Displacement, VSIB.Scale);
    RefVSIB VSIBHigh {};

    if (NeedsHighAddrBytes) {
      VSIBHigh = ScaleVSIBHalf(VSIB.High, VSIB.BaseAddr, VSIB.Displacement, VSIB.Scale);
    }

    ///< AddressElementSize is now OpSize::i64Bit
    Result = AVX128_VPGatherQPSImpl(Op, Dest.Low, Mask.Low, VSIBLow);
    if (NeedsHighAddrBytes) {
      // The second gather fills the upper half; only its low result is meaningful.
      auto Res = AVX128_VPGatherQPSImpl(Op, Dest.High, Mask.High, VSIBHigh);
      Result.High = Res.Low;
    }
  } else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
    Result = AVX128_VPGatherQPSImpl(Op, Dest.Low, Mask.Low, VSIB);
  } else {
    Result = AVX128_VPGatherImpl(Op, Size, ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);
  }
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);

  ///< Assume non-faulting behaviour and clear the mask register.
  RefPair ZeroPair {};
  ZeroPair.Low = LoadZeroVector(OpSize::i128Bit);
  ZeroPair.High = ZeroPair.Low;
  AVX128_StoreResult_WithOpSize(Op, Op->Src[1], ZeroPair);
}

// Half-precision to single-precision conversion.
// The source is always half the width of the destination.
void OpDispatchBuilder::AVX128_VCVTPH2PS(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(DstSize) / 2);
  const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
  const auto Is128BitDst = DstSize == OpSize::i128Bit;

  RefPair Src {};
  if (Op->Src[0].IsGPR()) {
    Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128BitSrc);
  } else {
    // In the event that a memory operand is used as the source operand,
    // the access width will always be half the size of the destination vector width
    // (i.e. 128-bit vector -> 64-bit mem, 256-bit vector -> 128-bit mem)
    Src.Low = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);
  }

  RefPair Result {};
  Result.Low = _Vector_FToF(OpSize::i128Bit, OpSize::i32Bit, Src.Low, OpSize::i16Bit);

  if (Is128BitSrc) {
    // A 128-bit source also yields an upper result half, converted from the same source register.
    Result.High = _VFCVTL2(OpSize::i128Bit, OpSize::i16Bit, Src.Low);
  }

  if (Is128BitDst) {
    Result = AVX128_Zext(Result.Low);
  }

  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}

void OpDispatchBuilder::AVX128_VCVTPS2PH(OpcodeArgs) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto Is128BitSrc = SrcSize == OpSize::i128Bit;
  const auto StoreSize = Op->Dest.IsGPR() ? OpSize::i128Bit : IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2);

  const auto Imm8 = Op->Src[1].Literal();
  const auto UseMXCSR = (Imm8 & 0b100) != 0;

  auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128BitSrc);

  RefPair Result {};

  Ref OldFPCR {};
  if (!UseMXCSR) {
    // No ARM float conversion instructions allow passing in
    // a rounding mode as an immediate. All of them depend on
    // the RM field in the FPCR. And so! We have to do some ugly
    // rounding mode shuffling.
const auto NewRMode = Imm8 & 0b11;
    OldFPCR = _PushRoundingMode(NewRMode);
  }

  Result.Low = _Vector_FToF(OpSize::i128Bit, OpSize::i16Bit, Src.Low, OpSize::i32Bit);
  if (!Is128BitSrc) {
    // 256-bit source: narrow the high half as well and merge it into the same result register.
    Result.Low = _VFCVTN2(OpSize::i128Bit, OpSize::i32Bit, Result.Low, Src.High);
  }

  if (!UseMXCSR) {
    // Restore the rounding mode that was saved before the conversion.
    _PopRoundingMode(OldFPCR);
  }

  // We need to eliminate upper junk if we're storing into a register with
  // a 256-bit source (VCVTPS2PH's destination for registers is an XMM).
  if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) {
    Result = AVX128_Zext(Result.Low);
  }

  if (!Op->Dest.IsGPR()) {
    StoreResultFPR_WithOpSize(Op, Op->Dest, Result.Low, StoreSize);
  } else {
    AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
  }
}

} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/BaseTables.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Interface/Core/OpcodeDispatcher.h"

namespace FEXCore::IR {
// Dispatch entries for the primary (single-byte) opcode map.
// NOTE(review): each entry looks like {first opcode, number of consecutive opcodes sharing the handler, handler}
// (e.g. 0x50 with a count of 8) - confirm against the DispatchTableEntry definition.
constexpr inline DispatchTableEntry OpDispatch_BaseOpTable[] = {
  // Instructions
  {0x00, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ALUOp, FEXCore::IR::IROps::OP_ADD, FEXCore::IR::IROps::OP_ATOMICFETCHADD, 0>},

  {0x08, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ALUOp, FEXCore::IR::IROps::OP_OR, FEXCore::IR::IROps::OP_ATOMICFETCHOR, 0>},

  {0x10, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ADCOp, 0>},

  {0x18, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SBBOp, 0>},

  {0x20, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ALUOp, FEXCore::IR::IROps::OP_ANDWITHFLAGS, FEXCore::IR::IROps::OP_ATOMICFETCHAND, 0>},

  {0x28, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ALUOp, FEXCore::IR::IROps::OP_SUB, FEXCore::IR::IROps::OP_ATOMICFETCHSUB, 0>},

  {0x30, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ALUOp, FEXCore::IR::IROps::OP_XOR, FEXCore::IR::IROps::OP_ATOMICFETCHXOR, 0>},

  {0x38, 6, &OpDispatchBuilder::Bind<&OpDispatchBuilder::CMPOp, 0>},
  {0x50, 8, &OpDispatchBuilder::PUSHREGOp},
  {0x58, 8, &OpDispatchBuilder::POPOp},
  {0x68, 1, &OpDispatchBuilder::PUSHOp},
  {0x69, 1, &OpDispatchBuilder::IMUL2SrcOp},
  {0x6A, 1, &OpDispatchBuilder::PUSHOp},
  {0x6B, 1, &OpDispatchBuilder::IMUL2SrcOp},
  {0x6C, 4, &OpDispatchBuilder::PermissionRestrictedOp},

  {0x70, 16, &OpDispatchBuilder::CondJUMPOp},
  {0x84, 2, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 0>},
  {0x86, 2, &OpDispatchBuilder::XCHGOp},
  {0x88, 4, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVGPROp, 0>},

  {0x8C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVSegOp, false>},
  {0x8D, 1, &OpDispatchBuilder::LEAOp},
  {0x8E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVSegOp, true>},
  {0x8F, 1, &OpDispatchBuilder::POPOp},
  {0x90, 8, &OpDispatchBuilder::XCHGOp},

  {0x98, 1, &OpDispatchBuilder::CDQOp},
  {0x99, 1, &OpDispatchBuilder::CQOOp},
  {0x9B, 1, &OpDispatchBuilder::NOPOp},
  {0x9C, 1, &OpDispatchBuilder::PUSHFOp},
  {0x9D, 1, &OpDispatchBuilder::POPFOp},
  {0x9E, 1, &OpDispatchBuilder::SAHFOp},
  {0x9F, 1, &OpDispatchBuilder::LAHFOp},
  {0xA4, 2, &OpDispatchBuilder::MOVSOp},

  {0xA6, 2, &OpDispatchBuilder::CMPSOp},
  {0xA8, 2, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 0>},

  {0xAA, 2, &OpDispatchBuilder::STOSOp},
  {0xAC, 2, &OpDispatchBuilder::LODSOp},
  {0xAE, 2, &OpDispatchBuilder::SCASOp},

  {0xB0, 16, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVGPROp, 0>},

  {0xC2, 2, &OpDispatchBuilder::RETOp},
  {0xC8, 1, &OpDispatchBuilder::EnterOp},
  {0xC9, 1, &OpDispatchBuilder::LEAVEOp},
  {0xCA, 2, &OpDispatchBuilder::RETFARIndirectOp},
  {0xCC, 2, &OpDispatchBuilder::INTOp},
  {0xCF, 1, &OpDispatchBuilder::IRETOp},

  {0xD7, 2, &OpDispatchBuilder::XLATOp},
  {0xE0, 3, &OpDispatchBuilder::LoopOp},
  {0xE3, 1, &OpDispatchBuilder::CondJUMPRCXOp},
  {0xE4, 4, &OpDispatchBuilder::PermissionRestrictedOp},
  {0xE8, 1, &OpDispatchBuilder::CALLOp},
  {0xE9, 1, &OpDispatchBuilder::JUMPOp},
  {0xEB, 1, &OpDispatchBuilder::JUMPOp},
  {0xEC, 4, &OpDispatchBuilder::PermissionRestrictedOp},
  {0xF1, 1, &OpDispatchBuilder::INTOp},
  {0xF4, 1, &OpDispatchBuilder::INTOp},

  {0xF5, 1, &OpDispatchBuilder::FLAGControlOp},
  {0xF8, 2, &OpDispatchBuilder::FLAGControlOp},
  {0xFA, 2, &OpDispatchBuilder::PermissionRestrictedOp},
  {0xFC, 2, &OpDispatchBuilder::FLAGControlOp},
};

} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-to-ir, opcodes|dispatcher-implementations
desc: Handles x86/64 Crypto instructions to IR
$end_info$
*/

// NOTE(review): the two bare #include directives below have lost their targets in this copy
// of the file - restore them from the upstream source before building.
#include "Interface/Core/X86Tables/X86Tables.h"

#include
#include "Interface/Core/OpcodeDispatcher.h"

#include

namespace FEXCore::IR {
class OrderedNode;

// Common parameter list shared by every opcode handler in this file.
#define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op

void OpDispatchBuilder::SHA1NEXTEOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsSHA) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  // ARMv8 SHA1 extension provides a `SHA1H` instruction which does a fixed rotate by 30.
  // This only operates on element 0 rather than element 3. We don't have the luxury of rewriting the x86 SHA algorithm to take advantage of this.
  // Move the element to zero, rotate, and then move back (Using duplicates).
  // Saves one instruction versus that path that doesn't support SHA extension.
auto Duplicated = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Dest, 3); auto Sha1HRotated = _VSha1H(Duplicated); auto RotatedNode = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Sha1HRotated, 0); auto Tmp = _VAdd(OpSize::i128Bit, OpSize::i32Bit, Src, RotatedNode); auto Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 3, 3, Src, Tmp); StoreResultFPR(Op, Result); } void OpDispatchBuilder::SHA1MSG1Op(OpcodeArgs) { if (!CTX->HostFeatures.SupportsSHA) { UnimplementedOp(Op); return; } Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref NewVec = _VExtr(OpSize::i128Bit, OpSize::i64Bit, Dest, Src, 1); // [W0, W1, W2, W3] ^ [W2, W3, W4, W5] Ref Result = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, NewVec); StoreResultFPR(Op, Result); } void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) { if (!CTX->HostFeatures.SupportsSHA) { UnimplementedOp(Op); return; } Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); // ARM SHA1 mostly matches x86 semantics, except the input and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0. 
auto Src1 = SHADataShuffle(Dest);
auto Src2 = SHADataShuffle(Src);

// The result is swizzled differently than expected
auto Result = SHADataShuffle(_VSha1SU1(Src1, Src2));

StoreResultFPR(Op, Result);
}

// SHA1RNDS4: uses the low two bits of the immediate to select both a named round-constant vector
// (NAMED_VECTOR_SHA1RNDS_K0..K3) and the IR round op (_VSha1C / _VSha1P / _VSha1M).
// Emits UnimplementedOp when the host does not report SHA support.
void OpDispatchBuilder::SHA1RNDS4Op(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsSHA) {
    UnimplementedOp(Op);
    return;
  }

  // Only bits [1:0] of the immediate take part in the selection below.
  const uint64_t Imm8 = Op->Src[1].Literal() & 0b11;

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  Ref Result {};
  Ref ConstantVector {};

  // Pick the constant vector that gets added to the shuffled source.
  switch (Imm8) {
  case 0: ConstantVector = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K0); break;
  case 1: ConstantVector = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K1); break;
  case 2: ConstantVector = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K2); break;
  case 3: ConstantVector = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K3); break;
  }

  const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);

  // Both inputs go through SHADataShuffle before the round op, and the result is shuffled back afterwards.
  Ref Src1 = SHADataShuffle(Dest);
  Ref Src2 = SHADataShuffle(Src);
  Src2 = _VAdd(OpSize::i128Bit, OpSize::i32Bit, Src2, ConstantVector);

  // Immediates 1 and 3 share the same round op; only their constant vector differs.
  switch (Imm8) {
  case 0: Result = SHADataShuffle(_VSha1C(Src1, ZeroRegister, Src2)); break;
  case 2: Result = SHADataShuffle(_VSha1M(Src1, ZeroRegister, Src2)); break;
  case 1:
  case 3: Result = SHADataShuffle(_VSha1P(Src1, ZeroRegister, Src2)); break;
  }

  StoreResultFPR(Op, Result);
}

// SHA256MSG1: passes the destination and source vectors straight to _VSha256U0 and stores the result.
// Emits UnimplementedOp when the host does not report SHA support.
void OpDispatchBuilder::SHA256MSG1Op(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsSHA) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  auto Result = _VSha256U0(Dest, Src);

  StoreResultFPR(Op, Result);
}

// SHA256MSG2: rearranges the destination (extract + element duplicate + zip with the source) into the
// two operands handed to _VSha256U1, then stores the result.
// Emits UnimplementedOp when the host does not report SHA support.
void OpDispatchBuilder::SHA256MSG2Op(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsSHA) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  // First operand: Dest extracted against itself at 32-bit element index 3.
  auto Src1 = _VExtr(OpSize::i128Bit, OpSize::i32Bit, Dest, Dest, 3);
  // Second operand: 32-bit element 3 of Dest duplicated, then zipped (64-bit elements) with Src.
  auto DupDst = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Dest, 3);
  auto Src2 = _VZip2(OpSize::i128Bit, OpSize::i64Bit, DupDst, Src);

  auto Result = _VSha256U1(Src1, Src2);

  StoreResultFPR(Op, Result);
}

// SHA256RNDS2: shuffles Dest/Src into `abcd`/`efgh` vectors, duplicates the low 64 bits of XMM0 as the
// key, runs _VSha256H and _VSha256H2, and shuffles the pair of results back before storing.
// Emits UnimplementedOp when the host does not report SHA support.
void OpDispatchBuilder::SHA256RNDS2Op(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsSHA) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  // Hardcoded to XMM0
  auto XMM0 = LoadXMMRegister(0);

  auto shuffle_abcd = [this](Ref Src1, Ref Src2) -> Ref {
    // Generates a suitable SHA256 `abcd` configuration from x86 format.
    auto Tmp = _VZip2(OpSize::i128Bit, OpSize::i64Bit, Src2, Src1);
    return _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
  };

  auto shuffle_efgh = [this](Ref Src1, Ref Src2) -> Ref {
    // Generates a suitable SHA256 `efgh` configuration from x86 format.
    auto Tmp = _VZip(OpSize::i128Bit, OpSize::i64Bit, Src2, Src1);
    return _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
  };

  auto ABCD = shuffle_abcd(Dest, Src);
  auto EFGH = shuffle_efgh(Dest, Src);

  // x86 uses only the bottom 64-bits of the key, so duplicate to match ARM64 semantics.
  auto Key = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, XMM0, 0);

  auto A = _VSha256H(ABCD, EFGH, Key);
  auto B = _VSha256H2(EFGH, ABCD, Key);

  // The same abcd shuffle is reused to put the two halves back into the stored layout.
  auto Result = shuffle_abcd(A, B);

  StoreResultFPR(Op, Result);
}

// AESIMC: applies _VAESImc to the source operand. Emits UnimplementedOp without host AES support.
void OpDispatchBuilder::AESImcOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Result = _VAESImc(Src);
  StoreResultFPR(Op, Result);
}

// AESENC: 128-bit _VAESEnc with the destination as state, the source as key, and a zero vector as the
// extra operand. Emits UnimplementedOp without host AES support.
void OpDispatchBuilder::AESEncOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Result = _VAESEnc(OpSize::i128Bit, Dest, Src, LoadZeroVector(OpSize::i128Bit));
  StoreResultFPR(Op, Result);
}

// VAESENC: three-operand form; state and key come from Src[0] and Src[1]. Only the 128-bit size is
// accepted (asserted below).
// NOTE(review): unlike AESEncOp there is no SupportsAES guard here — presumably the handler is only
// installed when AES is available; confirm at the table-install site.
void OpDispatchBuilder::VAESEncOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto Is128Bit = DstSize == OpSize::i128Bit;

  // TODO: Handle 256-bit VAESENC.
  LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENC unimplemented");

  Ref State = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Key = LoadSourceFPR(Op, Op->Src[1], Op->Flags);
  Ref Result = _VAESEnc(DstSize, State, Key, LoadZeroVector(DstSize));

  StoreResultFPR(Op, Result);
}

// AESENCLAST: 128-bit _VAESEncLast with the destination as state, the source as key, and a zero vector
// as the extra operand. Emits UnimplementedOp without host AES support.
void OpDispatchBuilder::AESEncLastOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Result = _VAESEncLast(OpSize::i128Bit, Dest, Src, LoadZeroVector(OpSize::i128Bit));
  StoreResultFPR(Op, Result);
}

// VAESENCLAST: three-operand form; only the 128-bit size is accepted (asserted below).
// NOTE(review): no SupportsAES guard here, unlike AESEncLastOp — confirm this is gated elsewhere.
void OpDispatchBuilder::VAESEncLastOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto Is128Bit = DstSize == OpSize::i128Bit;

  // TODO: Handle 256-bit VAESENCLAST.
  LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESENCLAST unimplemented");

  Ref State = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Key = LoadSourceFPR(Op, Op->Src[1], Op->Flags);
  Ref Result = _VAESEncLast(DstSize, State, Key, LoadZeroVector(DstSize));

  StoreResultFPR(Op, Result);
}

// AESDEC: 128-bit _VAESDec with the destination as state, the source as key, and a zero vector as the
// extra operand. Emits UnimplementedOp without host AES support.
void OpDispatchBuilder::AESDecOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Result = _VAESDec(OpSize::i128Bit, Dest, Src, LoadZeroVector(OpSize::i128Bit));
  StoreResultFPR(Op, Result);
}

// VAESDEC: three-operand form; only the 128-bit size is accepted (asserted below).
// NOTE(review): no SupportsAES guard here, unlike AESDecOp — confirm this is gated elsewhere.
void OpDispatchBuilder::VAESDecOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto Is128Bit = DstSize == OpSize::i128Bit;

  // TODO: Handle 256-bit VAESDEC.
  LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDEC unimplemented");

  Ref State = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Key = LoadSourceFPR(Op, Op->Src[1], Op->Flags);
  Ref Result = _VAESDec(DstSize, State, Key, LoadZeroVector(DstSize));

  StoreResultFPR(Op, Result);
}

// AESDECLAST: 128-bit _VAESDecLast with the destination as state, the source as key, and a zero vector
// as the extra operand. Emits UnimplementedOp without host AES support.
void OpDispatchBuilder::AESDecLastOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Result = _VAESDecLast(OpSize::i128Bit, Dest, Src, LoadZeroVector(OpSize::i128Bit));
  StoreResultFPR(Op, Result);
}

// VAESDECLAST: three-operand form; only the 128-bit size is accepted (asserted below).
// NOTE(review): no SupportsAES guard here, unlike AESDecLastOp — confirm this is gated elsewhere.
void OpDispatchBuilder::VAESDecLastOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto Is128Bit = DstSize == OpSize::i128Bit;

  // TODO: Handle 256-bit VAESDECLAST.
  LOGMAN_THROW_A_FMT(Is128Bit, "256-bit VAESDECLAST unimplemented");

  Ref State = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Key = LoadSourceFPR(Op, Op->Src[1], Op->Flags);
  Ref Result = _VAESDecLast(DstSize, State, Key, LoadZeroVector(DstSize));

  StoreResultFPR(Op, Result);
}

// Shared body for AESKEYGENASSIST: builds the _VAESKeyGenAssist node from the source vector, a named
// swizzle constant, a zero vector and the immediate (RCON). Returns the node; the caller stores it.
// Performs no host-feature check itself.
Ref OpDispatchBuilder::AESKeyGenAssistImpl(OpcodeArgs) {
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  const uint64_t RCON = Op->Src[1].Literal();

  auto KeyGenSwizzle = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NAMED_VECTOR_AESKEYGENASSIST_SWIZZLE);
  return _VAESKeyGenAssist(Src, KeyGenSwizzle, LoadZeroVector(OpSize::i128Bit), RCON);
}

// AESKEYGENASSIST entry point: feature check, then AESKeyGenAssistImpl, then store.
void OpDispatchBuilder::AESKeyGenAssist(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsAES) {
    UnimplementedOp(Op);
    return;
  }

  Ref Result = AESKeyGenAssistImpl(Op);
  StoreResultFPR(Op, Result);
}

// PCLMULQDQ: 128-bit _PCLMUL of destination and source. Only bits 0 and 4 of the immediate are
// forwarded as the selector. Emits UnimplementedOp when the host lacks 128-bit PMULL.
void OpDispatchBuilder::PCLMULQDQOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsPMULL_128Bit) {
    UnimplementedOp(Op);
    return;
  }

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  // NOTE(review): this static_cast has no target type in this copy of the file; the `<...>` argument
  // looks to have been lost in extraction — restore it from upstream.
  const auto Selector = static_cast(Op->Src[1].Literal());

  auto Res = _PCLMUL(OpSize::i128Bit, Dest, Src, Selector & 0b1'0001);
  StoreResultFPR(Op, Res);
}

// VPCLMULQDQ: three-operand form at the destination's size; the selector comes from Src[2] and is
// masked to bits 0 and 4. Emits UnimplementedOp when the host lacks 128-bit PMULL.
void OpDispatchBuilder::VPCLMULQDQOp(OpcodeArgs) {
  if (!CTX->HostFeatures.SupportsPMULL_128Bit) {
    UnimplementedOp(Op);
    return;
  }

  const auto DstSize = OpSizeFromDst(Op);

  Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags);
  // NOTE(review): static_cast target type missing in this copy, as in PCLMULQDQOp — restore from upstream.
  const auto Selector = static_cast(Op->Src[2].Literal());

  Ref Res = _PCLMUL(DstSize, Src1, Src2, Selector & 0b1'0001);
  StoreResultFPR(Op, Res);
}

} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Interface/Core/OpcodeDispatcher.h"

namespace FEXCore::IR {
// Opcode -> handler table. Handler names such as RSqrt3DNowOp suggest these are the 3DNow! opcodes.
// Each entry is {opcode, 1, handler}; the middle field is presumably the number of consecutive opcodes
// covered — confirm against DispatchTableEntry.
// NOTE(review): some handlers here (e.g. the two Vector_CVT_* entries and VectorUnaryDuplicateOp) may
// have lost `<...>` template arguments in this copy — verify against upstream.
constexpr DispatchTableEntry OpDispatch_DDDTable[] = {
  {0x0C, 1, &OpDispatchBuilder::PI2FWOp},
  {0x0D, 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float},
  {0x1C, 1, &OpDispatchBuilder::PF2IWOp},
  {0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int},

  {0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECPPRECISION, OpSize::i32Bit>},
  {0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, false>},

  {0x8A, 1, &OpDispatchBuilder::PFNACCOp},
  {0x8E, 1, &OpDispatchBuilder::PFPNACCOp},

  {0x90, 1, &OpDispatchBuilder::VPFCMPOp<1>},
  {0x94, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
  {0x96, 1, &OpDispatchBuilder::VectorUnaryDuplicateOp},
  {0x97, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RSqrt3DNowOp, true>},

  {0x9A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
  {0x9E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},

  {0xA0, 1, &OpDispatchBuilder::VPFCMPOp<2>},
  {0xA4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMAX, OpSize::i32Bit>},
  // Can be treated as a move
  {0xA6, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
  {0xA7, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},

  {0xAA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUROp, IR::OP_VFSUB, OpSize::i32Bit>},
  {0xAE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADDP, OpSize::i32Bit>},

  {0xB0, 1, &OpDispatchBuilder::VPFCMPOp<0>},
  {0xB4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMUL, OpSize::i32Bit>},
  // Can be treated as a move
  {0xB6, 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
  {0xB7, 1, &OpDispatchBuilder::PMULHRWOp},

  {0xBB, 1, &OpDispatchBuilder::PSWAPDOp},
  {0xBF, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i8Bit>},
};
} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-to-ir, opcodes|dispatcher-implementations
desc: Handles x86/64 flag generation
$end_info$
*/

// NOTE(review): in this copy of the file every `<...>` span that looks like a tag appears to have been
// stripped: the system includes below have no operand, and several calls (bare `SetRFLAG(...)`,
// `std::array FlagOffsets`) seem to be missing template arguments. Restore from upstream before building.
#include "Interface/Core/OpcodeDispatcher.h"
#include "Interface/Core/X86Tables/X86Tables.h"

#include
#include
#include

#include
#include

namespace FEXCore::IR {
// Flag locations walked by SetPackedRFLAG/GetPackedRFLAG, in this order. The first five entries
// (CF, PF, AF, ZF, SF) are the ones SetPackedRFLAG touches when only the lower byte is written.
constexpr std::array FlagOffsets = {
  FEXCore::X86State::RFLAG_CF_RAW_LOC, FEXCore::X86State::RFLAG_PF_RAW_LOC, FEXCore::X86State::RFLAG_AF_RAW_LOC,
  FEXCore::X86State::RFLAG_ZF_RAW_LOC, FEXCore::X86State::RFLAG_SF_RAW_LOC, FEXCore::X86State::RFLAG_TF_RAW_LOC,
  FEXCore::X86State::RFLAG_IF_LOC,     FEXCore::X86State::RFLAG_DF_RAW_LOC, FEXCore::X86State::RFLAG_OF_RAW_LOC,
  FEXCore::X86State::RFLAG_IOPL_LOC,   FEXCore::X86State::RFLAG_NT_LOC,     FEXCore::X86State::RFLAG_RF_LOC,
  FEXCore::X86State::RFLAG_VM_LOC,     FEXCore::X86State::RFLAG_AC_LOC,     FEXCore::X86State::RFLAG_VIF_LOC,
  FEXCore::X86State::RFLAG_VIP_LOC,    FEXCore::X86State::RFLAG_ID_LOC,
};

// Zeroes PF and AF. PF is written as constant 1 because its stored form is inverted.
void OpDispatchBuilder::ZeroPF_AF() {
  // PF is stored inverted, so invert it when we zero.
  SetRFLAG(Constant(1));
  SetAF(0);
}

// Scatters a packed flags value (Src) into the individual flag stores.
//   Lower8 - when true only the first five FlagOffsets entries are written and deferred flags are
//            flushed first, since the remaining flags (e.g. OF) must survive.
//   Src    - packed flags value; bit N is written to the flag whose location is N.
// Leaves CFInverted set, because CF is written from the inverted source.
void OpDispatchBuilder::SetPackedRFLAG(bool Lower8, Ref Src) {
  size_t NumFlags = FlagOffsets.size();
  if (Lower8) {
    // Calculate flags early.
    // This is only a partial overwrite of flags since OF isn't stored here.
    CalculateDeferredFlags();
    NumFlags = 5;
  }

  // PF and CF are both stored inverted, so hoist the invert.
  auto SrcInverted = _Not(OpSize::i32Bit, Src);

  for (size_t i = 0; i < NumFlags; ++i) {
    const auto FlagOffset = FlagOffsets[i];

    if (FlagOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) {
      // AF is in bit 4 architecturally, and we need to store it to bit 4 of our
      // AF register, with garbage in the other bits. The extract is deferred.
      // We also defer a XOR with the result bit, which is implemented as XOR
      // with PF[4]. But the _Bfe below reliably zeros bit 4 of the PF byte, so
      // that will be a no-op and we get the right result.
      //
      // So we write out the whole flags byte to AF without an extract.
      static_assert(FEXCore::X86State::RFLAG_AF_RAW_LOC == 4);
      SetRFLAG(Src, FEXCore::X86State::RFLAG_AF_RAW_LOC);
    } else if (FlagOffset == FEXCore::X86State::RFLAG_PF_RAW_LOC || FlagOffset == FEXCore::X86State::RFLAG_CF_RAW_LOC) {
      // PF and CF are both stored parity flipped.
      SetRFLAG(SrcInverted, FlagOffset, FlagOffset, true);
    } else {
      SetRFLAG(Src, FlagOffset, FlagOffset, true);
    }
  }

  CFInverted = true;
}

// Gathers the flags selected by FlagsMask into one packed value and returns it.
//   FlagsMask - bit N set means the flag at location N is wanted. The CF bit must be set (asserted).
// Deferred flags are flushed first. CF seeds the value; PF and (when both are requested) SF/ZF are
// merged after the loop; the RFLAG_RESERVED_LOC bit, if requested, is OR'ed in as constant 2.
Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
  // Calculate flags early.
  CalculateDeferredFlags();

  // SF/ZF and N/Z are together on both arm64 and x86_64, so we special case that.
  bool GetNZ = (FlagsMask & (1 << FEXCore::X86State::RFLAG_SF_RAW_LOC)) && (FlagsMask & (1 << FEXCore::X86State::RFLAG_ZF_RAW_LOC));

  // Handle CF first, since it's at bit 0 and hence doesn't need shift or OR.
  LOGMAN_THROW_A_FMT(FlagsMask & (1 << FEXCore::X86State::RFLAG_CF_RAW_LOC), "CF always handled");
  static_assert(FEXCore::X86State::RFLAG_CF_RAW_LOC == 0);

  Ref Original = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);

  for (size_t i = 0; i < FlagOffsets.size(); ++i) {
    const auto FlagOffset = FlagOffsets[i];
    if (!((1U << FlagOffset) & FlagsMask)) {
      continue;
    }

    if ((GetNZ && (FlagOffset == FEXCore::X86State::RFLAG_SF_RAW_LOC || FlagOffset == FEXCore::X86State::RFLAG_ZF_RAW_LOC)) ||
        FlagOffset == FEXCore::X86State::RFLAG_CF_RAW_LOC || FlagOffset == FEXCore::X86State::RFLAG_PF_RAW_LOC) {
      // Handled outside this loop: CF above, PF and the combined SF/ZF below.
      continue;
    }

    // Note that the Bfi only considers the bottom bit of the flag, the rest of
    // the byte is allowed to be garbage.
    Ref Flag;
    if (FlagOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) {
      Flag = LoadAF();
    } else {
      Flag = GetRFLAG(FlagOffset);
    }

    Original = _Orlshl(OpSize::i64Bit, Original, Flag, FlagOffset);
  }

  // Raw PF value needs to have its bottom bit masked out and inverted. The
  // naive sequence is and/eor/orlshl. But we can do the inversion implicitly
  // instead.
  if (FlagsMask & (1 << FEXCore::X86State::RFLAG_PF_RAW_LOC)) {
    // Set every bit except the bottommost.
    auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false, false), _InlineConstant(~1ull));

    // Rotate the bottom bit to the appropriate location for PF, so we get
    // something like 111P1111. Then invert that to get 000p0000. Then OR that
    // into the flags. This is 1 A64 instruction :-)
    auto RightRotation = 64 - FEXCore::X86State::RFLAG_PF_RAW_LOC;
    Original = _Ornror(OpSize::i64Bit, Original, OnesInvPF, RightRotation);
  }

  // OR in the SF/ZF flags at the end, allowing the lshr to fold with the OR
  if (GetNZ) {
    static_assert(FEXCore::X86State::RFLAG_SF_RAW_LOC == (FEXCore::X86State::RFLAG_ZF_RAW_LOC + 1));
    auto NZCV = GetNZCV();
    // Keep only bits 31:30 of NZCV, then shift them down so bit 31 lands on the SF location.
    auto NZ = _And(OpSize::i64Bit, NZCV, _InlineConstant(0b11u << 30));
    Original = _Orlshr(OpSize::i64Bit, Original, NZ, 31 - FEXCore::X86State::RFLAG_SF_RAW_LOC);
  }

  // The constant is OR'ed in at the end, to avoid a pointless or xzr, #2.
  if ((1U << X86State::RFLAG_RESERVED_LOC) & FlagsMask) {
    Original = _Or(OpSize::i64Bit, Original, _InlineConstant(2));
  }

  return Original;
}

// Computes the overflow flag for an add (Sub == false) or subtract (Sub == true) and stores it.
//   SrcSize    - operand size, 8..64 bits (asserted); fixes which bit is the sign bit.
//   Res        - result of the operation.
//   Src1, Src2 - the operands. A constant Src2 selects a shorter expression on Src1 and Res only.
// The value written carries the overflow condition in bit (size - 1); that bit index is passed along
// with the value to SetRFLAG.
void OpDispatchBuilder::CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref Src2, bool Sub) {
  LOGMAN_THROW_A_FMT(SrcSize >= IR::OpSize::i8Bit && SrcSize <= IR::OpSize::i64Bit, "Invalid size");
  const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
  const uint64_t SignBit = IR::OpSizeAsBits(SrcSize) - 1;
  Ref Anded = nullptr;

  // For add, OF is set iff the sources have the same sign but the destination
  // sign differs. If we know a source sign, we can simplify the expression: if
  // source 2 is known to be positive, we set OF if source 1 is positive and
  // the result is negative. Similarly if source 2 is known negative.
  //
  // For sub, OF is set iff the sources have differing signs and the destination
  // sign matches the second source. If source 2 is known positive, set iff
  // source 1 negative and the result positive.
  uint64_t Const;
  if (IsValueConstant(WrapNode(Src2), &Const)) {
    bool Negative = (Const & (1ull << SignBit)) != 0;

    if (Negative ^ Sub) {
      Anded = _Andn(OpSize, Src1, Res);
    } else {
      Anded = _Andn(OpSize, Res, Src1);
    }
  } else {
    auto XorOp1 = _Xor(OpSize, Src1, Src2);
    auto XorOp2 = _Xor(OpSize, Res, Src1);

    if (Sub) {
      Anded = _And(OpSize, XorOp2, XorOp1);
    } else {
      Anded = _Andn(OpSize, XorOp2, XorOp1);
    }
  }

  SetRFLAG(Anded, SignBit, true);
}

// Returns a _Parity node over the stored raw PF value, forwarding Mask and Invert to it.
// Also marks the current block header as reading parity.
Ref OpDispatchBuilder::LoadPFRaw(bool Mask, bool Invert) {
  // Most blocks do not read parity, so PF optimization is gated on this flag.
  CurrentHeader->ReadsParity = true;

  // Evaluate parity on the deferred raw value.
  return _Parity(GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC), Mask, Invert);
}

// Materialises AF: XORs the stored AF word with the stored PF (result) word and extracts the AF bit.
Ref OpDispatchBuilder::LoadAF() {
  // Read the stored value. This is the XOR of the arguments.
  auto AFWord = GetRFLAG(FEXCore::X86State::RFLAG_AF_RAW_LOC);

  // Read the result, stored for PF.
  auto Result = GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC);

  // What's left is to XOR and extract. This is the deferred part. We
  // specifically use a 64-bit Xor here as we don't need masking.
  return _Bfe(OpSize::i32Bit, 1, 4, _Xor(OpSize::i64Bit, AFWord, Result));
}

// Pre-XORs the stored AF word with the stored PF word so that a later LoadAF() yields the AF value
// the caller already placed in AF[4].
void OpDispatchBuilder::FixupAF() {
  // The caller has set a desired value of AF in AF[4], regardless of the value
  // of PF. We need to fixup AF[4] so that we get the right value when we XOR in
  // PF[4] later. The easiest solution is to XOR by PF[4], since:
  //
  //  (AF[4] ^ PF[4]) ^ PF[4] = AF[4]

  auto PFRaw = GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC);
  auto AFRaw = GetRFLAG(FEXCore::X86State::RFLAG_AF_RAW_LOC);

  // Again 64-bit as masking is more expensive.
  Ref XorRes = _Xor(OpSize::i64Bit, AFRaw, PFRaw);

  SetRFLAG(XorRes);
}

// Like FixupAF, but takes the AF value as an argument: AF is shifted left by 4 and XORed with the
// stored PF word in a single _XorShift before being stored.
void OpDispatchBuilder::SetAFAndFixup(Ref AF) {
  // We have a value of AF, we shift into AF[4]. We need to fixup AF[4] so that
  // we get the right value when we XOR in PF[4] later. The easiest solution is
  // to XOR by PF[4], since:
  //
  //  (AF[4] ^ PF[4]) ^ PF[4] = AF[4]

  auto PFRaw = GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC);

  Ref XorRes = _XorShift(OpSize::i32Bit, PFRaw, AF, ShiftType::LSL, 4);
  SetRFLAG(XorRes);
}

// Records Res as the raw PF source; the parity itself is evaluated on read (see LoadPFRaw).
void OpDispatchBuilder::CalculatePF(Ref Res) {
  // Calculation is entirely deferred until load, just store the 8-bit result.
  SetRFLAG(Res);
}

// Records the raw AF word for an operation on Src1 and Src2. Normally this is Src1 ^ Src2; when one
// operand is a known constant the XOR is replaced by the other operand, inverted or not depending on
// the constant's bit 4.
void OpDispatchBuilder::CalculateAF(Ref Src1, Ref Src2) {
  // We only care about bit 4 in the subsequent XOR. If we'll XOR with 0,
  // there's no sense XOR'ing at all. If we'll XOR with 1, that's just
  // inverting.
  // i == 0 tests Src2 for a constant, i == 1 tests Src1.
  for (unsigned i = 0; i < 2; ++i) {
    Ref SrcA = i ? Src1 : Src2;
    Ref SrcB = i ? Src2 : Src1;

    uint64_t Const;
    if (IsValueConstant(WrapNode(SrcA), &Const)) {
      if (Const & (1u << 4)) {
        SetRFLAG(_Not(OpSize::i32Bit, SrcB));
      } else {
        SetRFLAG(SrcB);
      }
      return;
    }
  }

  // We store the XOR of the arguments. At read time, we XOR with the
  // appropriate bit of the result (available as the PF flag) and extract the
  // appropriate bit. Again 64-bit to avoid masking.
  Ref XorRes = Src1 == Src2 ? Constant(0) : _Xor(OpSize::i64Bit, Src1, Src2);
  SetRFLAG(XorRes);
}

// Flushes the cached NZCV value: stores it if it is dirty, then drops the cache and the dirty bit.
void OpDispatchBuilder::CalculateDeferredFlags() {
  if (NZCVDirty && CachedNZCV) {
    _StoreNZCV(CachedNZCV);
  }

  CachedNZCV = nullptr;
  NZCVDirty = false;
}

// Returns Src or Src + 1 depending on the carry flag, via a conditional select-increment whose
// condition is chosen according to the current CFInverted state.
Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) {
  // If CF not inverted, we use .cc since the increment happens when the
  // condition is false. If CF inverted, invert to use .cs. A bit mindbendy.
  return _NZCVSelectIncrement(OpSize, CFInverted ? CondClass::UGE : CondClass::ULT, Src, Src);
}

// Emits add-with-carry of Src1 and Src2, updates AF/PF/NZCV/OF, and returns the result.
// 32/64-bit sizes use _AdcWithFlags directly; 8/16-bit sizes are emulated with 32-bit arithmetic and
// an explicit carry-out comparison.
Ref OpDispatchBuilder::CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
  auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
  Ref Res;

  CalculateAF(Src1, Src2);

  if (SrcSize >= OpSize::i32Bit) {
    RectifyCarryInvert(false);
    HandleNZCV_RMW();
    Res = _AdcWithFlags(OpSize, Src1, Src2);
    CFInverted = false;
  } else {
    // Need to zero-extend for correct comparisons below
    Src2 = ARef(Src2).Bfe(0, IR::OpSizeAsBits(SrcSize)).Ref();

    // Note that we do not extend Src2PlusCF, since we depend on proper
    // 32-bit arithmetic to correctly handle the Src2 = 0xffff case.
    Ref Src2PlusCF = IncrementByCarry(OpSize, Src2);

    // Need to zero-extend for the comparison.
    Res = Add(OpSize, Src1, Src2PlusCF);
    Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);

    // TODO: We can fold that second Bfe in (cmp uxth).
    auto SelectCFInv = Select01(OpSize, CondClass::UGE, Res, Src2PlusCF);

    SetNZ_ZeroCV(SrcSize, Res);
    SetCFInverted(SelectCFInv);
    CalculateOF(SrcSize, Res, Src1, Src2, false);
  }

  CalculatePF(Res);
  return Res;
}

// Emits subtract-with-borrow of Src2 from Src1, updates AF/PF/NZCV/OF, and returns the result.
// 32/64-bit sizes use _SbbWithFlags; 8/16-bit sizes are emulated with 32-bit arithmetic.
Ref OpDispatchBuilder::CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2) {
  auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;

  CalculateAF(Src1, Src2);

  Ref Res;
  if (SrcSize >= OpSize::i32Bit) {
    // Arm's subtraction has inverted CF from x86, so rectify the input and
    // invert the output.
    RectifyCarryInvert(true);
    HandleNZCV_RMW();
    Res = _SbbWithFlags(OpSize, Src1, Src2);
    CFInverted = true;
  } else {
    // Zero extend for correct comparison behaviour with Src1 = 0xffff.
    Src1 = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Src1);
    Src2 = ARef(Src2).Bfe(0, IR::OpSizeAsBits(SrcSize)).Ref();

    auto Src2PlusCF = IncrementByCarry(OpSize, Src2);

    Res = Sub(OpSize, Src1, Src2PlusCF);
    Res = _Bfe(OpSize, IR::OpSizeAsBits(SrcSize), 0, Res);

    auto SelectCFInv = Select01(OpSize, CondClass::UGE, Src1, Src2PlusCF);

    SetNZ_ZeroCV(SrcSize, Res);
    SetCFInverted(SelectCFInv);
    CalculateOF(SrcSize, Res, Src1, Src2, true);
  }

  CalculatePF(Res);
  return Res;
}

// Emits Src1 - Src2 with flags and returns the result.
//   UpdateCF - when false, the previous (inverted) CF is read up front and written back afterwards.
Ref OpDispatchBuilder::CalculateFlags_SUB(IR::OpSize SrcSize, Ref Src1, Ref Src2, bool UpdateCF) {
  // Stash CF before stomping over it
  auto OldCFInv = UpdateCF ? nullptr : GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC, true);

  HandleNZCVWrite();

  CalculateAF(Src1, Src2);

  Ref Res;
  if (SrcSize >= OpSize::i32Bit) {
    Res = SubWithFlags(SrcSize, Src1, Src2);
  } else {
    // Sub-32-bit: flags come from a sized _SubNZCV, the value from a plain 32-bit Sub.
    _SubNZCV(SrcSize, Src1, Src2);
    Res = Sub(OpSize::i32Bit, Src1, Src2);
  }

  CalculatePF(Res);

  // If we're updating CF, we need it to be inverted because SubNZCV is inverted
  // from x86. If we're not updating CF, we need to restore the CF since we
  // stomped over it.
  if (UpdateCF) {
    CFInverted = true;
  } else {
    SetCFInverted(OldCFInv);
  }

  return Res;
}

// Emits Src1 + Src2 with flags and returns the result.
//   UpdateCF - when false, the previous (inverted) CF is read up front and written back afterwards.
Ref OpDispatchBuilder::CalculateFlags_ADD(IR::OpSize SrcSize, Ref Src1, Ref Src2, bool UpdateCF) {
  // Stash CF before stomping over it
  auto OldCFInv = UpdateCF ? nullptr : GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC, true);

  HandleNZCVWrite();

  CalculateAF(Src1, Src2);

  Ref Res;
  if (SrcSize >= OpSize::i32Bit) {
    Res = AddWithFlags(SrcSize, Src1, Src2);
  } else {
    // Sub-32-bit: flags come from a sized _AddNZCV, the value from a plain 32-bit Add.
    _AddNZCV(SrcSize, Src1, Src2);
    Res = Add(OpSize::i32Bit, Src1, Src2);
  }

  CalculatePF(Res);

  // We stomped over CF while calculation flags, restore it.
  if (UpdateCF) {
    // Adds match between x86 and arm64.
    CFInverted = false;
  } else {
    SetCFInverted(OldCFInv);
  }

  return Res;
}

// Sets CF/OF for a signed multiply.
//   Res  - low part of the product, SrcSize wide.
//   High - high part of the product.
// PF and AF are invalidated. CF is left in inverted form (CFInverted = true).
void OpDispatchBuilder::CalculateFlags_MUL(IR::OpSize SrcSize, Ref Res, Ref High) {
  HandleNZCVWrite();
  InvalidatePF_AF();

  // CF and OF are set if the result of the operation can't be fit in to the destination register
  // If the value fits, the high part equals the sign bit of the low part replicated (compared below)
  auto SignBit = _Sbfe(OpSize::i64Bit, 1, IR::OpSizeAsBits(SrcSize) - 1, Res);
  _SubNZCV(OpSize::i64Bit, High, SignBit);

  // If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF
  // undefined, this does what we need after inverting carry.
  auto Zero = _InlineConstant(0);
  _CondSubNZCV(OpSize::i64Bit, Zero, Zero, CondClass::EQ, 0x1 /* nzcV */);
  CFInverted = true;
}

// Sets CF/OF for an unsigned multiply from the high part of the product.
// PF and AF are invalidated. CF is left in inverted form (CFInverted = true).
void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) {
  HandleNZCVWrite();
  InvalidatePF_AF();

  auto Zero = _InlineConstant(0);
  const auto Size = GetOpSize(High);

  // CF and OF are set if the result of the operation can't be fit in to the destination register
  // The high part will be all zero if the result does fit, due to how multiplication behaves
  _SubNZCV(Size, High, Zero);

  // If High = 0, then sets to nZCv. Else sets to nzcV. Since SF/ZF undefined,
  // this does what we need.
  _CondSubNZCV(Size, Zero, Zero, CondClass::EQ, 0x1 /* nzcV */);
  CFInverted = true;
}

// Flags for a logical op: AF is invalidated, then everything else is delegated to SetNZP_ZeroCV.
void OpDispatchBuilder::CalculateFlags_Logical(IR::OpSize SrcSize, Ref Res) {
  InvalidateAF();

  SetNZP_ZeroCV(SrcSize, Res);
}

// Flags for a left shift by a known immediate.
//   UnmaskedRes - shift result (not masked to SrcSize, per the name).
//   Src1        - value before shifting.
//   Shift       - shift amount; 0 leaves all flags untouched.
void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Ref UnmaskedRes, Ref Src1, uint64_t Shift) {
  // No flags changed if shift is zero
  if (Shift == 0) {
    return;
  }

  auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;

  SetNZ_ZeroCV(SrcSize, UnmaskedRes);

  // CF
  {
    // Extract the last bit shifted in to CF.  Shift is already masked, but for
    // 8/16-bit it might be >= SrcSizeBits, in which case CF is cleared. There's
    // nothing to do in that case since we already cleared CF above.
    const auto SrcSizeBits = IR::OpSizeAsBits(SrcSize);
    if (Shift < SrcSizeBits) {
      SetCFDirect(Src1, SrcSizeBits - Shift, true);
    }
  }

  CalculatePF(UnmaskedRes);
  InvalidateAF();

  // OF
  // In the case of left shift. OF is only set from the result of <Top Source Bit> XOR <Top Result Bit>
  if (Shift == 1) {
    auto Xor = _Xor(OpSize, UnmaskedRes, Src1);
    SetRFLAG(Xor, IR::OpSizeAsBits(SrcSize) - 1, true);
  } else {
    // Undefined, we choose to zero as part of SetNZ_ZeroCV
  }
}

// Flags for an arithmetic right shift by a known immediate. Shift == 0 leaves all flags untouched.
// OF is never written here beyond the zeroing done by SetNZ_ZeroCV.
void OpDispatchBuilder::CalculateFlags_SignShiftRightImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift) {
  // No flags changed if shift is zero
  if (Shift == 0) {
    return;
  }

  SetNZ_ZeroCV(SrcSize, Res);

  // Extract the last bit shifted in to CF
  SetCFDirect(Src1, Shift - 1, true);

  CalculatePF(Res);
  InvalidateAF();

  // OF
  // Only defined when Shift is 1 else undefined. Only is set if the top bit was set to 1 when
  // shifted So it is set to zero.  In the undefined case we choose to zero as well. Since it was
  // already zeroed there's nothing to do here.
}

// Shared part of the logical right-shift flag helpers: NZ (with C/V zeroed), CF from bit (Shift - 1)
// of Src1, PF from Res, AF invalidated. Callers handle the Shift == 0 early-out and OF.
void OpDispatchBuilder::CalculateFlags_ShiftRightImmediateCommon(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift) {
  // Set SF and PF. Clobbers OF, but OF only defined for Shift = 1 where it is
  // set below.
  SetNZ_ZeroCV(SrcSize, Res);

  // Extract the last bit shifted in to CF
  SetCFDirect(Src1, Shift - 1, true);

  CalculatePF(Res);
  InvalidateAF();
}

// Flags for a logical right shift by a known immediate. Shift == 0 leaves all flags untouched; for
// Shift == 1, OF is taken from the top bit of the original value.
void OpDispatchBuilder::CalculateFlags_ShiftRightImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift) {
  // No flags changed if shift is zero
  if (Shift == 0) {
    return;
  }

  CalculateFlags_ShiftRightImmediateCommon(SrcSize, Res, Src1, Shift);

  // OF
  {
    // Only defined when Shift is 1 else undefined
    // Is set to the MSB of the original value
    if (Shift == 1) {
      SetRFLAG(Src1, IR::OpSizeAsBits(SrcSize) - 1, true);
    }
  }
}

// Flags for a double-precision right shift by a known immediate. Shift == 0 leaves all flags
// untouched; for Shift == 1, OF is the top bit of (Src1 ^ Res).
void OpDispatchBuilder::CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift) {
  // No flags changed if shift is zero
  if (Shift == 0) {
    return;
  }

  const auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;
  CalculateFlags_ShiftRightImmediateCommon(SrcSize, Res, Src1, Shift);

  // OF
  {
    // Only defined when Shift is 1 else undefined
    // Is set if the MSB bit changes.
    // XOR of Result and Src1
    if (Shift == 1) {
      auto val = _Xor(OpSize, Src1, Res);
      SetRFLAG(val, IR::OpSizeAsBits(SrcSize) - 1, true);
    }
  }
}

// Flags for a zero-count result: NZ from the result, CF from the bit at log2(operand width in bits).
void OpDispatchBuilder::CalculateFlags_ZCNT(IR::OpSize SrcSize, Ref Result) {
  // OF, SF, AF, PF all undefined
  // Test ZF of result, SF is undefined so this is ok.
  SetNZ_ZeroCV(SrcSize, Result);

  // Now set CF if the Result = SrcSize * 8. Since SrcSize is a power-of-two and
  // Result is <= SrcSize * 8, we equivalently check if the log2(SrcSize * 8)
  // bit is set. No masking is needed because no higher bits could be set.
  unsigned CarryBit = FEXCore::ilog2(IR::OpSizeAsBits(SrcSize));
  SetCFDirect(Result, CarryBit);
}

} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/H0F38Tables.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Interface/Core/OpcodeDispatcher.h"

namespace FEXCore::IR {
// Table key: prefix bits in bits 8 and up, opcode byte in the low 8 bits.
#define OPD(prefix, opcode) (((prefix) << 8) | opcode)
// Prefix selectors; these are bit flags and can be combined (see the PF_38_66 | PF_38_F2 entries).
constexpr uint16_t PF_38_NONE = 0;
constexpr uint16_t PF_38_66 = (1U << 0);
constexpr uint16_t PF_38_F2 = (1U << 1);
constexpr uint16_t PF_38_F3 = (1U << 2);

// Handler table keyed by OPD(prefix, opcode); judging by the file name this is the 0F 38 opcode map.
// NOTE(review): several runs of entries below name the same bare handler (e.g. PHSUB, PSIGN,
// ExtendVectorElements). They probably carried distinct `<...>` template arguments that were lost in
// this copy — verify against upstream.
constexpr DispatchTableEntry OpDispatch_H0F38Table[] = {
  {OPD(PF_38_NONE, 0x00), 1, &OpDispatchBuilder::PSHUFBOp},
  {OPD(PF_38_66, 0x00), 1, &OpDispatchBuilder::PSHUFBOp},
  {OPD(PF_38_NONE, 0x01), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADDP, OpSize::i16Bit>},
  {OPD(PF_38_66, 0x01), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADDP, OpSize::i16Bit>},
  {OPD(PF_38_NONE, 0x02), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADDP, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x02), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADDP, OpSize::i32Bit>},
  {OPD(PF_38_NONE, 0x03), 1, &OpDispatchBuilder::PHADDS},
  {OPD(PF_38_66, 0x03), 1, &OpDispatchBuilder::PHADDS},
  {OPD(PF_38_NONE, 0x04), 1, &OpDispatchBuilder::PMADDUBSW},
  {OPD(PF_38_66, 0x04), 1, &OpDispatchBuilder::PMADDUBSW},
  {OPD(PF_38_NONE, 0x05), 1, &OpDispatchBuilder::PHSUB},
  {OPD(PF_38_66, 0x05), 1, &OpDispatchBuilder::PHSUB},
  {OPD(PF_38_NONE, 0x06), 1, &OpDispatchBuilder::PHSUB},
  {OPD(PF_38_66, 0x06), 1, &OpDispatchBuilder::PHSUB},
  {OPD(PF_38_NONE, 0x07), 1, &OpDispatchBuilder::PHSUBS},
  {OPD(PF_38_66, 0x07), 1, &OpDispatchBuilder::PHSUBS},
  {OPD(PF_38_NONE, 0x08), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_66, 0x08), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_NONE, 0x09), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_66, 0x09), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_NONE, 0x0A), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_66, 0x0A), 1, &OpDispatchBuilder::PSIGN},
  {OPD(PF_38_NONE, 0x0B), 1, &OpDispatchBuilder::PMULHRSW},
  {OPD(PF_38_66, 0x0B), 1, &OpDispatchBuilder::PMULHRSW},
  {OPD(PF_38_66, 0x10), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorVariableBlend, OpSize::i8Bit>},
  {OPD(PF_38_66, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorVariableBlend, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorVariableBlend, OpSize::i64Bit>},
  {OPD(PF_38_66, 0x17), 1, &OpDispatchBuilder::PTestOp},
  {OPD(PF_38_NONE, 0x1C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i8Bit>},
  {OPD(PF_38_66, 0x1C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i8Bit>},
  {OPD(PF_38_NONE, 0x1D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i16Bit>},
  {OPD(PF_38_66, 0x1D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i16Bit>},
  {OPD(PF_38_NONE, 0x1E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x1E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VABS, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x20), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x21), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x22), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x23), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x24), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x25), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x28), 1, &OpDispatchBuilder::PMULLOp},
  {OPD(PF_38_66, 0x29), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i64Bit>},
  {OPD(PF_38_66, 0x2A), 1, &OpDispatchBuilder::MOVVectorNTOp},
  {OPD(PF_38_66, 0x2B), 1, &OpDispatchBuilder::PACKUSOp},
  {OPD(PF_38_66, 0x30), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x31), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x32), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x33), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x34), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x35), 1, &OpDispatchBuilder::ExtendVectorElements},
  {OPD(PF_38_66, 0x37), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i64Bit>},
  {OPD(PF_38_66, 0x38), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMIN, OpSize::i8Bit>},
  {OPD(PF_38_66, 0x39), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMIN, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x3A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMIN, OpSize::i16Bit>},
  {OPD(PF_38_66, 0x3B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMIN, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x3C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMAX, OpSize::i8Bit>},
  {OPD(PF_38_66, 0x3D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMAX, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x3E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMAX, OpSize::i16Bit>},
  {OPD(PF_38_66, 0x3F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMAX, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x40), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VMUL, OpSize::i32Bit>},
  {OPD(PF_38_66, 0x41), 1, &OpDispatchBuilder::PHMINPOSUWOp},

  {OPD(PF_38_NONE, 0xC8), 1, &OpDispatchBuilder::SHA1NEXTEOp},
  {OPD(PF_38_NONE, 0xC9), 1, &OpDispatchBuilder::SHA1MSG1Op},
  {OPD(PF_38_NONE, 0xCA), 1, &OpDispatchBuilder::SHA1MSG2Op},
  {OPD(PF_38_NONE, 0xCB), 1, &OpDispatchBuilder::SHA256RNDS2Op},
  {OPD(PF_38_NONE, 0xCC), 1, &OpDispatchBuilder::SHA256MSG1Op},
  {OPD(PF_38_NONE, 0xCD), 1, &OpDispatchBuilder::SHA256MSG2Op},

  {OPD(PF_38_66, 0xDB), 1, &OpDispatchBuilder::AESImcOp},
  {OPD(PF_38_66, 0xDC), 1, &OpDispatchBuilder::AESEncOp},
  {OPD(PF_38_66, 0xDD), 1, &OpDispatchBuilder::AESEncLastOp},
  {OPD(PF_38_66, 0xDE), 1, &OpDispatchBuilder::AESDecOp},
  {OPD(PF_38_66, 0xDF), 1, &OpDispatchBuilder::AESDecLastOp},

  // The MOVBE entries are the only ones in this table with a middle field of 2 rather than 1.
  {OPD(PF_38_NONE, 0xF0), 2, &OpDispatchBuilder::MOVBEOp},
  {OPD(PF_38_66, 0xF0), 2, &OpDispatchBuilder::MOVBEOp},

  {OPD(PF_38_F2, 0xF0), 1, &OpDispatchBuilder::CRC32},
  {OPD(PF_38_F2, 0xF1), 1, &OpDispatchBuilder::CRC32},
  {OPD(PF_38_66 | PF_38_F2, 0xF0), 1, &OpDispatchBuilder::CRC32},
  {OPD(PF_38_66 | PF_38_F2, 0xF1), 1, &OpDispatchBuilder::CRC32},

  {OPD(PF_38_66, 0xF6), 1, &OpDispatchBuilder::ADXOp},
  {OPD(PF_38_F3, 0xF6), 1, &OpDispatchBuilder::ADXOp},
};

#undef OPD
} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/H0F3ATables.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "Interface/Core/OpcodeDispatcher.h"

namespace FEXCore::IR {
// Table key: REX selector in bit 9, prefix selector in bit 8, opcode byte in the low 8 bits.
#define OPD(REX, prefix, opcode) ((REX << 9) | (prefix << 8) | opcode)
#define PF_3A_NONE 0
#define PF_3A_66 1

// Builds, at compile time, the table of handlers that are registered identically under both REX
// selector values: the inner generator is instantiated for 0 and 1 and the two arrays are concatenated.
// NOTE(review): the lambda template parameter lists and the std::array template arguments below look
// to have been stripped in this copy (`REX`, `N1`, `N2` are used but never declared) — restore from
// upstream.
constexpr auto OpDispatchTableGenH0F3A = []() consteval {
  constexpr auto OpDispatchTableGenH0F3AREX = []() consteval {
    constexpr DispatchTableEntry Table[] = {
      {OPD(REX, PF_3A_66, 0x08), 1, &OpDispatchBuilder::VectorRound},
      {OPD(REX, PF_3A_66, 0x09), 1, &OpDispatchBuilder::VectorRound},
      {OPD(REX, PF_3A_66, 0x0A), 1, &OpDispatchBuilder::InsertScalarRound},
      {OPD(REX, PF_3A_66, 0x0B), 1, &OpDispatchBuilder::InsertScalarRound},
      {OPD(REX, PF_3A_66, 0x0C), 1, &OpDispatchBuilder::VectorBlend},
      {OPD(REX, PF_3A_66, 0x0D), 1, &OpDispatchBuilder::VectorBlend},
      {OPD(REX, PF_3A_66, 0x0E), 1, &OpDispatchBuilder::VectorBlend},

      {OPD(REX, PF_3A_NONE, 0x0F), 1, &OpDispatchBuilder::PAlignrOp},
      {OPD(REX, PF_3A_66, 0x0F), 1, &OpDispatchBuilder::PAlignrOp},

      {OPD(REX, PF_3A_66, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i8Bit>},
      {OPD(REX, PF_3A_66, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i16Bit>},
      {OPD(REX, PF_3A_66, 0x17), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i32Bit>},

      {OPD(REX, PF_3A_66, 0x20), 1, &OpDispatchBuilder::PINSROp},
      {OPD(REX, PF_3A_66, 0x21), 1, &OpDispatchBuilder::InsertPSOp},
      {OPD(REX, PF_3A_66, 0x40), 1, &OpDispatchBuilder::DPPOp},
      {OPD(REX, PF_3A_66, 0x41), 1, &OpDispatchBuilder::DPPOp},
      {OPD(REX, PF_3A_66, 0x42), 1, &OpDispatchBuilder::MPSADBWOp},
      {OPD(REX, PF_3A_66, 0x44), 1, &OpDispatchBuilder::PCLMULQDQOp},

      {OPD(REX, PF_3A_66, 0x60), 1, &OpDispatchBuilder::VPCMPESTRMOp},
      {OPD(REX, PF_3A_66, 0x61), 1, &OpDispatchBuilder::VPCMPESTRIOp},
      {OPD(REX, PF_3A_66, 0x62), 1, &OpDispatchBuilder::VPCMPISTRMOp},
      {OPD(REX, PF_3A_66, 0x63), 1, &OpDispatchBuilder::VPCMPISTRIOp},

      {OPD(REX, PF_3A_NONE, 0xCC), 1, &OpDispatchBuilder::SHA1RNDS4Op},
      {OPD(REX, PF_3A_66, 0xDF), 1, &OpDispatchBuilder::AESKeyGenAssist},
    };
    return std::to_array(Table);
  };

  auto REX0 = OpDispatchTableGenH0F3AREX.template operator()<0>();
  auto REX1 = OpDispatchTableGenH0F3AREX.template operator()<1>();
  // Element-wise copy of lhs followed by rhs into one array.
  auto concat = [](const std::array& lhs, const std::array& rhs) consteval -> std::array {
    std::array Table {};
    for (size_t i = 0; i < N1; ++i) {
      Table[i] = lhs[i];
    }

    for (size_t i = 0; i < N2; ++i) {
      Table[N1 + i] = rhs[i];
    }

    return Table;
  };

  return concat(REX0, REX1);
};

// Entries registered under both REX selector values (generated above).
constexpr auto OpDispatch_H0F3ATableIgnoreREX = OpDispatchTableGenH0F3A();

// Entries registered only under REX selector 0.
constexpr DispatchTableEntry OpDispatch_H0F3ATableNeedsREX0[] = {
  {OPD(0, PF_3A_66, 0x16), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i32Bit>},
  {OPD(0, PF_3A_66, 0x22), 1, &OpDispatchBuilder::PINSROp},
};

#undef PF_3A_NONE
#undef PF_3A_66
#undef OPD

} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/PrimaryGroupTables.h
================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/OpcodeDispatcher.h" namespace FEXCore::IR { using X86Tables::OpToIndex; #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_1) << 6) | (prefix) << 3 | (Reg)) constexpr DispatchTableEntry OpDispatch_PrimaryGroupTables[] = { // GROUP 1 {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 0), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 1), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ADCOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SBBOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 4), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 5), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 6), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::CMPOp, 1>}, // CMP {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 0), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 1), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ADCOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SBBOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 4), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 5), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 6), 1, &OpDispatchBuilder::SecondaryALUOp}, 
{OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::CMPOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 0), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 1), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ADCOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SBBOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 4), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 5), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 6), 1, &OpDispatchBuilder::SecondaryALUOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::CMPOp, 1>}, // GROUP 2 {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, true, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, true, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 2), 1, &OpDispatchBuilder::RCLOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 3), 1, &OpDispatchBuilder::RCROp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHRImmediateOp, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, false>}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC0), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, true, false>}, // SAR 
{OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, true, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, true, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 2), 1, &OpDispatchBuilder::RCLOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 3), 1, &OpDispatchBuilder::RCROp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHRImmediateOp, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, false>}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xC1), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, true, false>}, // SAR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, true, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, true, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 2), 1, &OpDispatchBuilder::RCLOp1Bit}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 3), 1, &OpDispatchBuilder::RCROp8x1Bit}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHRImmediateOp, true>}, // 1Bit SHR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, true>}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD0), 7), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, true, true>}, // SAR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, true, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, true, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 2), 1, &OpDispatchBuilder::RCLOp1Bit}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 3), 1, &OpDispatchBuilder::RCROp1Bit}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, true>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHRImmediateOp, true>}, // 1Bit SHR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHLImmediateOp, true>}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD1), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, true, true>}, // SAR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, false, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, false, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 2), 1, &OpDispatchBuilder::RCLSmallerOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 3), 1, &OpDispatchBuilder::RCRSmallerOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 4), 1, &OpDispatchBuilder::SHLOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 5), 1, &OpDispatchBuilder::SHROp}, // SHR by CL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 6), 1, &OpDispatchBuilder::SHLOp}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD2), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, false, false>}, // 
SAR {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, true, false, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::RotateOp, false, false, false>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 2), 1, &OpDispatchBuilder::RCLOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 3), 1, &OpDispatchBuilder::RCROp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 4), 1, &OpDispatchBuilder::SHLOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 5), 1, &OpDispatchBuilder::SHROp}, // SHR by CL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 6), 1, &OpDispatchBuilder::SHLOp}, // SAL {OPD(FEXCore::X86Tables::TYPE_GROUP_2, OpToIndex(0xD3), 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::ASHROp, false, false>}, // SAR // GROUP 3 {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 2), 1, &OpDispatchBuilder::NOTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 3), 1, &OpDispatchBuilder::NEGOp}, // NEG {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 4), 1, &OpDispatchBuilder::MULOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 5), 1, &OpDispatchBuilder::IMULOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 6), 1, &OpDispatchBuilder::DIVOp}, // DIV {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF6), 7), 1, &OpDispatchBuilder::IDIVOp}, // IDIV {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::TESTOp, 1>}, 
{OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 2), 1, &OpDispatchBuilder::NOTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 3), 1, &OpDispatchBuilder::NEGOp}, // NEG {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 4), 1, &OpDispatchBuilder::MULOp}, // MUL {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 5), 1, &OpDispatchBuilder::IMULOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 6), 1, &OpDispatchBuilder::DIVOp}, // DIV {OPD(FEXCore::X86Tables::TYPE_GROUP_3, OpToIndex(0xF7), 7), 1, &OpDispatchBuilder::IDIVOp}, // IDIV // GROUP 4 {OPD(FEXCore::X86Tables::TYPE_GROUP_4, OpToIndex(0xFE), 0), 1, &OpDispatchBuilder::INCOp}, // INC {OPD(FEXCore::X86Tables::TYPE_GROUP_4, OpToIndex(0xFE), 1), 1, &OpDispatchBuilder::DECOp}, // DEC // GROUP 5 {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 0), 1, &OpDispatchBuilder::INCOp}, // INC {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 1), 1, &OpDispatchBuilder::DECOp}, // DEC {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 2), 1, &OpDispatchBuilder::CALLAbsoluteOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 3), 1, &OpDispatchBuilder::CALLFARIndirectOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 4), 1, &OpDispatchBuilder::JUMPAbsoluteOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 5), 1, &OpDispatchBuilder::JUMPFARIndirectOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_5, OpToIndex(0xFF), 6), 1, &OpDispatchBuilder::PUSHOp}, // GROUP 11 {OPD(FEXCore::X86Tables::TYPE_GROUP_11, OpToIndex(0xC6), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVGPROp, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_11, OpToIndex(0xC7), 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVGPROp, 1>}, }; #undef OPD } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryGroupTables.h ================================================ // SPDX-License-Identifier: MIT 
#pragma once #include "Interface/Core/OpcodeDispatcher.h" namespace FEXCore::IR { #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_6) << 5) | (prefix) << 3 | (Reg)) constexpr uint16_t PF_NONE = 0; constexpr uint16_t PF_F3 = 1; constexpr uint16_t PF_66 = 2; constexpr uint16_t PF_F2 = 3; constexpr DispatchTableEntry OpDispatch_SecondaryGroupTables[] = { // GROUP 6 {OPD(FEXCore::X86Tables::TYPE_GROUP_6, PF_NONE, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_6, PF_F3, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_6, PF_66, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_6, PF_F2, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, // GROUP 7 {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 0), 1, &OpDispatchBuilder::SGDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 0), 1, &OpDispatchBuilder::SGDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 0), 1, &OpDispatchBuilder::SGDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 0), 1, &OpDispatchBuilder::SGDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 1), 1, &OpDispatchBuilder::SIDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 1), 1, &OpDispatchBuilder::SIDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 1), 1, &OpDispatchBuilder::SIDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 1), 1, &OpDispatchBuilder::SIDTOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 3), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 4), 1, &OpDispatchBuilder::SMSWOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 4), 1, 
&OpDispatchBuilder::SMSWOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 4), 1, &OpDispatchBuilder::SMSWOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 4), 1, &OpDispatchBuilder::SMSWOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 6), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 6), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 6), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 6), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_NONE, 7), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F3, 7), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_66, 7), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_7, PF_F2, 7), 1, &OpDispatchBuilder::PermissionRestrictedOp}, // GROUP 8 {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_NONE, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTNone>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F3, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTNone>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_66, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTNone>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F2, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTNone>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_NONE, 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTSet>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F3, 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTSet>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_66, 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTSet>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F2, 5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, 
BTAction::BTSet>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_NONE, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTClear>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F3, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTClear>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_66, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTClear>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F2, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTClear>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_NONE, 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTComplement>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F3, 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTComplement>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_66, 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTComplement>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_8, PF_F2, 7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 1, BTAction::BTComplement>}, // GROUP 9 {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_NONE, 1), 1, &OpDispatchBuilder::CMPXCHGPairOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_NONE, 6), 1, &OpDispatchBuilder::RDRANDOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_NONE, 7), 1, &OpDispatchBuilder::RDRANDOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_66, 1), 1, &OpDispatchBuilder::CMPXCHGPairOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_66, 6), 1, &OpDispatchBuilder::RDRANDOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_66, 7), 1, &OpDispatchBuilder::RDRANDOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_F2, 1), 1, &OpDispatchBuilder::CMPXCHGPairOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_F3, 1), 1, &OpDispatchBuilder::CMPXCHGPairOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_9, PF_F3, 7), 1, &OpDispatchBuilder::RDPIDOp}, // GROUP 12 {OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_NONE, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i16Bit>}, 
{OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_NONE, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAIOp, OpSize::i16Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_NONE, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i16Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_66, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i16Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_66, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAIOp, OpSize::i16Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_12, PF_66, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i16Bit>}, // GROUP 13 {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_NONE, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i32Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_NONE, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAIOp, OpSize::i32Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_NONE, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i32Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_66, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i32Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_66, 4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAIOp, OpSize::i32Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_13, PF_66, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i32Bit>}, // GROUP 14 {OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_NONE, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i64Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_NONE, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i64Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_66, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLI, OpSize::i64Bit>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_66, 3), 1, &OpDispatchBuilder::PSRLDQ}, {OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_66, 6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLLI, OpSize::i64Bit>}, 
{OPD(FEXCore::X86Tables::TYPE_GROUP_14, PF_66, 7), 1, &OpDispatchBuilder::PSLLDQ}, // GROUP 15 {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 0), 1, &OpDispatchBuilder::FXSaveOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 1), 1, &OpDispatchBuilder::FXRStoreOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 2), 1, &OpDispatchBuilder::LDMXCSR}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 3), 1, &OpDispatchBuilder::STMXCSR}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 4), 1, &OpDispatchBuilder::XSaveOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 5), 1, &OpDispatchBuilder::LoadFenceOrXRSTOR}, // LFENCE (or XRSTOR) {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 6), 1, &OpDispatchBuilder::MemFenceOrXSAVEOPT}, // MFENCE (or XSAVEOPT) {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_NONE, 7), 1, &OpDispatchBuilder::StoreFenceOrCLFlush}, // SFENCE (or CLFLUSH) {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_F3, 5), 1, &OpDispatchBuilder::UnimplementedOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_F3, 6), 1, &OpDispatchBuilder::UMonitorOrCLRSSBSY}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_66, 6), 1, &OpDispatchBuilder::CLWBOrTPause}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_66, 7), 1, &OpDispatchBuilder::CLFLUSHOPT}, {OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_F2, 6), 1, &OpDispatchBuilder::UMWaitOp}, // GROUP 16 {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, true, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 2>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 3>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 4), 4, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 
0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, true, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 2>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 3>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 4), 4, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, true, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 2>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 3>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 4), 4, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, true, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 2>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 3>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 4), 4, &OpDispatchBuilder::NOPOp}, // GROUP 17 {OPD(FEXCore::X86Tables::TYPE_GROUP_17, PF_66, 0), 1, &OpDispatchBuilder::Extrq_imm}, // GROUP P {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, false, false, 1>}, 
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, true, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Prefetch, true, false, 1>}, {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 3), 5, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_F3, 0), 8, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_66, 0), 8, &OpDispatchBuilder::NOPOp}, {OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_F2, 0), 8, &OpDispatchBuilder::NOPOp}, }; #undef OPD } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryModRMTables.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/OpcodeDispatcher.h" namespace FEXCore::IR { constexpr DispatchTableEntry OpDispatch_SecondaryModRMTables[] = { // REG /1 {((0 << 3) | 0), 1, &OpDispatchBuilder::UnimplementedOp}, {((0 << 3) | 1), 1, &OpDispatchBuilder::UnimplementedOp}, // REG /2 {((1 << 3) | 0), 1, &OpDispatchBuilder::XGetBVOp}, // REG /3 {((2 << 3) | 7), 1, &OpDispatchBuilder::PermissionRestrictedOp}, // REG /7 {((3 << 3) | 0), 1, &OpDispatchBuilder::PermissionRestrictedOp}, {((3 << 3) | 1), 1, &OpDispatchBuilder::RDTSCPOp}, {((3 << 3) | 4), 1, &OpDispatchBuilder::CLZeroOp}, }; } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryTables.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/OpcodeDispatcher.h" namespace FEXCore::IR { constexpr DispatchTableEntry OpDispatch_TwoByteOpTable[] = { // Instructions {0x03, 1, &OpDispatchBuilder::LSLOp}, {0x06, 1, &OpDispatchBuilder::PermissionRestrictedOp}, {0x07, 1, &OpDispatchBuilder::PermissionRestrictedOp}, {0x0B, 1, &OpDispatchBuilder::INTOp}, {0x0E, 
1, &OpDispatchBuilder::X87EMMS},
  // Entry layout: {first opcode byte, N, handler}. N is presumably the number of consecutive
  // opcodes served by the same handler (e.g. 16 for the 0x40, 0x80 and 0x90 rows) - confirm
  // against the DispatchTableEntry definition.
  {0x19, 7, &OpDispatchBuilder::NOPOp}, // NOP with ModRM
  {0x20, 4, &OpDispatchBuilder::PermissionRestrictedOp},
  {0x30, 1, &OpDispatchBuilder::PermissionRestrictedOp},
  {0x31, 1, &OpDispatchBuilder::RDTSCOp},
  {0x32, 2, &OpDispatchBuilder::PermissionRestrictedOp},
  {0x34, 3, &OpDispatchBuilder::UnimplementedOp},
  {0x40, 16, &OpDispatchBuilder::CMOVOp},
  {0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
  {0x6F, 1, &OpDispatchBuilder::MOVQMMXOp},
  {0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::MMX>},
  {0x7F, 1, &OpDispatchBuilder::MOVQMMXOp},
  {0x80, 16, &OpDispatchBuilder::CondJUMPOp},
  {0x90, 16, &OpDispatchBuilder::SETccOp},
  {0xA2, 1, &OpDispatchBuilder::CPUIDOp},
  {0xA3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 0, BTAction::BTNone>}, // BT
  {0xA4, 1, &OpDispatchBuilder::SHLDImmediateOp},
  {0xA5, 1, &OpDispatchBuilder::SHLDOp},
  {0xAB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 0, BTAction::BTSet>}, // BTS
  {0xAC, 1, &OpDispatchBuilder::SHRDImmediateOp},
  {0xAD, 1, &OpDispatchBuilder::SHRDOp},
  {0xAF, 1, &OpDispatchBuilder::IMUL1SrcOp},
  {0xB0, 2, &OpDispatchBuilder::CMPXCHGOp}, // CMPXCHG
  {0xB3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 0, BTAction::BTClear>}, // BTR
  {0xB6, 2, &OpDispatchBuilder::MOVZXOp},
  {0xBB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::BTOp, 0, BTAction::BTComplement>}, // BTC
  {0xBC, 1, &OpDispatchBuilder::BSFOp}, // BSF
  {0xBD, 1, &OpDispatchBuilder::BSROp}, // BSR
  {0xBE, 2, &OpDispatchBuilder::MOVSXOp},
  {0xC0, 2, &OpDispatchBuilder::XADDOp},
  {0xC3, 1, &OpDispatchBuilder::MOVGPRNTOp},
  {0xC4, 1, &OpDispatchBuilder::PINSROp},
  {0xC5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i16Bit>},
  {0xC8, 8, &OpDispatchBuilder::BSWAPOp},

  // SSE
  {0x10, 2, &OpDispatchBuilder::MOVVectorUnalignedOp},
  {0x12, 2, &OpDispatchBuilder::MOVLPOp},
  {0x14, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i32Bit>},
  {0x15, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i32Bit>},
  {0x16, 2, &OpDispatchBuilder::MOVHPDOp},
  {0x28, 2, &OpDispatchBuilder::MOVVectorAlignedOp},
  {0x2A, 1, &OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float},
  {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp},
  {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int},
  {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int},
  {0x2E, 2, &OpDispatchBuilder::UCOMISxOp},
  {0x50, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i32Bit>},
  {0x51, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFSQRT, OpSize::i32Bit>},
  {0x52, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>},
  {0x53, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>},
  // NOTE(review): 0x54 and 0x56 bind OpSize::i128Bit while 0x55 binds OpSize::i64Bit - confirm
  // that this difference is intentional.
  {0x54, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VAND, OpSize::i128Bit>},
  {0x55, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUROp, IR::OP_VANDN, OpSize::i64Bit>},
  {0x56, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VOR, OpSize::i128Bit>},
  {0x57, 1, &OpDispatchBuilder::VectorXOROp},
  {0x58, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i32Bit>},
  {0x59, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMUL, OpSize::i32Bit>},
  {0x5A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Vector_CVT_Float_To_Float, OpSize::i64Bit, OpSize::i32Bit, false>},
  {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float},
  {0x5C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>},
  {0x5D, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>},
  {0x5E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFDIV, OpSize::i32Bit>},
  {0x5F, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMAX, OpSize::i32Bit>},
  {0x60, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i8Bit>},
  {0x61, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i16Bit>},
  {0x62, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i32Bit>},
  {0x63, 1, &OpDispatchBuilder::PACKSSOp},
  {0x64, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i8Bit>},
  {0x65, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i16Bit>},
  {0x66, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i32Bit>},
  {0x67, 1, &OpDispatchBuilder::PACKUSOp},
  {0x68, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i8Bit>},
  {0x69, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i16Bit>},
  {0x6A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i32Bit>},
  {0x6B, 1, &OpDispatchBuilder::PACKSSOp},
  {0x70, 1, &OpDispatchBuilder::PSHUFW8ByteOp},

  {0x74, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i8Bit>},
  {0x75, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i16Bit>},
  {0x76, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i32Bit>},
  {0x77, 1, &OpDispatchBuilder::X87EMMS},

  {0xC2, 1, &OpDispatchBuilder::VFCMPOp},
  {0xC6, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHUFOp, OpSize::i32Bit>},

  {0xD1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i16Bit>},
  {0xD2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i32Bit>},
  {0xD3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i64Bit>},
  {0xD4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i64Bit>},
  {0xD5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VMUL, OpSize::i16Bit>},
  {0xD7, 1, &OpDispatchBuilder::MOVMSKOpOne}, // PMOVMSKB
  {0xD8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, OpSize::i8Bit>},
  {0xD9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, OpSize::i16Bit>},
  {0xDA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMIN, OpSize::i8Bit>},
  {0xDB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VAND, OpSize::i64Bit>},
  {0xDC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQADD, OpSize::i8Bit>},
  {0xDD, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQADD, OpSize::i16Bit>},
  {0xDE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMAX, OpSize::i8Bit>},
  {0xDF, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUROp, IR::OP_VANDN, OpSize::i64Bit>},
  {0xE0, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i8Bit>},
  {0xE1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAOp, OpSize::i16Bit>},
  {0xE2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAOp, OpSize::i32Bit>},
  {0xE3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i16Bit>},
  {0xE4, 1, &OpDispatchBuilder::PMULHW},
  {0xE5, 1, &OpDispatchBuilder::PMULHW},
  {0xE7, 1, &OpDispatchBuilder::MOVVectorNTOp},
  {0xE8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i8Bit>},
  {0xE9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i16Bit>},
  {0xEA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMIN, OpSize::i16Bit>},
  {0xEB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VOR, OpSize::i64Bit>},
  {0xEC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQADD, OpSize::i8Bit>},
  {0xED, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQADD, OpSize::i16Bit>},
  {0xEE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMAX, OpSize::i16Bit>},
  {0xEF, 1, &OpDispatchBuilder::VectorXOROp},

  {0xF1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i16Bit>},
  {0xF2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i32Bit>},
  {0xF3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i64Bit>},
  {0xF4, 1, &OpDispatchBuilder::PMULLOp},
  {0xF5, 1, &OpDispatchBuilder::PMADDWD},
  {0xF6, 1, &OpDispatchBuilder::PSADBW},
  {0xF7, 1, &OpDispatchBuilder::MASKMOVOp},
  {0xF8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i8Bit>},
  {0xF9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i16Bit>},
  {0xFA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i32Bit>},
  {0xFB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i64Bit>},
  {0xFC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i8Bit>},
  {0xFD, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i16Bit>},
  {0xFE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i32Bit>},

// The two entries below are compiled out when _WIN32 is defined.
#ifndef _WIN32
  // FEX reserved instructions
  {0x3E, 1, &OpDispatchBuilder::CallbackReturnOp},
  {0x3F, 1, &OpDispatchBuilder::ThunkOp},
#endif
};

constexpr DispatchTableEntry OpDispatch_SecondaryRepModTables[] = {
  {0x10, 2, &OpDispatchBuilder::MOVSSOp},
  {0x12, 1, &OpDispatchBuilder::VMOVSLDUPOp},
  {0x16, 1, &OpDispatchBuilder::VMOVSHDUPOp},
  {0x2A, 1, &OpDispatchBuilder::InsertCVTGPR_To_FPR},
  {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp},
  {0x2C, 1, &OpDispatchBuilder::CVTFPR_To_GPR},
  {0x2D, 1, &OpDispatchBuilder::CVTFPR_To_GPR},
  {0x51, 1, &OpDispatchBuilder::VectorScalarUnaryInsertALUOp},
  {0x52, 1, &OpDispatchBuilder::VectorScalarUnaryInsertALUOp},
  {0x53, 1, &OpDispatchBuilder::VectorScalarUnaryInsertALUOp},
  {0x58, 1, &OpDispatchBuilder::VectorScalarInsertALUOp},
  {0x59,
1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5A, 1, &OpDispatchBuilder::InsertScalar_CVT_Float_To_Float}, {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0x5C, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5D, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5E, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5F, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x6F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp}, {0x70, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSHUFWOp, false>}, {0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>}, {0x7F, 1, &OpDispatchBuilder::MOVVectorUnalignedOp}, {0xB8, 1, &OpDispatchBuilder::PopcountOp}, {0xBC, 1, &OpDispatchBuilder::TZCNT}, {0xBD, 1, &OpDispatchBuilder::LZCNT}, {0xC2, 1, &OpDispatchBuilder::InsertScalarFCMPOp}, {0xD6, 1, &OpDispatchBuilder::MOVQ2DQ}, {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, }; constexpr DispatchTableEntry OpDispatch_SecondaryRepNEModTables[] = { {0x10, 2, &OpDispatchBuilder::MOVSDOp}, {0x12, 1, &OpDispatchBuilder::MOVDDUPOp}, {0x2A, 1, &OpDispatchBuilder::InsertCVTGPR_To_FPR}, {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp}, {0x2C, 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {0x2D, 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {0x51, 1, &OpDispatchBuilder::VectorScalarUnaryInsertALUOp}, // x52 = Invalid {0x58, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x59, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5A, 1, &OpDispatchBuilder::InsertScalar_CVT_Float_To_Float}, {0x5C, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5D, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5E, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5F, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x70, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSHUFWOp, true>}, {0x78, 1, &OpDispatchBuilder::Insertq_imm}, {0x79, 1, &OpDispatchBuilder::Insertq}, {0x7C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, 
IR::OP_VFADDP, OpSize::i32Bit>}, {0x7D, 1, &OpDispatchBuilder::HSUBP}, {0xD0, 1, &OpDispatchBuilder::ADDSUBPOp}, {0xD6, 1, &OpDispatchBuilder::MOVQ2DQ}, {0xC2, 1, &OpDispatchBuilder::InsertScalarFCMPOp}, {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0xF0, 1, &OpDispatchBuilder::MOVVectorUnalignedOp}, }; constexpr DispatchTableEntry OpDispatch_SecondaryOpSizeModTables[] = { {0x10, 2, &OpDispatchBuilder::MOVVectorUnalignedOp}, {0x12, 2, &OpDispatchBuilder::MOVLPOp}, {0x14, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i64Bit>}, {0x15, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i64Bit>}, {0x16, 2, &OpDispatchBuilder::MOVHPDOp}, {0x28, 2, &OpDispatchBuilder::MOVVectorAlignedOp}, {0x2A, 1, &OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float}, {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp}, {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, {0x2E, 2, &OpDispatchBuilder::UCOMISxOp}, {0x50, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i64Bit>}, {0x51, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFSQRT, OpSize::i64Bit>}, {0x54, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VAND, OpSize::i128Bit>}, {0x55, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUROp, IR::OP_VANDN, OpSize::i64Bit>}, {0x56, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VOR, OpSize::i128Bit>}, {0x57, 1, &OpDispatchBuilder::VectorXOROp}, {0x58, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i64Bit>}, {0x59, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMUL, OpSize::i64Bit>}, {0x5A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Vector_CVT_Float_To_Float, OpSize::i32Bit, OpSize::i64Bit, false>}, {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0x5C, 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i64Bit>}, {0x5D, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i64Bit>}, {0x5E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFDIV, OpSize::i64Bit>}, {0x5F, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMAX, OpSize::i64Bit>}, {0x60, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i8Bit>}, {0x61, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i16Bit>}, {0x62, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i32Bit>}, {0x63, 1, &OpDispatchBuilder::PACKSSOp}, {0x64, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i8Bit>}, {0x65, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i16Bit>}, {0x66, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPGT, OpSize::i32Bit>}, {0x67, 1, &OpDispatchBuilder::PACKUSOp}, {0x68, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i8Bit>}, {0x69, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i16Bit>}, {0x6A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i32Bit>}, {0x6B, 1, &OpDispatchBuilder::PACKSSOp}, {0x6C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKLOp, OpSize::i64Bit>}, {0x6D, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PUNPCKHOp, OpSize::i64Bit>}, {0x6E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>}, {0x6F, 1, &OpDispatchBuilder::MOVVectorAlignedOp}, {0x70, 1, &OpDispatchBuilder::PSHUFDOp}, {0x74, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i8Bit>}, {0x75, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i16Bit>}, {0x76, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VCMPEQ, OpSize::i32Bit>}, {0x78, 
1, nullptr}, // GROUP 17 {0x79, 1, &OpDispatchBuilder::Extrq}, {0x7C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADDP, OpSize::i64Bit>}, {0x7D, 1, &OpDispatchBuilder::HSUBP}, {0x7E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::SSE>}, {0x7F, 1, &OpDispatchBuilder::MOVVectorAlignedOp}, {0xC2, 1, &OpDispatchBuilder::VFCMPOp}, {0xC4, 1, &OpDispatchBuilder::PINSROp}, {0xC5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i16Bit>}, {0xC6, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::SHUFOp, OpSize::i64Bit>}, {0xD0, 1, &OpDispatchBuilder::ADDSUBPOp}, {0xD1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i16Bit>}, {0xD2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i32Bit>}, {0xD3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRLDOp, OpSize::i64Bit>}, {0xD4, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i64Bit>}, {0xD5, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VMUL, OpSize::i16Bit>}, {0xD6, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::SSE>}, {0xD7, 1, &OpDispatchBuilder::MOVMSKOpOne}, // PMOVMSKB {0xD8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, OpSize::i8Bit>}, {0xD9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQSUB, OpSize::i16Bit>}, {0xDA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMIN, OpSize::i8Bit>}, {0xDB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VAND, OpSize::i128Bit>}, {0xDC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQADD, OpSize::i8Bit>}, {0xDD, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUQADD, OpSize::i16Bit>}, {0xDE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VUMAX, OpSize::i8Bit>}, {0xDF, 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUROp, IR::OP_VANDN, OpSize::i64Bit>}, {0xE0, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i8Bit>}, {0xE1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAOp, OpSize::i16Bit>}, {0xE2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSRAOp, OpSize::i32Bit>}, {0xE3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i16Bit>}, {0xE4, 1, &OpDispatchBuilder::PMULHW}, {0xE5, 1, &OpDispatchBuilder::PMULHW}, {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0xE7, 1, &OpDispatchBuilder::MOVVectorNTOp}, {0xE8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i8Bit>}, {0xE9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i16Bit>}, {0xEA, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMIN, OpSize::i16Bit>}, {0xEB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VOR, OpSize::i128Bit>}, {0xEC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQADD, OpSize::i8Bit>}, {0xED, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQADD, OpSize::i16Bit>}, {0xEE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSMAX, OpSize::i16Bit>}, {0xEF, 1, &OpDispatchBuilder::VectorXOROp}, {0xF1, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i16Bit>}, {0xF2, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i32Bit>}, {0xF3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PSLL, OpSize::i64Bit>}, {0xF4, 1, &OpDispatchBuilder::PMULLOp}, {0xF5, 1, &OpDispatchBuilder::PMADDWD}, {0xF6, 1, &OpDispatchBuilder::PSADBW}, {0xF7, 1, &OpDispatchBuilder::MASKMOVOp}, {0xF8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i8Bit>}, {0xF9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i16Bit>}, {0xFA, 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i32Bit>}, {0xFB, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSUB, OpSize::i64Bit>}, {0xFC, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i8Bit>}, {0xFD, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i16Bit>}, {0xFE, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VADD, OpSize::i32Bit>}, }; } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/VEXTables.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/OpcodeDispatcher.h" namespace FEXCore::IR { #define OPD(map_select, pp, opcode) (((map_select - 1) << 10) | (pp << 8) | (opcode)) constexpr DispatchTableEntry OpDispatch_VEXTable[] = { {OPD(2, 0b00, 0xF2), 1, &OpDispatchBuilder::ANDNBMIOp}, {OPD(2, 0b00, 0xF5), 1, &OpDispatchBuilder::BZHI}, {OPD(2, 0b10, 0xF5), 1, &OpDispatchBuilder::PEXT}, {OPD(2, 0b11, 0xF5), 1, &OpDispatchBuilder::PDEP}, {OPD(2, 0b11, 0xF6), 1, &OpDispatchBuilder::MULX}, {OPD(2, 0b00, 0xF7), 1, &OpDispatchBuilder::BEXTRBMIOp}, {OPD(2, 0b01, 0xF7), 1, &OpDispatchBuilder::BMI2Shift}, {OPD(2, 0b10, 0xF7), 1, &OpDispatchBuilder::BMI2Shift}, {OPD(2, 0b11, 0xF7), 1, &OpDispatchBuilder::BMI2Shift}, {OPD(3, 0b11, 0xF0), 1, &OpDispatchBuilder::RORX}, }; #undef OPD #define OPD(group, pp, opcode) (((group - X86Tables::InstType::TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode)) constexpr DispatchTableEntry OpDispatch_VEXGroupTable[] = { {OPD(X86Tables::InstType::TYPE_VEX_GROUP_17, 0, 0b001), 1, &OpDispatchBuilder::BLSRBMIOp}, {OPD(X86Tables::InstType::TYPE_VEX_GROUP_17, 0, 0b010), 1, &OpDispatchBuilder::BLSMSKBMIOp}, {OPD(X86Tables::InstType::TYPE_VEX_GROUP_17, 0, 0b011), 1, &OpDispatchBuilder::BLSIBMIOp}, }; #undef OPD } // namespace FEXCore::IR 
================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-to-ir, opcodes|dispatcher-implementations desc: Handles x86/64 Vector instructions to IR $end_info$ */ #include "Interface/Context/Context.h" #include "Interface/Core/OpcodeDispatcher.h" #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/IR/IR.h" #include #include #include #include #include #include #include #include namespace FEXCore::IR { #define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op void OpDispatchBuilder::MOVVectorAlignedOp(OpcodeArgs) { if (Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { // Nop return; } Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); StoreResultFPR(Op, Src); } void OpDispatchBuilder::MOVVectorUnalignedOp(OpcodeArgs) { if (Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { // Nop return; } Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit}); StoreResultFPR(Op, Src, OpSize::i8Bit); } void OpDispatchBuilder::MOVVectorNTOp(OpcodeArgs) { const auto Size = OpSizeFromDst(Op); if (Op->Dest.IsGPR() && Size >= OpSize::i128Bit) { ///< MOVNTDQA load non-temporal comes from SSE4.1 and is extended by AVX/AVX2. 
Ref SrcAddr = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.LoadData = false}); auto Src = _VLoadNonTemporal(Size, SrcAddr, 0); StoreResultFPR(Op, Src, OpSize::i8Bit, MemoryAccessType::STREAM); } else if (Op->Dest.IsGPR()) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit, .AccessType = MemoryAccessType::STREAM}); StoreResultFPR(Op, Src, OpSize::i8Bit, MemoryAccessType::STREAM); } else { LOGMAN_THROW_A_FMT(!Op->Dest.IsGPR(), "Destination can't be GPR for non-temporal stores"); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit, .AccessType = MemoryAccessType::STREAM}); if (Size < OpSize::i128Bit) { // Normal streaming store if less than 128-bit // XMM Scalar 32-bit and 64-bit comes from SSE4a MOVNTSS, MOVNTSD // MMX 64-bit comes from MOVNTQ StoreResultFPR(Op, Src, OpSize::i8Bit, MemoryAccessType::STREAM); } else { Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false}); // Single store non-temporal for larger operations. _VStoreNonTemporal(Size, Src, Dest, 0); } } } void OpDispatchBuilder::VMOVAPS_VMOVAPDOp(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); if (Is128Bit && Op->Dest.IsGPR()) { Src = VZeroExtendOperand(OpSize::i128Bit, Op->Src[0], Src); } StoreResultFPR(Op, Src); } void OpDispatchBuilder::VMOVUPS_VMOVUPDOp(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit}); if (Is128Bit && Op->Dest.IsGPR()) { Src = VZeroExtendOperand(OpSize::i128Bit, Op->Src[0], Src); } StoreResultFPR(Op, Src, OpSize::i8Bit); } void OpDispatchBuilder::MOVHPDOp(OpcodeArgs) { if (Op->Dest.IsGPR()) { if (Op->Src[0].IsGPR()) { // MOVLHPS between two vector registers. 
Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, OpSize::i128Bit, Op->Flags); auto Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 0, Dest, Src); StoreResultFPR(Op, Result); } else { // If the destination is a GPR then the source is memory // xmm1[127:64] = src Ref Src = MakeSegmentAddress(Op, Op->Src[0]); Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, OpSize::i128Bit, Op->Flags); auto Result = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Dest, 1, Src); StoreResultFPR(Op, Result); } } else { // In this case memory is the destination and the high bits of the XMM are source // Mem64 = xmm1[127:64] Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OpSize::i64Bit, Src, 1, Dest); } } void OpDispatchBuilder::VMOVHPOp(OpcodeArgs) { if (Op->Dest.IsGPR()) { Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i128Bit}); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags, {.Align = OpSize::i64Bit}); Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 0, Src1, Src2); StoreResultFPR(Op, Result); } else { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i128Bit}); Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Src, Src); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, OpSize::i64Bit, OpSize::i64Bit); } } void OpDispatchBuilder::MOVLPOp(OpcodeArgs) { if (Op->Dest.IsGPR()) { // xmm, xmm is movhlps special case if (Op->Src[0].IsGPR()) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i128Bit}); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags, {.Align = OpSize::i128Bit}); auto Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Dest, Src); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, OpSize::i128Bit, OpSize::i128Bit); } else { const auto DstSize = OpSizeFromDst(Op); Ref Src = MakeSegmentAddress(Op, Op->Src[0]); Ref Dest = 
LoadSourceFPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags); auto Result = _VLoadVectorElement(OpSize::i128Bit, OpSize::i64Bit, Dest, 0, Src); StoreResultFPR(Op, Result); } } else { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i64Bit}); StoreResultFPR_WithOpSize(Op, Op->Dest, Src, OpSize::i64Bit, OpSize::i64Bit); } } void OpDispatchBuilder::VMOVLPOp(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i128Bit}); if (!Op->Dest.IsGPR()) { ///< VMOVLPS/PD mem64, xmm1 StoreResultFPR_WithOpSize(Op, Op->Dest, Src1, OpSize::i64Bit, OpSize::i64Bit); } else if (!Op->Src[1].IsGPR()) { ///< VMOVLPS/PD xmm1, xmm2, mem64 // Bits[63:0] come from Src2[63:0] // Bits[127:64] come from Src1[127:64] Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags, {.Align = OpSize::i64Bit}); Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 1, Src2, Src1); StoreResultFPR(Op, Result); } else { ///< VMOVHLPS/PD xmm1, xmm2, xmm3 Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags, {.Align = OpSize::i128Bit}); Ref Result = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 1, Src1, Src2); StoreResultFPR(Op, Result); } } void OpDispatchBuilder::VMOVSHDUPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = _VTrn2(SrcSize, OpSize::i32Bit, Src, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VMOVSLDUPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = _VTrn(SrcSize, OpSize::i32Bit, Src, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::MOVScalarOpImpl(OpcodeArgs, IR::OpSize ElementSize) { if (Op->Dest.IsGPR() && Op->Src[0].IsGPR()) { // MOVSS/SD xmm1, xmm2 Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); auto Result = _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Dest, Src); StoreResultFPR(Op, Result); } else if 
(Op->Dest.IsGPR()) { // MOVSS/SD xmm1, mem32/mem64 // xmm1[127:0] <- zext(mem32/mem64) Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], ElementSize, Op->Flags); StoreResultFPR(Op, Src); } else { // MOVSS/SD mem32/mem64, xmm1 Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); StoreResultFPR_WithOpSize(Op, Op->Dest, Src, ElementSize); } } void OpDispatchBuilder::MOVSSOp(OpcodeArgs) { MOVScalarOpImpl(Op, OpSize::i32Bit); } void OpDispatchBuilder::MOVSDOp(OpcodeArgs) { MOVScalarOpImpl(Op, OpSize::i64Bit); } void OpDispatchBuilder::VMOVScalarOpImpl(OpcodeArgs, IR::OpSize ElementSize) { if (Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Src[1].IsGPR()) { // VMOVSS/SD xmm1, xmm2, xmm3 Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Src1, Src2); StoreResultFPR(Op, Result); } else if (Op->Dest.IsGPR()) { // VMOVSS/SD xmm1, mem32/mem64 Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[1], ElementSize, Op->Flags); StoreResultFPR(Op, Src); } else { // VMOVSS/SD mem32/mem64, xmm1 Ref Src = LoadSourceFPR(Op, Op->Src[1], Op->Flags); StoreResultFPR_WithOpSize(Op, Op->Dest, Src, ElementSize); } } void OpDispatchBuilder::VMOVSDOp(OpcodeArgs) { VMOVScalarOpImpl(Op, OpSize::i64Bit); } void OpDispatchBuilder::VMOVSSOp(OpcodeArgs) { VMOVScalarOpImpl(Op, OpSize::i32Bit); } void OpDispatchBuilder::VectorALUOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); DeriveOp(ALUOp, IROp, _VAdd(Size, ElementSize, Dest, Src)); StoreResultFPR(Op, ALUOp); } void OpDispatchBuilder::VectorXOROp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); // Special case for vector xor with itself being the optimal way for x86 to zero vector registers. 
if (Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { const auto ZeroRegister = LoadZeroVector(Size); StoreResultFPR(Op, ZeroRegister); return; } ///< Regular code path VectorALUOp(Op, OP_VXOR, Size); } void OpDispatchBuilder::AVXVectorALUOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); DeriveOp(ALUOp, IROp, _VAdd(Size, ElementSize, Src1, Src2)); StoreResultFPR(Op, ALUOp); } void OpDispatchBuilder::AVXVectorXOROp(OpcodeArgs) { // Special case for vector xor with itself being the optimal way for x86 to zero vector registers. if (Op->Src[0].IsGPR() && Op->Src[1].IsGPR() && Op->Src[0].Data.GPR.GPR == Op->Src[1].Data.GPR.GPR) { const auto DstSize = OpSizeFromDst(Op); const auto ZeroRegister = LoadZeroVector(DstSize); StoreResultFPR(Op, ZeroRegister); return; } ///< Regular code path AVXVectorALUOp(Op, OP_VXOR, OpSize::i128Bit); } void OpDispatchBuilder::VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); DeriveOp(ALUOp, IROp, _VAdd(Size, ElementSize, Src, Dest)); StoreResultFPR(Op, ALUOp); } Ref OpDispatchBuilder::VectorScalarInsertALUOpImpl(OpcodeArgs, IROps IROp, IR::OpSize DstSize, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits) { // We load the full vector width when dealing with a source vector, // so that we don't do any unnecessary zero extension to the scalar // element that we're going to operate on. 
const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, DstSize, Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true}); // If OpSize == ElementSize then it only does the lower scalar op DeriveOp(ALUOp, IROp, _VFAddScalarInsert(DstSize, ElementSize, Src1, Src2, ZeroUpperBits)); return ALUOp; } template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs) { const auto DstSize = GetGuestVectorLength(); auto Result = VectorScalarInsertALUOpImpl(Op, IROp, DstSize, ElementSize, Op->Dest, Op->Src[0], false); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize); } template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::VectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs) { const auto DstSize = GetGuestVectorLength(); auto Result = VectorScalarInsertALUOpImpl(Op, IROp, DstSize, ElementSize, Op->Src[0], Op->Src[1], true); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize); } template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void 
OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); template void OpDispatchBuilder::AVXVectorScalarInsertALUOp(OpcodeArgs); Ref OpDispatchBuilder::VectorScalarUnaryInsertALUOpImpl(OpcodeArgs, IROps IROp, IR::OpSize DstSize, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits) { // We load the full vector width when dealing with a source vector, // so that we don't do any unnecessary zero extension to the scalar // element that we're going to operate on. 
const auto SrcSize = OpSizeFromSrc(Op);
  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, DstSize, Op->Flags);
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  // If OpSize == ElementSize then it only does the lower scalar op
  DeriveOp(ALUOp, IROp, _VFSqrtScalarInsert(DstSize, ElementSize, Src1, Src2, ZeroUpperBits));
  return ALUOp;
}

// NOTE(review): throughout this section the `template` parameter lists and the
// explicit-instantiation argument lists are not visible in this text (e.g. IROp
// and ElementSize are used below without a visible declaration, and the
// instantiation lines are textually identical). Confirm against the upstream file.

// Two-operand form: Op->Dest is passed as the first source and Op->Src[0] as the
// second, with ZeroUpperBits = false. The result is stored at the guest vector length.
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  auto Result = VectorScalarInsertALUOpImpl(Op, IROp, DstSize, ElementSize, Op->Dest, Op->Src[0], false);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::VectorScalarUnaryInsertALUOp(OpcodeArgs);

// Three-operand form: Op->Src[0] and Op->Src[1] are the two sources and
// ZeroUpperBits = true. The result is stored at the guest vector length.
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  auto Result = VectorScalarInsertALUOpImpl(Op, IROp, DstSize, ElementSize, Op->Src[0], Op->Src[1], true);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp(OpcodeArgs);

// Converts signed 32-bit integer elements of the source to float and inserts them
// into the destination vector via _VSToFVectorInsert. A register source is loaded
// as 64 bits; otherwise the operand's own source size is used.
void OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) {
  // We load the full vector width when dealing with a source vector,
  // so that we don't do any unnecessary zero extension to the scalar
  // element that we're going to operate on.
  const auto DstSize = GetGuestVectorLength();
  const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i64Bit : OpSizeFromSrc(Op);

  Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags);
  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);

  // Always 32-bit.
  const auto ElementSize = OpSize::i32Bit;
  // Always signed
  Dest = _VSToFVectorInsert(DstSize, ElementSize, ElementSize, Dest, Src, true, false);

  StoreResultFPR_WithOpSize(Op, Op->Dest, Dest, DstSize);
}

// Shared implementation for the integer -> float scalar insert conversions.
// Picks one of three paths depending on where the integer source lives and
// whether its size matches the destination element size; returns the IR value
// holding Src1 with the converted scalar inserted.
Ref OpDispatchBuilder::InsertCVTGPR_To_FPRImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize DstElementSize,
                                               const X86Tables::DecodedOperand& Src1Op,
                                               const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits) {
  // We load the full vector width when dealing with a source vector,
  // so that we don't do any unnecessary zero extension to the scalar
  // element that we're going to operate on.
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, DstSize, Op->Flags);

  if (Src2Op.IsGPR()) {
    // If the source is a GPR then convert directly from the GPR.
    auto Src2 = LoadSourceGPR_WithOpSize(Op, Src2Op, GetGPROpSize(), Op->Flags);
    return _VSToFGPRInsert(DstSize, DstElementSize, SrcSize, Src1, Src2, ZeroUpperBits);
  } else if (SrcSize != DstElementSize) {
    // If the source is from memory but the Source size and destination size aren't the same,
    // then it is more optimal to load in to a GPR and convert between GPR->FPR.
    // ARM GPR->FPR conversion supports different size source and destinations while FPR->FPR doesn't.
    auto Src2 = LoadSourceGPR(Op, Src2Op, Op->Flags);
    return _VSToFGPRInsert(DstSize, DstElementSize, SrcSize, Src1, Src2, ZeroUpperBits);
  }

  // In the case of cvtsi2s{s,d} where the source and destination are the same size,
  // then it is more optimal to load in to the FPR register directly and convert there.
  auto Src2 = LoadSourceFPR(Op, Src2Op, Op->Flags);
  // Always signed
  return _VSToFVectorInsert(DstSize, DstElementSize, DstElementSize, Src1, Src2, false, ZeroUpperBits);
}

// Two-operand form: Op->Dest is the vector source, Op->Src[0] the integer source,
// ZeroUpperBits = false.
template void OpDispatchBuilder::InsertCVTGPR_To_FPR(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  auto Result = InsertCVTGPR_To_FPRImpl(Op, DstSize, DstElementSize, Op->Dest, Op->Src[0], false);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::InsertCVTGPR_To_FPR(OpcodeArgs);
template void OpDispatchBuilder::InsertCVTGPR_To_FPR(OpcodeArgs);

// Three-operand form: Op->Src[0] is the vector source, Op->Src[1] the integer
// source, ZeroUpperBits = true.
template void OpDispatchBuilder::AVXInsertCVTGPR_To_FPR(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  Ref Result = InsertCVTGPR_To_FPRImpl(Op, DstSize, DstElementSize, Op->Src[0], Op->Src[1], true);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::AVXInsertCVTGPR_To_FPR(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertCVTGPR_To_FPR(OpcodeArgs);

// Shared implementation for scalar float -> float conversion with insert.
// Src2 is loaded as a full 128-bit vector when it is a register, otherwise as
// exactly SrcElementSize; its upper bits are allowed to be garbage.
Ref OpDispatchBuilder::InsertScalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize DstElementSize,
                                                           IR::OpSize SrcElementSize,
                                                           const X86Tables::DecodedOperand& Src1Op,
                                                           const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits) {
  // We load the full vector width when dealing with a source vector,
  // so that we don't do any unnecessary zero extension to the scalar
  // element that we're going to operate on.
  const auto SrcSize = Src2Op.IsGPR() ? OpSize::i128Bit : SrcElementSize;

  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, DstSize, Op->Flags);
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  return _VFToFScalarInsert(DstSize, DstElementSize, SrcElementSize, Src1, Src2, ZeroUpperBits);
}

// Two-operand form: sources are Op->Dest and Op->Src[0], ZeroUpperBits = false.
template void OpDispatchBuilder::InsertScalar_CVT_Float_To_Float(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  Ref Result = InsertScalar_CVT_Float_To_FloatImpl(Op, DstSize, DstElementSize, SrcElementSize, Op->Dest, Op->Src[0], false);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::InsertScalar_CVT_Float_To_Float(OpcodeArgs);
template void OpDispatchBuilder::InsertScalar_CVT_Float_To_Float(OpcodeArgs);

// Three-operand form: sources are Op->Src[0] and Op->Src[1], ZeroUpperBits = true.
template void OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float(OpcodeArgs) {
  const auto DstSize = GetGuestVectorLength();
  Ref Result = InsertScalar_CVT_Float_To_FloatImpl(Op, DstSize, DstElementSize, SrcElementSize, Op->Src[0], Op->Src[1], true);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float(OpcodeArgs);

// Decodes a rounding-control immediate into a RoundMode.
// Bit 2 set selects RoundMode::Host; otherwise bits [1:0] index the table below
// (0 = Nearest, 1 = NegInfinity, 2 = PosInfinity, 3 = TowardsZero).
RoundMode OpDispatchBuilder::TranslateRoundType(uint8_t Mode) {
  const uint64_t RoundControlSource = (Mode >> 2) & 1;
  uint64_t RoundControl = Mode & 0b11;

  static constexpr std::array SourceModes = {
    RoundMode::Nearest,
    RoundMode::NegInfinity,
    RoundMode::PosInfinity,
    RoundMode::TowardsZero,
  };

  return RoundControlSource ? RoundMode::Host : SourceModes[RoundControl];
}

// Shared implementation for the scalar round-and-insert handlers: rounds the
// scalar in Src2 using the mode decoded from `Mode` and inserts it into Src1
// via _VFToIScalarInsert.
Ref OpDispatchBuilder::InsertScalarRoundImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize ElementSize,
                                             const X86Tables::DecodedOperand& Src1Op,
                                             const X86Tables::DecodedOperand& Src2Op, uint64_t Mode, bool ZeroUpperBits) {
  // We load the full vector width when dealing with a source vector,
  // so that we don't do any unnecessary zero extension to the scalar
  // element that we're going to operate on.
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, DstSize, Op->Flags);
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Src2Op, SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  const auto SourceMode = TranslateRoundType(Mode);
  auto ALUOp = _VFToIScalarInsert(DstSize, ElementSize, Src1, Src2, SourceMode, ZeroUpperBits);

  return ALUOp;
}

// Two-operand form: the rounding immediate is Op->Src[1]; sources are Op->Dest
// and Op->Src[0]; ZeroUpperBits = false.
template void OpDispatchBuilder::InsertScalarRound(OpcodeArgs) {
  const uint64_t Mode = Op->Src[1].Literal();
  const auto DstSize = GetGuestVectorLength();

  Ref Result = InsertScalarRoundImpl(Op, DstSize, ElementSize, Op->Dest, Op->Src[0], Mode, false);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::InsertScalarRound(OpcodeArgs);
template void OpDispatchBuilder::InsertScalarRound(OpcodeArgs);

// Three-operand form: the rounding immediate is Op->Src[2]; sources are
// Op->Src[0] and Op->Src[1]; ZeroUpperBits = true.
template void OpDispatchBuilder::AVXInsertScalarRound(OpcodeArgs) {
  const uint64_t Mode = Op->Src[2].Literal();
  const auto DstSize = GetGuestVectorLength();

  Ref Result = InsertScalarRoundImpl(Op, DstSize, ElementSize, Op->Src[0], Op->Src[1], Mode, true);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::AVXInsertScalarRound(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalarRound(OpcodeArgs);

// Emits a scalar floating-point compare whose result is inserted into element 0
// of Src1, selected by the compare predicate in CompType.
//
// Predicates that map directly onto a FloatCompareOp (EQ, LT, LE, UNO, NEQ, ORD)
// use the fused _VFCMPScalarInsert op. The remaining predicates build the
// comparison result from separate compare/logic ops and then insert element 0
// of that result into Src1 with _VInsElement; note that ZeroUpperBits is only
// forwarded on the _VFCMPScalarInsert paths.
//
// NOTE(review): the target type of the static_cast in the switch is not visible
// in this text; the case labels are all VectorCompareType enumerators.
Ref OpDispatchBuilder::InsertScalarFCMPOpImpl(OpSize Size, IR::OpSize OpDstSize, IR::OpSize ElementSize, Ref Src1, Ref Src2,
                                              uint8_t CompType, bool ZeroUpperBits) {
  switch (static_cast(CompType)) {
  case VectorCompareType::EQ_OQ:
  case VectorCompareType::EQ_OS: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::EQ, ZeroUpperBits);
  case VectorCompareType::LT_OS: // GT(Swapped operand)
  case VectorCompareType::LT_OQ: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::LT, ZeroUpperBits);
  case VectorCompareType::LE_OS: // GE(Swapped operand)
  case VectorCompareType::LE_OQ: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::LE, ZeroUpperBits);
  case VectorCompareType::UNORD_Q:
  case VectorCompareType::UNORD_S: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::UNO, ZeroUpperBits);
  case VectorCompareType::NEQ_UQ:
  case VectorCompareType::NEQ_US: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::NEQ, ZeroUpperBits);
  case VectorCompareType::NLT_US: // NGT(Swapped operand)
  case VectorCompareType::NLT_UQ: {
    // Not-less-than: compute LT and invert it.
    Ref Result = _VFCMPLT(ElementSize, ElementSize, Src1, Src2);
    Result = _VNot(ElementSize, ElementSize, Result);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::NLE_US: // NGE(Swapped operand)
  case VectorCompareType::NLE_UQ: {
    // Not-less-or-equal: compute LE and invert it.
    Ref Result = _VFCMPLE(ElementSize, ElementSize, Src1, Src2);
    Result = _VNot(ElementSize, ElementSize, Result);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::ORD_Q:
  case VectorCompareType::ORD_S: return _VFCMPScalarInsert(Size, ElementSize, Src1, Src2, FloatCompareOp::ORD, ZeroUpperBits);
  case VectorCompareType::NGT_UQ:
  case VectorCompareType::NGT_US: {
    // Not-greater-than: LT with the operands swapped, then inverted.
    Ref Result = _VFCMPLT(ElementSize, ElementSize, Src2, Src1);
    Result = _VNot(ElementSize, ElementSize, Result);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::NGE_UQ:
  case VectorCompareType::NGE_US: {
    // Not-greater-or-equal: LE with the operands swapped, then inverted.
    Ref Result = _VFCMPLE(ElementSize, ElementSize, Src2, Src1);
    Result = _VNot(ElementSize, ElementSize, Result);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::GT_OQ:
  case VectorCompareType::GT_OS: {
    // Greater-than: LT with the operands swapped.
    Ref Result = _VFCMPLT(ElementSize, ElementSize, Src2, Src1);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::GE_OQ:
  case VectorCompareType::GE_OS: {
    // Greater-or-equal: LE with the operands swapped.
    Ref Result = _VFCMPLE(ElementSize, ElementSize, Src2, Src1);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::EQ_UQ:
  case VectorCompareType::EQ_US: {
    // If either of the sources are unordered, then returns true.
    // Self-compares are used to detect ordered (non-NaN) inputs.
    Ref Src1_U = _VFCMPEQ(Size, ElementSize, Src1, Src1);
    Ref Src2_U = _VFCMPEQ(Size, ElementSize, Src2, Src2);
    auto Ordered = _VAnd(Size, ElementSize, Src1_U, Src2_U);

    Ref Compare_Ordered = _VFCMPEQ(Size, ElementSize, Src1, Src2);
    Ref Result = _VOrn(Size, ElementSize, Compare_Ordered, Ordered);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  case VectorCompareType::NEQ_OQ:
  case VectorCompareType::NEQ_OS: {
    // If either of the sources are unordered, then returns false.
    Ref Src1_U = _VFCMPEQ(Size, ElementSize, Src1, Src1);
    Ref Src2_U = _VFCMPEQ(Size, ElementSize, Src2, Src2);

    Ref Compare_Ordered = _VFCMPEQ(Size, ElementSize, Src1, Src2);
    Ref Result = _VAndn(Size, ElementSize, Src1_U, Compare_Ordered);
    Result = _VAnd(Size, ElementSize, Result, Src2_U);
    // Insert the lower bits
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, Result);
  }
  // Constant-false: insert an all-zero element.
  case VectorCompareType::FALSE_OQ:
  case VectorCompareType::FALSE_OS: return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, LoadZeroVector(OpSize::i128Bit));
  // Constant-true: insert an element taken from a vector of 0xFF bytes.
  case VectorCompareType::TRUE_UQ:
  case VectorCompareType::TRUE_US:
    return _VInsElement(OpDstSize, ElementSize, 0, 0, Src1, _VectorImm(OpSize::i128Bit, OpSize::i8Bit, -1, 0));
  }
  FEX_UNREACHABLE;
}

// Two-operand scalar compare: the predicate immediate is Op->Src[1], masked to
// its low three bits; sources are Op->Dest and Op->Src[0].
template void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs) {
  const uint8_t CompType = Op->Src[1].Literal();
  const auto DstSize = GetGuestVectorLength();
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags);
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType & 0b111, false);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs);
template void OpDispatchBuilder::InsertScalarFCMPOp(OpcodeArgs);

// Three-operand scalar compare: the predicate immediate is Op->Src[2], masked to
// its low five bits; sources are Op->Src[0] and Op->Src[1].
template void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs) {
  const uint8_t CompType = Op->Src[2].Literal();
  const auto DstSize = GetGuestVectorLength();
  const auto SrcSize = OpSizeFromSrc(Op);

  // We load the full vector width when dealing with a source vector,
  // so that we don't do any unnecessary zero extension to the scalar
  // element that we're going to operate on.
  Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], DstSize, Op->Flags);
  Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags, {.AllowUpperGarbage = true});

  Ref Result = InsertScalarFCMPOpImpl(DstSize, OpSizeFromDst(Op), ElementSize, Src1, Src2, CompType & 0b11111, true);
  StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize);
}

template void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs);
template void OpDispatchBuilder::AVXInsertScalarFCMPOp(OpcodeArgs);

// 3DNow! reciprocal square root on 32-bit elements.
// When Duplicate is set, element 0 of the result is broadcast to every element.
void OpDispatchBuilder::RSqrt3DNowOp(OpcodeArgs, bool Duplicate) {
  const auto Size = OpSizeFromSrc(Op);
  const auto ElementSize = OpSize::i32Bit;

  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], Size, Op->Flags);

  // For the sqrt reciprocal in 3DNow!, if the source is negative,
  // then the result has the same sign as the source but the result is always calculated
  // as if the source was positive.
  Ref AbsSrc = _VFAbs(Size, ElementSize, Src);
  Ref PosRSqrt = _VFRSqrtPrecision(Size, ElementSize, AbsSrc);
  Ref Result = _VFCopySign(Size, ElementSize, PosRSqrt, Src);

  if (Duplicate) {
    Result = _VDupElement(Size, ElementSize, Result, 0);
  }

  StoreResultFPR(Op, Result);
}

// Emits a unary vector op on Op->Src[0] and stores the result.
// NOTE(review): DeriveOp presumably retargets the emitted _VFSqrt node to IROp — confirm.
void OpDispatchBuilder::VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
  // In the event of a scalar operation and a vector source, then
  // we can specify the entire vector length in order to avoid
  // unnecessary sign extension on the element to be operated on.
  // In the event of a memory operand, we load the exact element size.
  const auto Size = OpSizeFromSrc(Op);

  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], Size, Op->Flags);

  DeriveOp(ALUOp, IROp, _VFSqrt(Size, ElementSize, Src));

  StoreResultFPR(Op, ALUOp);
}

// AVX counterpart of the unary vector op; same sequence, operating at the
// source operand size.
void OpDispatchBuilder::AVXVectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
  // In the event of a scalar operation and a vector source, then
  // we can specify the entire vector length in order to avoid
  // unnecessary sign extension on the element to be operated on.
  // In the event of a memory operand, we load the exact element size.
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);

  DeriveOp(ALUOp, IROp, _VFSqrt(SrcSize, ElementSize, Src));

  // NOTE: We don't need to clear the upper lanes here, since the
  //       IR ops make use of 128-bit AdvSimd for 128-bit cases,
  //       which, on hardware with SVE, zero-extends as part of
  //       storing into the destination.
  StoreResultFPR(Op, ALUOp);
}

// Performs the unary op on a single element (op size == ElementSize) and then
// broadcasts element 0 of the result across the full source size.
void OpDispatchBuilder::VectorUnaryDuplicateOpImpl(OpcodeArgs, IROps IROp, IR::OpSize ElementSize) {
  const auto Size = OpSizeFromSrc(Op);

  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  DeriveOp(ALUOp, IROp, _VFSqrt(ElementSize, ElementSize, Src));

  // Duplicate the lower bits
  auto Result = _VDupElement(Size, ElementSize, ALUOp, 0);
  StoreResultFPR(Op, Result);
}

// Template entry point that forwards to VectorUnaryDuplicateOpImpl.
template void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs) {
  VectorUnaryDuplicateOpImpl(Op, IROp, ElementSize);
}

// TODO: there's only one instantiation of this template. Let's remove it.
template void OpDispatchBuilder::VectorUnaryDuplicateOp(OpcodeArgs);

// MOVQ between vector registers / memory.
// A register source is loaded as a full 128-bit vector; otherwise the operand's
// source size is used.
void OpDispatchBuilder::MOVQOp(OpcodeArgs, VectorOpType VectorType) {
  const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op);
  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);

  // This instruction is a bit special that if the destination is a register then it'll ZEXT the 64bit source to 128bit
  if (Op->Dest.IsGPR()) {
    // Translate the destination register id into an XMM register index.
    const auto gpr = Op->Dest.Data.GPR.GPR;
    const auto gprIndex = gpr - X86State::REG_XMM_0;

    auto Reg = VZeroExtendOperand(OpSize::i64Bit, Op->Src[0], Src);
    StoreXMMRegister_WithAVXInsert(VectorType, gprIndex, Reg);
  } else {
    // This is simple, just store the result
    StoreResultFPR(Op, Src);
  }
}

// MMX MOVQ. Calls ChgStateX87_MMX() first when MMXState is still MMXState_X87.
void OpDispatchBuilder::MOVQMMXOp(OpcodeArgs) {
  // Partial store into bottom 64-bits, leave the upper bits unaffected.
  if (MMXState == MMXState_X87) {
    ChgStateX87_MMX();
  }
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit});
  StoreResultFPR(Op, Src, OpSize::i8Bit);
}

// Gathers the sign bit of every ElementSize element of the source into the low
// bits of a GPR. The 128-bit/64-bit-element and 128-bit/32-bit-element cases
// have dedicated sequences; everything else uses a per-element extract loop.
void OpDispatchBuilder::MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize) {
  const auto Size = OpSizeFromSrc(Op);
  const auto NumElements = IR::NumElements(Size, ElementSize);

  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  if (Size == OpSize::i128Bit && ElementSize == OpSize::i64Bit) {
    // UnZip2 the 64-bit elements as 32-bit to get the sign bits closer.
    // Sign bits are now in bit positions 31 and 63 after this.
    Src = _VUnZip2(Size, OpSize::i32Bit, Src, Src);

    // Extract the low 64-bits to GPR in one move.
    Ref GPR = _VExtractToGPR(Size, OpSize::i64Bit, Src, 0);
    // BFI the sign bit in 31 in to 62.
    // Inserting the full lower 32-bits at offset 31 so that sign bit ends up at
    // offset 62, directly below the other sign bit which stays at offset 63.
    GPR = _Bfi(OpSize::i64Bit, 32, 31, GPR, GPR);
    // Shift right to only get the two sign bits we care about.
    GPR = _Lshr(OpSize::i64Bit, GPR, Constant(62));
    StoreResultGPR_WithOpSize(Op, Op->Dest, GPR, GetGPROpSize());
  } else if (Size == OpSize::i128Bit && ElementSize == OpSize::i32Bit) {
    // Shift all the sign bits to the bottom of their respective elements.
    Src = _VUShrI(Size, OpSize::i32Bit, Src, 31);
    // Load the specific 128-bit movmskps shift elements operator.
    auto ConstantUSHL = LoadAndCacheNamedVectorConstant(Size, NAMED_VECTOR_MOVMSKPS_SHIFT);
    // Shift the sign bits in to specific locations.
    Src = _VUShl(Size, OpSize::i32Bit, Src, ConstantUSHL, false);
    // Add across the vector so the sign bits will end up in bits [3:0]
    Src = _VAddV(Size, OpSize::i32Bit, Src);
    // Extract to a GPR.
    Ref GPR = _VExtractToGPR(Size, OpSize::i32Bit, Src, 0);
    StoreResultGPR_WithOpSize(Op, Op->Dest, GPR, GetGPROpSize());
  } else {
    // Generic path: one extract + bitfield-extract per element, OR-ed together.
    Ref CurrentVal = Constant(0);

    for (unsigned i = 0; i < NumElements; ++i) {
      // Extract the top bit of the element
      Ref Tmp = _VExtractToGPR(Size, ElementSize, Src, i);
      Tmp = _Bfe(ElementSize, 1, IR::OpSizeAsBits(ElementSize) - 1, Tmp);

      // Shift it to the correct location and or it with the current value
      if (i != 0) {
        CurrentVal = _Orlshl(OpSize::i64Bit, CurrentVal, Tmp, i);
      } else {
        CurrentVal = Tmp;
      }
    }
    StoreResultGPR(Op, CurrentVal);
  }
}

// Byte-granular sign-mask extraction: compares every byte against zero, ANDs the
// result with a named per-byte mask constant, collapses it with three pairwise
// adds, and extracts the low 16 bits (32 bits for a 256-bit source) to a GPR.
void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto Is256Bit = SrcSize == OpSize::i256Bit;
  const auto ExtractSize = Is256Bit ? OpSize::i32Bit : OpSize::i16Bit;

  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref VMask = LoadAndCacheNamedVectorConstant(SrcSize, NAMED_VECTOR_MOVMASKB);

  auto VCMP = _VCMPLTZ(SrcSize, OpSize::i8Bit, Src);
  auto VAnd = _VAnd(SrcSize, OpSize::i8Bit, VCMP, VMask);

  // Since we also handle the MM MOVMSKB here too,
  // we need to clamp the lower bound.
  const auto VAdd1Size = std::max(SrcSize, OpSize::i128Bit);
  const auto VAdd2Size = std::max(SrcSize >> 1, OpSize::i64Bit);

  auto VAdd1 = _VAddP(VAdd1Size, OpSize::i8Bit, VAnd, VAnd);
  auto VAdd2 = _VAddP(VAdd2Size, OpSize::i8Bit, VAdd1, VAdd1);
  auto VAdd3 = _VAddP(OpSize::i64Bit, OpSize::i8Bit, VAdd2, VAdd2);

  auto Result = _VExtractToGPR(SrcSize, ExtractSize, VAdd3, 0);

  StoreResultGPR(Op, Result);
}

// Unpack-low: a single _VZip of the destination with the source.
void OpDispatchBuilder::PUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
  const auto Size = OpSizeFromSrc(Op);

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  auto ALUOp = _VZip(Size, ElementSize, Dest, Src);
  StoreResultFPR(Op, ALUOp);
}

// Three-operand unpack-low. For 128-bit this is a single _VZip; for wider sources
// the result is assembled from _VZip and _VZip2 with a 128-bit element insert.
void OpDispatchBuilder::VPUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto Is128Bit = SrcSize == OpSize::i128Bit;

  Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags);

  Ref Result {};
  if (Is128Bit) {
    Result = _VZip(SrcSize, ElementSize, Src1, Src2);
  } else {
    Ref ZipLo = _VZip(SrcSize, ElementSize, Src1, Src2);
    Ref ZipHi = _VZip2(SrcSize, ElementSize, Src1, Src2);
    // 128-bit element insert: (dest index 1, src index 0) of ZipHi into ZipLo.
    Result = _VInsElement(SrcSize, OpSize::i128Bit, 1, 0, ZipLo, ZipHi);
  }

  StoreResultFPR(Op, Result);
}

// Unpack-high: a single _VZip2 of the destination with the source.
void OpDispatchBuilder::PUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
  const auto Size = OpSizeFromSrc(Op);
  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  auto ALUOp = _VZip2(Size, ElementSize, Dest, Src);
  StoreResultFPR(Op, ALUOp);
}

// Three-operand unpack-high. For 128-bit this is a single _VZip2; for wider
// sources the result is assembled from _VZip and _VZip2 with a 128-bit element insert.
void OpDispatchBuilder::VPUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto Is128Bit = SrcSize == OpSize::i128Bit;

  Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags);

  Ref Result {};
  if (Is128Bit) {
    Result = _VZip2(SrcSize, ElementSize, Src1, Src2);
  } else {
    Ref ZipLo = _VZip(SrcSize, ElementSize, Src1, Src2);
    Ref ZipHi = _VZip2(SrcSize, ElementSize, Src1, Src2);
    // 128-bit element insert: (dest index 0, src index 1) of ZipLo into ZipHi.
    Result = _VInsElement(SrcSize, OpSize::i128Bit, 0, 1, ZipHi, ZipLo);
  }

  StoreResultFPR(Op, Result);
}

// Builds the per-byte AND mask applied to PSHUFB indices before the table lookup:
// 0b1000'0111 for a 64-bit source, 0b1000'1111 otherwise.
Ref OpDispatchBuilder::GeneratePSHUFBMask(IR::OpSize SrcSize) {
  // PSHUFB doesn't 100% match VTBL behaviour
  // VTBL will set the element zero if the index is greater than
  // the number of elements in the array
  //
  // Bit 7 is the only bit that is supposed to set elements to zero with PSHUFB
  // Mask the selection bits and top bit correctly
  // Bits [6:4] is reserved for 128-bit/256-bit
  // Bits [6:3] is reserved for 64-bit
  const uint8_t MaskImm = SrcSize == OpSize::i64Bit ? 0b1000'0111 : 0b1000'1111;

  return _VectorImm(SrcSize, OpSize::i8Bit, MaskImm);
}

// Byte shuffle via table lookup. Src2 holds the indices, which are masked with
// MaskVector first; Src1 is the table. The 256-bit case performs two 128-bit
// lookups and recombines them.
Ref OpDispatchBuilder::PSHUFBOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, Ref MaskVector) {
  const auto Is256Bit = SrcSize == OpSize::i256Bit;

  // We perform the 256-bit version as two 128-bit operations due to
  // the lane splitting behavior, so cap the maximum size at 16.
  const auto SanitizedSrcSize = std::min(SrcSize, OpSize::i128Bit);

  Ref MaskedIndices = _VAnd(SrcSize, SrcSize, Src2, MaskVector);

  Ref Low = _VTBL1(SanitizedSrcSize, Src1, MaskedIndices);
  if (!Is256Bit) {
    return Low;
  }

  // Second lookup uses a copy of Src1 whose 128-bit element 0 is replaced by its element 1.
  Ref HighSrc1 = _VInsElement(SrcSize, OpSize::i128Bit, 0, 1, Src1, Src1);
  Ref High = _VTBL1(SanitizedSrcSize, HighSrc1, MaskedIndices);
  return _VInsElement(SrcSize, OpSize::i128Bit, 1, 0, Low, High);
}

// Two-operand PSHUFB: table is Op->Dest, indices are Op->Src[0].
void OpDispatchBuilder::PSHUFBOp(OpcodeArgs) {
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
  StoreResultFPR(Op, Result);
}

// Three-operand PSHUFB: table is Op->Src[0], indices are Op->Src[1].
void OpDispatchBuilder::VPSHUFBOp(OpcodeArgs) {
  const auto SrcSize = OpSizeFromSrc(Op);

  Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags);

  Ref Result = PSHUFBOpImpl(SrcSize, Src1, Src2, GeneratePSHUFBMask(SrcSize));
  StoreResultFPR(Op, Result);
}

// Shuffles four 16-bit elements of IncomingLane according to the 8-bit Shuffle
// immediate. The identity and single-element-broadcast immediates are special
// cased; everything else goes through a table lookup whose index vector is
// loaded from IndexConstant at offset Shuffle * 16.
// LowLane selects whether the shuffled elements are the first or second half of
// the 16-bit elements in Size.
Ref OpDispatchBuilder::PShufWLane(IR::OpSize Size, FEXCore::IR::IndexNamedVectorConstant IndexConstant, bool LowLane,
                                  Ref IncomingLane, uint8_t Shuffle) {
  constexpr auto IdentityCopy = 0b11'10'01'00;

  const bool Is128BitLane = Size == OpSize::i128Bit;
  const auto NumElements = IR::NumElements(Size, IR::OpSize::i16Bit);
  const auto HalfNumElements = NumElements >> 1;

  // TODO: There can be more optimized copies here.
  switch (Shuffle) {
  case IdentityCopy: {
    // Special case identity copy.
    return IncomingLane;
  }
  case 0b00'00'00'00:
  case 0b01'01'01'01:
  case 0b10'10'10'10:
  case 0b11'11'11'11: {
    // Special case element duplicate and broadcast to low or high 64-bits.
    Ref Dup = _VDupElement(Size, OpSize::i16Bit, IncomingLane, (LowLane ? 0 : HalfNumElements) + (Shuffle & 0b11));
    if (Is128BitLane) {
      if (LowLane) {
        // DUP goes low.
        // Source goes high.
        Dup = _VTrn2(Size, OpSize::i64Bit, Dup, IncomingLane);
      } else {
        // DUP goes high.
        // Source goes low.
        Dup = _VTrn(Size, OpSize::i64Bit, IncomingLane, Dup);
      }
    }

    return Dup;
  }
  default: {
    // PSHUFLW needs to scale index by 16.
    // PSHUFHW needs to scale index by 16.
    // PSHUFW (mmx) also needs to scale by 16 to get correct low element.
    auto LookupIndexes = LoadAndCacheIndexedNamedVectorConstant(Size, IndexConstant, Shuffle * 16);
    return _VTBL1(Size, IncomingLane, LookupIndexes);
  }
  }
}

// Word shuffle that always uses the PSHUFLW index table and LowLane = true.
// NOTE(review): the name suggests the 8-byte (MMX) form — confirm with the opcode tables.
void OpDispatchBuilder::PSHUFW8ByteOp(OpcodeArgs) {
  uint16_t Shuffle = Op->Src[1].Data.Literal.Value;
  const auto Size = OpSizeFromSrc(Op);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Dest = PShufWLane(Size, FEXCore::IR::INDEXED_NAMED_VECTOR_PSHUFLW, true, Src, Shuffle);
  StoreResultFPR(Op, Dest);
}

// Word shuffle of the low or high half; `Low` selects both the index table
// (PSHUFLW vs PSHUFHW) and the lane passed to PShufWLane.
void OpDispatchBuilder::PSHUFWOp(OpcodeArgs, bool Low) {
  uint16_t Shuffle = Op->Src[1].Data.Literal.Value;
  const auto Size = OpSizeFromSrc(Op);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  const auto IndexedVectorConstant = Low ? FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW :
                                           FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW;

  Ref Dest = PShufWLane(Size, IndexedVectorConstant, Low, Src, Shuffle);
  StoreResultFPR(Op, Dest);
}

// Shuffles the four 32-bit elements of a 128-bit vector according to the 8-bit
// Shuffle immediate (two selector bits per destination element, element 0 in the
// low bits). A number of immediates are special cased with short zip/unzip/
// extract/reverse sequences; all remaining immediates fall back to a table
// lookup indexed by Shuffle * 16 into the PSHUFD named constant.
Ref OpDispatchBuilder::Single128Bit4ByteVectorShuffle(Ref Src, uint8_t Shuffle) {
  constexpr auto IdentityCopy = 0b11'10'01'00;

  // TODO: There can be more optimized copies here.
  switch (Shuffle) {
  case IdentityCopy: {
    // Special case identity copy.
    return Src;
  }
  case 0b00'00'00'00:
  case 0b01'01'01'01:
  case 0b10'10'10'10:
  case 0b11'11'11'11: {
    // Special case element duplicate: one 32-bit element broadcast to every element.
    return _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src, Shuffle & 0b11);
  }
  case 0b00'00'10'10: {
    // Weird reverse low elements and broadcast to each half of the register
    Ref Tmp = _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b00'00'11'10: {
    // First element duplicated and shifted in to the top.
    auto Dup = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src, 0);
    return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Dup, Src, 2);
  }
  case 0b00'01'00'01: {
    ///< Weird reversed low elements and broadcast
    Ref Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
    return _VZip(OpSize::i128Bit, OpSize::i64Bit, Tmp, Tmp);
  }
  case 0b00'01'01'00: {
    ///< Weird reverse low two elements in to high half
    Ref Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b00'01'10'11: {
    // Inverse elements
    Ref Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
  }
  case 0b00'10'00'10: {
    ///< Weird reversed even elements and broadcast
    Ref Tmp = _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b00'10'10'00: {
    // Weird reversed low elements in upper half of the register
    Ref Tmp = _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b00'11'00'11: {
    ///< Weird Low plus high element reversed and broadcast
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    return _VZip2(OpSize::i128Bit, OpSize::i64Bit, Tmp, Tmp);
  }
  case 0b00'11'10'01:
    ///< Vector rotate - One element
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
  case 0b00'11'11'00: {
    // Weird reversed low and high elements in upper half of the register
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VZip2(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b01'00'00'01: {
    ///< Weird duplicate bottom two elements, then rotate in the low half
    Ref Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b01'00'01'00:
    ///< Duplicate bottom 64-bits
    return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src, 0);
  case 0b01'00'11'10:
    ///< Vector rotate - Two elements
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 8);
  case 0b01'01'00'00: {
    // Zip with self.
    // Dest[0] = Src[0]
    // Dest[1] = Src[0]
    // Dest[2] = Src[1]
    // Dest[3] = Src[1]
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
  }
  case 0b01'01'10'10: {
    ///< Weird reverse middle elements and broadcast to each half of the register
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b01'01'11'11: {
    ///< Weird reverse odd elements and broadcast to each half of the register
    Ref Tmp = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b01'10'01'10: {
    ///< Weird middle elements swizzle plus broadcast
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
    return _VZip(OpSize::i128Bit, OpSize::i64Bit, Tmp, Tmp);
  }
  case 0b01'10'10'01: {
    ///< Weird middle elements swizzle plus broadcast and reverse
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b01'11'01'11: {
    ///< Weird reversed odd elements and broadcast
    Ref Tmp = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b01'11'11'01: {
    ///< Weird odd elements swizzle plus broadcast and reverse
    Ref Tmp = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b10'00'00'10: {
    ///< Weird even elements swizzle plus broadcast and reverse
    Ref Tmp = _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b10'00'10'00:
    ///< Even elements broadcast
    return _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
  case 0b10'01'00'11:
    ///< Vector rotate - Three elements
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 12);
  case 0b10'01'01'10: {
    ///< Weird odd elements swizzle plus broadcast and reverse
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b10'01'10'01: {
    ///< Middle two elements broadcast
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    return _VZip(OpSize::i128Bit, OpSize::i64Bit, Tmp, Tmp);
  }
  case 0b10'10'00'00: {
    ///< Broadcast even elements to each half of the register
    Ref Tmp = _VUnZip(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b10'10'01'01: {
    ///< Broadcast middle elements to each half of the register
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b10'10'11'11: {
    ///< Reverse top two elements and broadcast to each half of the register
    Ref Tmp = _VZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 8);
  }
  case 0b10'11'00'01: {
    // Reverse each 64-bit lane.
    return _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
  }
  case 0b10'11'10'11: {
    ///< Weird top two elements reverse and broadcast
    Ref Tmp = _VZip2(OpSize::i128Bit, OpSize::i64Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b10'11'11'10: {
    ///< Weird move top two elements to bottom and reverse in the top half
    Ref Tmp = _VZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b11'00'00'11: {
    ///< Weird low plus high elements swizzle plus broadcast and reverse
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VZip2(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b11'00'11'00: {
    ///< Weird low plus high element broadcast
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 4);
    Tmp = _VZip2(OpSize::i128Bit, OpSize::i64Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 4);
  }
  case 0b11'01'01'11: {
    ///< Weird odd elements swizzle plus broadcast and reverse
    Ref Tmp = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    Tmp = _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b11'01'11'01:
    ///< Odd elements broadcast
    return _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
  case 0b11'10'10'11: {
    ///< Rotate top two elements in to bottom half of the register
    Ref Tmp = _VZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 12);
  }
  case 0b11'10'11'10:
    ///< Duplicate Top 64-bits
    return _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src, 1);
  case 0b11'11'00'00: {
    ///< Weird Broadcast bottom and top element to each half of the register
    Ref Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Src, 12);
    Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Tmp);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b11'11'01'01: {
    ///< Broadcast odd elements to each half of the register
    Ref Tmp = _VUnZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
    return _VZip(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp);
  }
  case 0b11'11'10'10:
    ///< Broadcast top two elements to each half of the register
    return _VZip2(OpSize::i128Bit, OpSize::i32Bit, Src, Src);
  default: {
    // PSHUFD needs to scale index by 16.
    auto LookupIndexes = LoadAndCacheIndexedNamedVectorConstant(
      OpSize::i128Bit, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD, Shuffle * 16);
    return _VTBL1(OpSize::i128Bit, Src, LookupIndexes);
  }
  }
}

// PSHUFD: the shuffle immediate is Op->Src[1]; the source vector is Op->Src[0].
void OpDispatchBuilder::PSHUFDOp(OpcodeArgs) {
  uint16_t Shuffle = Op->Src[1].Data.Literal.Value;
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  StoreResultFPR(Op, Single128Bit4ByteVectorShuffle(Src, Shuffle));
}

// AVX element shuffle built from repeated element inserts. Four destination
// elements starting at BaseElement (0 when Low, otherwise 4) are each filled
// from the source element chosen by two bits of the immediate; for 256-bit
// sources the same selection is applied to the second 128-bit lane as well.
void OpDispatchBuilder::VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto Is256Bit = SrcSize == OpSize::i256Bit;
  auto Shuffle = Op->Src[1].Literal();

  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  // Note/TODO: With better immediate facilities or vector loading in our IR
  //            much of this can be reduced to setting up a table index register
  //            and then using TBL
  //
  //            SVE has the INDEX instruction that works essentially like
  //            std::iota (setting a range to an initial value and progressively
  //            incrementing each successive element), so it's well suited for this.
  //            It's just a matter of exposing these facilities in a way that works
  //            well together.
  //
  //            Should be much nicer than doing repeated inserts in any case.

  const size_t BaseElement = Low ? 0 : 4;

  Ref Result = Src;
  if (Is256Bit) {
    for (size_t i = 0; i < 4; i++) {
      const auto Index = Shuffle & 0b11;
      // Number of elements per 128-bit lane; offsets indices into the second lane.
      const auto UpperLaneOffset = IR::NumElements(OpSize::i128Bit, ElementSize);

      const auto LowDstIndex = BaseElement + i;
      const auto LowSrcIndex = BaseElement + Index;

      const auto HighDstIndex = BaseElement + UpperLaneOffset + i;
      const auto HighSrcIndex = BaseElement + UpperLaneOffset + Index;

      // Take care of both lanes per iteration
      Result = _VInsElement(SrcSize, ElementSize, LowDstIndex, LowSrcIndex, Result, Src);
      Result = _VInsElement(SrcSize, ElementSize, HighDstIndex, HighSrcIndex, Result, Src);

      Shuffle >>= 2;
    }
  } else {
    for (size_t i = 0; i < 4; i++) {
      const auto Index = Shuffle & 0b11;

      Result = _VInsElement(SrcSize, ElementSize, BaseElement + i, BaseElement + Index, Result, Src);
      Shuffle >>= 2;
    }
  }

  StoreResultFPR(Op, Result);
}

// NOTE(review): this definition continues past the end of the visible chunk; the
// visible portion is reproduced unchanged. The element type/extent of `Srcs`
// is not visible in this text.
Ref OpDispatchBuilder::SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle) {
  // Since 256-bit variants and up don't lane cross, we can construct
  // everything in terms of the 128-variant, as each lane is essentially
  // its own 128-bit segment.
  const uint8_t NumElements = IR::NumElements(OpSize::i128Bit, ElementSize);
  const uint8_t HalfNumElements = NumElements >> 1;

  const bool Is256Bit = DstSize == OpSize::i256Bit;

  std::array Srcs {};
  for (size_t i = 0; i < HalfNumElements; ++i) {
    Srcs[i] = Src1;
  }
  for (size_t i = HalfNumElements; i < NumElements; ++i) {
    Srcs[i] = Src2;
  }

  Ref Dest = Src1;
  const uint8_t SelectionMask = NumElements - 1;
  const uint8_t ShiftAmount = std::popcount(SelectionMask);

  if (Is256Bit) {
    for (uint8_t Element = 0; Element < NumElements; ++Element) {
      const auto SrcIndex1 = Shuffle & SelectionMask;

      // AVX differs the behavior of VSHUFPD and VSHUFPS.
      // The same immediate bits are used for both lanes with VSHUFPS,
      // but VSHUFPD uses different immediate bits for each lane.
      const auto SrcIndex2 = ElementSize == OpSize::i32Bit ? SrcIndex1 : ((Shuffle >> 2) & SelectionMask);

      Ref Insert = _VInsElement(DstSize, ElementSize, Element, SrcIndex1, Dest, Srcs[Element]);
      Dest = _VInsElement(DstSize, ElementSize, Element + NumElements, SrcIndex2 + NumElements, Insert, Srcs[Element]);

      Shuffle >>= ShiftAmount;
    }
  } else {
    if (ElementSize == OpSize::i32Bit) {
      // We can shuffle optimally in a lot of cases.
      // TODO: We can optimize more of these cases.
      switch (Shuffle) {
      case 0b01'00'01'00:
        // Combining of low 64-bits.
        // Dest[63:0]   = Src1[63:0]
        // Dest[127:64] = Src2[63:0]
        return _VZip(DstSize, OpSize::i64Bit, Src1, Src2);
      case 0b11'10'11'10:
        // Combining of high 64-bits.
        // Dest[63:0]   = Src1[127:64]
        // Dest[127:64] = Src2[127:64]
        return _VZip2(DstSize, OpSize::i64Bit, Src1, Src2);
      case 0b11'10'01'00:
        // Mixing Low and high elements
        // Dest[63:0]   = Src1[63:0]
        // Dest[127:64] = Src2[127:64]
        return _VInsElement(DstSize, OpSize::i64Bit, 1, 1, Src1, Src2);
      case 0b01'00'11'10:
        // Mixing Low and high elements, inverse of above
        // Dest[63:0]   = Src1[127:64]
        // Dest[127:64] = Src2[63:0]
        return _VExtr(DstSize, OpSize::i8Bit, Src2, Src1, 8);
      case 0b10'00'10'00:
        // Mixing even elements.
        // Dest[31:0]   = Src1[31:0]
        // Dest[63:32]  = Src1[95:64]
        // Dest[95:64]  = Src2[31:0]
        // Dest[127:96] = Src2[95:64]
        return _VUnZip(DstSize, ElementSize, Src1, Src2);
      case 0b11'01'11'01:
        // Mixing odd elements.
        // Dest[31:0]   = Src1[63:32]
        // Dest[63:32]  = Src1[127:96]
        // Dest[95:64]  = Src2[63:32]
        // Dest[127:96] = Src2[127:96]
        return _VUnZip2(DstSize, ElementSize, Src1, Src2);
      case 0b11'10'00'00:
      case 0b11'10'01'01:
      case 0b11'10'10'10:
      case 0b11'10'11'11: {
        // Bottom elements duplicated, Top 64-bits inserted
        auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
        return _VZip2(DstSize, OpSize::i64Bit, DupSrc1, Src2);
      }
      case 0b01'00'00'00:
      case 0b01'00'01'01:
      case 0b01'00'10'10:
      case 0b01'00'11'11: {
        // Bottom elements duplicated, Bottom 64-bits inserted
        auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
        return _VZip(DstSize, OpSize::i64Bit, DupSrc1, Src2);
      }
      case 0b00'00'01'00:
      case 0b01'01'01'00:
      case 0b10'10'01'00:
      case 0b11'11'01'00: {
        // Top elements duplicated, Bottom 64-bits inserted
        auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
        return _VZip(DstSize, OpSize::i64Bit, Src1, DupSrc2);
      }
      case 0b00'00'11'10:
      case 0b01'01'11'10:
      case 0b10'10'11'10:
      case 0b11'11'11'10: {
        // Top elements duplicated, Top 64-bits inserted
        auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
        return _VZip2(DstSize, OpSize::i64Bit, Src1, DupSrc2);
      }
      case 0b01'00'01'11: {
        // TODO: This doesn't generate optimal code.
        // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
        // With RA fixes this would be 2 instructions.
        // Odd elements inverted, Low 64-bits inserted
        Src1 = _VInsElement(DstSize, OpSize::i32Bit, 0, 3, Src1, Src1);
        return _VZip(DstSize, OpSize::i64Bit, Src1, Src2);
      }
      case 0b11'10'01'11: {
        // TODO: This doesn't generate optimal code.
        // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
        // With RA fixes this would be 2 instructions.
        // Odd elements inverted, Top 64-bits inserted
        Src1 = _VInsElement(DstSize, OpSize::i32Bit, 0, 3, Src1, Src1);
        return _VInsElement(DstSize, OpSize::i64Bit, 1, 1, Src1, Src2);
      }
      case 0b01'00'00'01: {
        // Lower 32-bit elements inverted, low 64-bits inserted
        Src1 = _VRev64(DstSize, OpSize::i32Bit, Src1);
        return _VZip(DstSize, OpSize::i64Bit, Src1, Src2);
      }
      case 0b11'10'00'01: {
        // TODO: This doesn't generate optimal code.
        // RA doesn't understand that Src1 is dead after VInsElement due to SRA class differences.
        // With RA fixes this would be 2 instructions.
        // Lower 32-bit elements inverted, Top 64-bits inserted
        Src1 = _VRev64(DstSize, OpSize::i32Bit, Src1);
        return _VInsElement(DstSize, OpSize::i64Bit, 1, 1, Src1, Src2);
      }
      case 0b00'00'00'00:
      case 0b00'00'01'01:
      case 0b00'00'10'10:
      case 0b00'00'11'11:
      case 0b01'01'00'00:
      case 0b01'01'01'01:
      case 0b01'01'10'10:
      case 0b01'01'11'11:
      case 0b10'10'00'00:
      case 0b10'10'01'01:
      case 0b10'10'10'10:
      case 0b10'10'11'11:
      case 0b11'11'00'00:
      case 0b11'11'01'01:
      case 0b11'11'10'10:
      case 0b11'11'11'11: {
        // Duplicate element in upper and lower across each 64-bit segment.
        auto DupSrc1 = _VDupElement(DstSize, ElementSize, Src1, Shuffle & 0b11);
        auto DupSrc2 = _VDupElement(DstSize, ElementSize, Src2, (Shuffle >> 4) & 0b11);
        return _VZip(DstSize, OpSize::i64Bit, DupSrc1, DupSrc2);
      }
      default:
        // Use a TBL2 operation to handle this implementation.
        auto LookupIndexes =
          LoadAndCacheIndexedNamedVectorConstant(DstSize, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS, Shuffle * 16);
        return _VTBL2(DstSize, Src1, Src2, LookupIndexes);
      }
    } else {
      switch (Shuffle & 0b11) {
      case 0b00:
        // Low 64-bits of each source interleaved.
        return _VZip(DstSize, ElementSize, Src1, Src2);
      case 0b01:
        // Upper 64-bits of Src1 in lower bits
        // Lower 64-bits of Src2 in upper bits.
        return _VExtr(DstSize, OpSize::i8Bit, Src2, Src1, 8);
      case 0b10:
        // Lower 32-bits of Src1 in lower bits.
        // Upper 64-bits of Src2 in upper bits.
return _VInsElement(DstSize, ElementSize, 1, 1, Src1, Src2); case 0b11: // Upper 64-bits of each source interleaved. return _VZip2(DstSize, ElementSize, Src1, Src2); } } for (uint8_t Element = 0; Element < NumElements; ++Element) { const auto SrcIndex = Shuffle & SelectionMask; Dest = _VInsElement(DstSize, ElementSize, Element, SrcIndex, Dest, Srcs[Element]); Shuffle >>= ShiftAmount; } } return Dest; } void OpDispatchBuilder::SHUFOp(OpcodeArgs, IR::OpSize ElementSize) { Ref Src1Node = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2Node = LoadSourceFPR(Op, Op->Src[0], Op->Flags); uint8_t Shuffle = Op->Src[1].Literal(); Ref Result = SHUFOpImpl(Op, OpSizeFromDst(Op), ElementSize, Src1Node, Src2Node, Shuffle); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VSHUFOp(OpcodeArgs, IR::OpSize ElementSize) { Ref Src1Node = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2Node = LoadSourceFPR(Op, Op->Src[1], Op->Flags); uint8_t Shuffle = Op->Src[2].Literal(); Ref Result = SHUFOpImpl(Op, OpSizeFromDst(Op), ElementSize, Src1Node, Src2Node, Shuffle); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VANDNOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Dest = _VAndn(SrcSize, SrcSize, Src2, Src1); StoreResultFPR(Op, Dest); } template void OpDispatchBuilder::VHADDPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto Is256Bit = SrcSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); DeriveOp(Res, IROp, _VFAddP(SrcSize, ElementSize, Src1, Src2)); Ref Dest = Res; if (Is256Bit) { Dest = _VInsElement(SrcSize, OpSize::i64Bit, 1, 2, Res, Res); Dest = _VInsElement(SrcSize, OpSize::i64Bit, 2, 1, Dest, Res); } StoreResultFPR(Op, Dest); } template void OpDispatchBuilder::VHADDPOp(OpcodeArgs); template void OpDispatchBuilder::VHADDPOp(OpcodeArgs); template void 
OpDispatchBuilder::VHADDPOp(OpcodeArgs); template void OpDispatchBuilder::VHADDPOp(OpcodeArgs); void OpDispatchBuilder::VBROADCASTOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); Ref Result {}; if (Op->Src[0].IsGPR()) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Result = _VDupElement(DstSize, ElementSize, Src, 0); } else { // Get the address to broadcast from into a GPR. Ref Address = MakeSegmentAddress(Op, Op->Src[0], GetGPROpSize()); Result = _VBroadcastFromMem(DstSize, ElementSize, Address); } // No need to zero-extend result, since implementations // use zero extending AdvSIMD or zeroing SVE loads internally. StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PINSROpImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm) { const auto Size = OpSizeFromDst(Op); const auto NumElements = IR::NumElements(Size, ElementSize); const uint64_t Index = Imm.Literal() & (NumElements - 1); Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, Size, Op->Flags); if (Src2Op.IsGPR()) { // If the source is a GPR then convert directly from the GPR. 
auto Src2 = LoadSourceGPR_WithOpSize(Op, Src2Op, GetGPROpSize(), Op->Flags); return _VInsGPR(Size, ElementSize, Index, Src1, Src2); } // If loading from memory then we only load the element size Ref Src2 = MakeSegmentAddress(Op, Src2Op); return _VLoadVectorElement(Size, ElementSize, Src1, Index, Src2); } template void OpDispatchBuilder::PINSROp(OpcodeArgs) { Ref Result = PINSROpImpl(Op, ElementSize, Op->Dest, Op->Src[0], Op->Src[1]); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PINSROp(OpcodeArgs); template void OpDispatchBuilder::PINSROp(OpcodeArgs); template void OpDispatchBuilder::PINSROp(OpcodeArgs); template void OpDispatchBuilder::PINSROp(OpcodeArgs); void OpDispatchBuilder::VPINSRBOp(OpcodeArgs) { Ref Result = PINSROpImpl(Op, OpSize::i8Bit, Op->Src[0], Op->Src[1], Op->Src[2]); if (Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPINSRDQOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); Ref Result = PINSROpImpl(Op, SrcSize, Op->Src[0], Op->Src[1], Op->Src[2]); if (Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPINSRWOp(OpcodeArgs) { Ref Result = PINSROpImpl(Op, OpSize::i16Bit, Op->Src[0], Op->Src[1], Op->Src[2]); if (Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm) { const uint8_t ImmValue = Imm.Literal(); uint8_t CountS = (ImmValue >> 6); uint8_t CountD = (ImmValue >> 4) & 0b11; const uint8_t ZMask = ImmValue & 0xF; const auto DstSize = OpSizeFromDst(Op); Ref Dest {}; if (ZMask != 0xF) { // Only need to load destination if it isn't a full zero Dest = LoadSourceFPR_WithOpSize(Op, Src1, DstSize, 
Op->Flags); } if ((ZMask & (1 << CountD)) == 0) { // In the case that ZMask overwrites the destination element, then don't even insert Ref Src {}; if (Src2.IsGPR()) { Src = LoadSourceFPR(Op, Src2, Op->Flags); } else { // If loading from memory then CountS is forced to zero CountS = 0; Src = LoadSourceFPR_WithOpSize(Op, Src2, OpSize::i32Bit, Op->Flags); } Dest = _VInsElement(DstSize, OpSize::i32Bit, CountD, CountS, Dest, Src); } // ZMask happens after insert if (ZMask == 0xF) { return LoadZeroVector(DstSize); } if (ZMask) { auto Zero = LoadZeroVector(DstSize); for (size_t i = 0; i < 4; ++i) { if ((ZMask & (1 << i)) != 0) { Dest = _VInsElement(DstSize, OpSize::i32Bit, i, 0, Dest, Zero); } } } return Dest; } void OpDispatchBuilder::InsertPSOp(OpcodeArgs) { Ref Result = InsertPSOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1]); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VINSERTPSOp(OpcodeArgs) { Ref Result = InsertPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]); StoreResultFPR(Op, Result); } void OpDispatchBuilder::PExtrOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); uint64_t Index = Op->Src[1].Literal(); // Fixup of 32-bit element size. // When the element size is 32-bit then it can be overriden as 64-bit because the encoding of PEXTRD/PEXTRQ // is the same except that REX.W or VEX.W is set to 1. Incredibly frustrating. // Use the destination size as the element size in this case. auto OverridenElementSize = ElementSize; if (ElementSize == OpSize::i32Bit) { OverridenElementSize = DstSize; } // AVX version only operates on 128-bit. const uint8_t NumElements = IR::NumElements(std::min(OpSizeFromSrc(Op), OpSize::i128Bit), OverridenElementSize); Index &= NumElements - 1; if (Op->Dest.IsGPR()) { const auto GPRSize = GetGPROpSize(); // Extract already zero extends the result. 
Ref Result = _VExtractToGPR(OpSize::i128Bit, OverridenElementSize, Src, Index); StoreResultGPR_WithOpSize(Op, Op->Dest, Result, GPRSize); return; } // If we are storing to memory then we store the size of the element extracted Ref Dest = MakeSegmentAddress(Op, Op->Dest); _VStoreVectorElement(OpSize::i128Bit, OverridenElementSize, Src, Index, Dest); } void OpDispatchBuilder::VEXTRACT128Op(OpcodeArgs) { const auto DstIsXMM = Op->Dest.IsGPR(); const auto StoreSize = DstIsXMM ? OpSize::i256Bit : OpSize::i128Bit; const auto Selector = Op->Src[1].Literal() & 0b1; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); // A selector of zero is the same as doing a 128-bit vector move. if (Selector == 0) { Ref Result = DstIsXMM ? _VMov(OpSize::i128Bit, Src) : Src; StoreResultFPR_WithOpSize(Op, Op->Dest, Result, StoreSize); return; } // Otherwise replicate the element and only store the first 128-bits. Ref Result = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Src, Selector); if (DstIsXMM) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR_WithOpSize(Op, Op->Dest, Result, StoreSize); } Ref OpDispatchBuilder::PSIGNImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src1, Ref Src2) { const auto Size = OpSizeFromSrc(Op); Ref Control = _VSQSHL(Size, ElementSize, Src2, IR::OpSizeAsBits(ElementSize) - 1); Control = _VSRSHR(Size, ElementSize, Control, IR::OpSizeAsBits(ElementSize) - 1); return _VMul(Size, ElementSize, Src1, Control); } template void OpDispatchBuilder::PSIGN(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Res = PSIGNImpl(Op, ElementSize, Dest, Src); StoreResultFPR(Op, Res); } template void OpDispatchBuilder::PSIGN(OpcodeArgs); template void OpDispatchBuilder::PSIGN(OpcodeArgs); template void OpDispatchBuilder::PSIGN(OpcodeArgs); template void OpDispatchBuilder::VPSIGN(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Res = 
PSIGNImpl(Op, ElementSize, Src1, Src2); StoreResultFPR(Op, Res); } template void OpDispatchBuilder::VPSIGN(OpcodeArgs); template void OpDispatchBuilder::VPSIGN(OpcodeArgs); template void OpDispatchBuilder::VPSIGN(OpcodeArgs); Ref OpDispatchBuilder::PSRLDOpImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec) { const auto Size = OpSizeFromSrc(Op); // Incoming element size for the shift source is always 8 return _VUShrSWide(Size, ElementSize, Src, ShiftVec); } void OpDispatchBuilder::PSRLDOp(OpcodeArgs, IR::OpSize ElementSize) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PSRLDOpImpl(Op, ElementSize, Dest, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSRLDOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Shift = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PSRLDOpImpl(Op, ElementSize, Src, Shift); if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::PSRLI(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t ShiftConstant = Op->Src[1].Literal(); if (ShiftConstant == 0) [[unlikely]] { // Nothing to do, value is already in Dest. 
return; } const auto Size = OpSizeFromSrc(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Shift = _VUShrI(Size, ElementSize, Dest, ShiftConstant); StoreResultFPR(Op, Shift); } void OpDispatchBuilder::VPSRLIOp(OpcodeArgs, IR::OpSize ElementSize) { const auto Size = OpSizeFromSrc(Op); const auto Is128Bit = Size == OpSize::i128Bit; const uint64_t ShiftConstant = Op->Src[1].Literal(); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = Src; if (ShiftConstant != 0) [[likely]] { Result = _VUShrI(Size, ElementSize, Src, ShiftConstant); } else { if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PSLLIImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, uint64_t Shift) { if (Shift == 0) [[unlikely]] { // If zero-shift then just return the source. return Src; } const auto Size = OpSizeFromSrc(Op); return _VShlI(Size, ElementSize, Src, Shift); } void OpDispatchBuilder::PSLLI(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t ShiftConstant = Op->Src[1].Literal(); if (ShiftConstant == 0) [[unlikely]] { // Nothing to do, value is already in Dest. 
return; } Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Result = PSLLIImpl(Op, ElementSize, Dest, ShiftConstant); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSLLIOp(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t ShiftConstant = Op->Src[1].Literal(); const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PSLLIImpl(Op, ElementSize, Src, ShiftConstant); if (ShiftConstant == 0 && Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PSLLImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec) { const auto Size = OpSizeFromDst(Op); // Incoming element size for the shift source is always 8 return _VUShlSWide(Size, ElementSize, Src, ShiftVec); } void OpDispatchBuilder::PSLL(OpcodeArgs, IR::OpSize ElementSize) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PSLLImpl(Op, ElementSize, Dest, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSLLOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], OpSize::i128Bit, Op->Flags); Ref Result = PSLLImpl(Op, ElementSize, Src1, Src2); if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PSRAOpImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec) { const auto Size = OpSizeFromDst(Op); // Incoming element size for the shift source is always 8 return _VSShrSWide(Size, ElementSize, Src, ShiftVec); } void OpDispatchBuilder::PSRAOp(OpcodeArgs, IR::OpSize ElementSize) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PSRAOpImpl(Op, 
ElementSize, Dest, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSRAOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PSRAOpImpl(Op, ElementSize, Src1, Src2); if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::PSRLDQ(OpcodeArgs) { const uint64_t Shift = Op->Src[1].Literal(); if (Shift == 0) [[unlikely]] { // Nothing to do, value is already in Dest. return; } const auto Size = OpSizeFromDst(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Result = LoadZeroVector(Size); if (Shift < IR::OpSizeToSize(Size)) { Result = _VExtr(Size, OpSize::i8Bit, Result, Dest, Shift); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSRLDQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto Is128Bit = DstSize == OpSize::i128Bit; const uint64_t Shift = Op->Src[1].Literal(); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result {}; if (Shift == 0) [[unlikely]] { if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Src); } else { Result = Src; } } else { Result = LoadZeroVector(DstSize); if (Is128Bit) { if (Shift < IR::OpSizeToSize(DstSize)) { Result = _VExtr(DstSize, OpSize::i8Bit, Result, Src, Shift); } } else { if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) { Ref ResultBottom = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Result, Src, Shift); Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Result, Src, 16 + Shift); Result = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ResultBottom, ResultTop); } } } StoreResultFPR(Op, Result); } void OpDispatchBuilder::PSLLDQ(OpcodeArgs) { const uint64_t Shift = Op->Src[1].Literal(); if (Shift == 0) [[unlikely]] { // Nothing to do, value is already in Dest. 
return; } const auto Size = OpSizeFromDst(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Result = LoadZeroVector(Size); if (Shift < IR::OpSizeToSize(Size)) { Result = _VExtr(Size, OpSize::i8Bit, Dest, Result, IR::OpSizeToSize(Size) - Shift); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSLLDQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto DstSizeInt = IR::OpSizeToSize(DstSize); const auto Is128Bit = DstSize == OpSize::i128Bit; const uint64_t Shift = Op->Src[1].Literal(); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = Src; if (Shift == 0) { if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } } else { Result = LoadZeroVector(DstSize); if (Is128Bit) { if (Shift < DstSizeInt) { Result = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift); } } else { if (Shift < Core::CPUState::XMM_SSE_REG_SIZE) { Ref ResultBottom = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, Result, 16 - Shift); Ref ResultTop = _VExtr(DstSize, OpSize::i8Bit, Src, Result, DstSizeInt - Shift); Result = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ResultBottom, ResultTop); } } } StoreResultFPR(Op, Result); } void OpDispatchBuilder::PSRAIOp(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t Shift = Op->Src[1].Literal(); if (Shift == 0) [[unlikely]] { // Nothing to do, value is already in Dest. 
return; } const auto Size = OpSizeFromDst(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Result = _VSShrI(Size, ElementSize, Dest, Shift); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSRAIOp(OpcodeArgs, IR::OpSize ElementSize) { const uint64_t Shift = Op->Src[1].Literal(); const auto Size = OpSizeFromDst(Op); const auto Is128Bit = Size == OpSize::i128Bit; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = Src; if (Shift != 0) [[likely]] { Result = _VSShrI(Size, ElementSize, Src, Shift); } else { if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } } StoreResultFPR(Op, Result); } void OpDispatchBuilder::AVXVariableShiftImpl(OpcodeArgs, IROps IROp) { const auto DstSize = OpSizeFromDst(Op); const auto SrcSize = OpSizeFromSrc(Op); Ref Vector = LoadSourceFPR_WithOpSize(Op, Op->Src[0], DstSize, Op->Flags); Ref ShiftVector = LoadSourceFPR_WithOpSize(Op, Op->Src[1], DstSize, Op->Flags); DeriveOp(Shift, IROp, _VUShr(DstSize, SrcSize, Vector, ShiftVector, true)); StoreResultFPR(Op, Shift); } void OpDispatchBuilder::VPSLLVOp(OpcodeArgs) { AVXVariableShiftImpl(Op, IROps::OP_VUSHL); } void OpDispatchBuilder::VPSRAVDOp(OpcodeArgs) { AVXVariableShiftImpl(Op, IROps::OP_VSSHR); } void OpDispatchBuilder::VPSRLVOp(OpcodeArgs) { AVXVariableShiftImpl(Op, IROps::OP_VUSHR); } void OpDispatchBuilder::MOVDDUPOp(OpcodeArgs) { // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op); Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); Ref Res = _VDupElement(OpSize::i128Bit, OpSizeFromSrc(Op), Src, 0); StoreResultFPR(Op, Res); } void OpDispatchBuilder::VMOVDDUPOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto IsSrcGPR = Op->Src[0].IsGPR(); const auto Is256Bit = SrcSize == OpSize::i256Bit; const auto MemSize = Is256Bit ? 
OpSize::i256Bit : OpSize::i64Bit; const auto LoadSize = IsSrcGPR ? SrcSize : MemSize; Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags); Ref Res {}; if (Is256Bit) { Res = _VTrn(SrcSize, OpSize::i64Bit, Src, Src); } else { Res = _VDupElement(SrcSize, OpSize::i64Bit, Src, 0); } StoreResultFPR(Op, Res); } Ref OpDispatchBuilder::CVTGPR_To_FPRImpl(OpcodeArgs, IR::OpSize DstElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) { const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, OpSize::i128Bit, Op->Flags); Ref Converted {}; if (Src2Op.IsGPR()) { // If the source is a GPR then convert directly from the GPR. auto Src2 = LoadSourceGPR_WithOpSize(Op, Src2Op, GetGPROpSize(), Op->Flags); Converted = _Float_FromGPR_S(DstElementSize, SrcSize, Src2); } else if (SrcSize != DstElementSize) { // If the source is from memory but the Source size and destination size aren't the same, // then it is more optimal to load in to a GPR and convert between GPR->FPR. // ARM GPR->FPR conversion supports different size source and destinations while FPR->FPR doesn't. auto Src2 = LoadSourceGPR(Op, Src2Op, Op->Flags); Converted = _Float_FromGPR_S(DstElementSize, SrcSize, Src2); } else { // In the case of cvtsi2s{s,d} where the source and destination are the same size, // then it is more optimal to load in to the FPR register directly and convert there. 
auto Src2 = LoadSourceFPR(Op, Src2Op, Op->Flags); Converted = _Vector_SToF(SrcSize, SrcSize, Src2); } return _VInsElement(OpSize::i128Bit, DstElementSize, 0, 0, Src1, Converted); } template void OpDispatchBuilder::CVTGPR_To_FPR(OpcodeArgs) { Ref Result = CVTGPR_To_FPRImpl(Op, DstElementSize, Op->Dest, Op->Src[0]); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::CVTGPR_To_FPR(OpcodeArgs); template void OpDispatchBuilder::CVTGPR_To_FPR(OpcodeArgs); template void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs) { Ref Result = CVTGPR_To_FPRImpl(Op, DstElementSize, Op->Src[0], Op->Src[1]); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs); template void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs); Ref OpDispatchBuilder::CVTFPR_To_GPRImpl(OpcodeArgs, Ref Src, IR::OpSize SrcElementSize, bool HostRoundingMode) { // GPR size is determined by REX.W // Source Element size is determined by instruction const auto GPRSize = OpSizeFromDst(Op); if (CTX->HostFeatures.SupportsFRINTTS) { // When we have FRINTTS, this is a two-step process. First, we round to the // right integer (where _Vector_FToISized matches x86 semantics), then just // convert that to a GPR. Src = _Vector_FToISized(SrcElementSize, SrcElementSize, Src, HostRoundingMode, GPRSize); return _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src); } else { // When we lack hardware support, we need a bit of a convoluted sequence of // fixups before before and after conversion to emulate x86 semantics. if (HostRoundingMode) { Src = _Vector_FToI(SrcElementSize, SrcElementSize, Src, RoundMode::Host); } Ref Converted = _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src); bool Dst32 = GPRSize == OpSize::i32Bit; Ref MaxI = Dst32 ? Constant(0x80000000) : Constant(0x8000000000000000); Ref MaxF = LoadAndCacheNamedVectorConstant(SrcElementSize, (SrcElementSize == OpSize::i32Bit) ? (Dst32 ? NAMED_VECTOR_CVTMAX_F32_I32 : NAMED_VECTOR_CVTMAX_F32_I64) : (Dst32 ? 
NAMED_VECTOR_CVTMAX_F64_I32 : NAMED_VECTOR_CVTMAX_F64_I64)); return _Select(GPRSize, SrcElementSize, CondClass::FGT, MaxF, Src, Converted, MaxI); } } template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs) { // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : SrcElementSize; Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); Ref Result = CVTFPR_To_GPRImpl(Op, Src, SrcElementSize, HostRoundingMode); StoreResultGPR(Op, Result); } template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs); template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs); template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs); template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs); Ref OpDispatchBuilder::Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcElementSize, bool Widen) { const auto Size = OpSizeFromDst(Op); Ref Src = [&] { if (Widen) { // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto LoadSize = Op->Src[0].IsGPR() ? 
OpSize::i128Bit : IR::SizeToOpSize(8 * (IR::OpSizeToSize(Size) / 16)); return LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags); } else { return LoadSourceFPR(Op, Op->Src[0], Op->Flags); } }(); auto ElementSize = SrcElementSize; if (Widen) { Src = _VSXTL(Size, ElementSize, Src); ElementSize = ElementSize << 1; } return _Vector_SToF(Size, ElementSize, Src); } template void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs) { Ref Result = Vector_CVT_Int_To_FloatImpl(Op, SrcElementSize, Widen); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs); template void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs); Ref OpDispatchBuilder::Vector_CVT_Float_To_Int32Impl(OpcodeArgs, IR::OpSize DstSize, Ref Src, IR::OpSize SrcSize, IR::OpSize SrcElementSize, bool HostRoundingMode, bool ZeroUpperHalf) { if (CTX->HostFeatures.SupportsFRINTTS && SrcSize != OpSize::i256Bit) { // If we have FRINTS, this is the usual 2-step Src = _Vector_FToISized(SrcSize, SrcElementSize, Src, HostRoundingMode, OpSize::i32Bit); Ref Dst = _Vector_FToZS(SrcSize, SrcElementSize, Src); if (SrcElementSize == OpSize::i32Bit) { // Return 32-bit result as-is return Dst; } else { // Down step from 64-bit ints to 32-bit ints return _VUShrNI(DstSize, SrcElementSize, Dst, 0); } } else { // Otherwise, we have to do all the fixups, but vectorized. if (HostRoundingMode) { Src = _Vector_FToI(SrcSize, SrcElementSize, Src, RoundMode::Host); } OpSize OverflowConstSize = ZeroUpperHalf && SrcElementSize == OpSize::i64Bit ? 
DstSize / 2 : DstSize; Ref MaxI = LoadAndCacheNamedVectorConstant(OverflowConstSize, NAMED_VECTOR_CVTMAX_I32); Ref Converted {}, Cmp {}; if (SrcElementSize == OpSize::i64Bit) { Ref MaxF = LoadAndCacheNamedVectorConstant(SrcSize, NAMED_VECTOR_CVTMAX_F64_I32); Converted = _Vector_F64ToI32(DstSize, Src, RoundMode::TowardsZero, ZeroUpperHalf); Cmp = _VFCMPGT(SrcSize, OpSize::i64Bit, MaxF, Src); Cmp = _VUShrNI(DstSize, OpSize::i64Bit, Cmp, 32); } else { Ref MaxF = LoadAndCacheNamedVectorConstant(DstSize, NAMED_VECTOR_CVTMAX_F32_I32); Converted = _Vector_FToZS(DstSize, OpSize::i32Bit, Src); Cmp = _VFCMPGT(DstSize, OpSize::i32Bit, MaxF, Src); } return _VBSL(DstSize, Cmp, Converted, MaxI); } } template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = Vector_CVT_Float_To_Int32Impl(Op, DstSize, Src, OpSizeFromSrc(Op), SrcElementSize, HostRoundingMode, true); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize); } template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); Ref OpDispatchBuilder::Scalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) { // In the case of vectors, we can just specify the full vector length, // so that we don't unnecessarily zero-extend the entire vector. // Otherwise, if it's a memory load, then we only want to load its exact size. const auto Src2Size = Src2Op.IsGPR() ? 
OpSize::i128Bit : SrcElementSize; Ref Src1 = LoadSourceFPR_WithOpSize(Op, Src1Op, OpSize::i128Bit, Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Src2Op, Src2Size, Op->Flags); Ref Converted = _Float_FToF(DstElementSize, SrcElementSize, Src2); return _VInsElement(OpSize::i128Bit, DstElementSize, 0, 0, Src1, Converted); } template void OpDispatchBuilder::Scalar_CVT_Float_To_Float(OpcodeArgs) { Ref Result = Scalar_CVT_Float_To_FloatImpl(Op, DstElementSize, SrcElementSize, Op->Dest, Op->Src[0]); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::Scalar_CVT_Float_To_Float(OpcodeArgs); template void OpDispatchBuilder::Scalar_CVT_Float_To_Float(OpcodeArgs); template void OpDispatchBuilder::AVXScalar_CVT_Float_To_Float(OpcodeArgs) { Ref Result = Scalar_CVT_Float_To_FloatImpl(Op, DstElementSize, SrcElementSize, Op->Src[0], Op->Src[1]); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::AVXScalar_CVT_Float_To_Float(OpcodeArgs); template void OpDispatchBuilder::AVXScalar_CVT_Float_To_Float(OpcodeArgs); void OpDispatchBuilder::Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, bool IsAVX) { const auto SrcSize = OpSizeFromSrc(Op); const auto IsFloatSrc = SrcElementSize == OpSize::i32Bit; const auto Is128Bit = SrcSize == OpSize::i128Bit; const auto LoadSize = IsFloatSrc && !Op->Src[0].IsGPR() ? 
(SrcSize >> 1) : SrcSize; Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags); Ref Result {}; if (DstElementSize > SrcElementSize) { Result = _Vector_FToF(SrcSize, SrcElementSize << 1, Src, SrcElementSize); } else { Result = _Vector_FToF(SrcSize, SrcElementSize >> 1, Src, SrcElementSize); } if (IsAVX) { if (!IsFloatSrc && !Is128Bit) { // VCVTPD2PS path Result = _VMov(OpSize::i128Bit, Result); } else if (IsFloatSrc && Is128Bit) { // VCVTPS2PD path Result = _VMov(OpSize::i128Bit, Result); } } StoreResultFPR(Op, Result); } void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); // Always 32-bit. auto ElementSize = OpSize::i32Bit; const auto DstSize = OpSizeFromDst(Op); Src = _VSXTL(DstSize, ElementSize, Src); ElementSize = ElementSize << 1; // Always signed Src = _Vector_SToF(DstSize, ElementSize, Src); StoreResultFPR(Op, Src); } template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) { // This function causes a change in MMX state from X87 to MMX if (MMXState == MMXState_X87) { ChgStateX87_MMX(); } // If loading a vector, use the full size, so we don't // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); Ref Result = Vector_CVT_Float_To_Int32Impl(Op, DstSize, Src, SrcSize, SrcElementSize, HostRoundingMode, false /* TODO? 
*/); StoreResultFPR_WithOpSize(Op, Op->Dest, Result, DstSize); } template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); void OpDispatchBuilder::MASKMOVOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref MaskSrc = LoadSourceGPR(Op, Op->Src[0], Op->Flags); // Mask only cares about the top bit of each byte MaskSrc = _VCMPLTZ(Size, OpSize::i8Bit, MaskSrc); // Vector that will overwrite byte elements. Ref VectorSrc = LoadSourceGPR(Op, Op->Dest, Op->Flags); // RDI source (DS prefix by default) auto MemDest = MakeSegmentAddress(X86State::REG_RDI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX); Ref XMMReg = _LoadMemFPR(Size, MemDest, OpSize::i8Bit); // If the Mask element high bit is set then overwrite the element with the source, else keep the memory variant XMMReg = _VBSL(Size, MaskSrc, VectorSrc, XMMReg); _StoreMemFPR(Size, MemDest, XMMReg, OpSize::i8Bit); } void OpDispatchBuilder::VMASKMOVOpImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DataSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp) { const auto MakeAddress = [this, Op](const X86Tables::DecodedOperand& Data) { return MakeSegmentAddress(Op, Data, GetGPROpSize()); }; Ref Mask = LoadSourceFPR_WithOpSize(Op, MaskOp, DataSize, Op->Flags); if (IsStore) { Ref Data = LoadSourceFPR_WithOpSize(Op, DataOp, DataSize, Op->Flags); Ref Address = MakeAddress(Op->Dest); _VStoreVectorMasked(DataSize, ElementSize, Mask, Data, Address, Invalid(), MemOffsetType::SXTX, 1); } else { const auto Is128Bit = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE; Ref Address = MakeAddress(DataOp); Ref Result = _VLoadVectorMasked(DataSize, ElementSize, Mask, Address, Invalid(), MemOffsetType::SXTX, 1); if 
(Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } } template void OpDispatchBuilder::VMASKMOVOp(OpcodeArgs) { VMASKMOVOpImpl(Op, ElementSize, OpSizeFromDst(Op), IsStore, Op->Src[0], Op->Src[1]); } template void OpDispatchBuilder::VMASKMOVOp(OpcodeArgs); template void OpDispatchBuilder::VMASKMOVOp(OpcodeArgs); template void OpDispatchBuilder::VMASKMOVOp(OpcodeArgs); template void OpDispatchBuilder::VMASKMOVOp(OpcodeArgs); template void OpDispatchBuilder::VPMASKMOVOp(OpcodeArgs) { VMASKMOVOpImpl(Op, OpSizeFromSrc(Op), OpSizeFromDst(Op), IsStore, Op->Src[0], Op->Src[1]); } template void OpDispatchBuilder::VPMASKMOVOp(OpcodeArgs); template void OpDispatchBuilder::VPMASKMOVOp(OpcodeArgs); void OpDispatchBuilder::MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType) { if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= FEXCore::X86State::REG_XMM_0) { Ref Result {}; if (Op->Src[0].IsGPR()) { // Loading from GPR and moving to Vector. Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], GetGPROpSize(), Op->Flags); // zext to 128bit Result = _VCastFromGPR(OpSize::i128Bit, OpSizeFromSrc(Op), Src); } else { // Loading from Memory as a scalar. Zero extend Result = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } StoreResult_WithAVXInsert(VectorType, RegClass::FPR, Op, Result); } else { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); if (Op->Dest.IsGPR()) { const auto ElementSize = OpSizeFromDst(Op); // Extract element from GPR. Zero extending in the process. Src = _VExtractToGPR(OpSizeFromSrc(Op), ElementSize, Src, 0); StoreResultGPR(Op, Op->Dest, Src); } else { // Storing first element to memory. 
Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false}); _StoreMemFPR(OpSizeFromDst(Op), Dest, Src, OpSize::i8Bit); } } } Ref OpDispatchBuilder::VFCMPOpImpl(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t CompType) { switch (static_cast(CompType)) { case VectorCompareType::EQ_OQ: case VectorCompareType::EQ_OS: return _VFCMPEQ(Size, ElementSize, Src1, Src2); case VectorCompareType::LT_OS: // GT(Swapped operand) case VectorCompareType::LT_OQ: return _VFCMPLT(Size, ElementSize, Src1, Src2); case VectorCompareType::LE_OS: // GE(Swapped operand) case VectorCompareType::LE_OQ: return _VFCMPLE(Size, ElementSize, Src1, Src2); case VectorCompareType::UNORD_Q: case VectorCompareType::UNORD_S: return _VFCMPUNO(Size, ElementSize, Src1, Src2); case VectorCompareType::NEQ_UQ: case VectorCompareType::NEQ_US: return _VFCMPNEQ(Size, ElementSize, Src1, Src2); case VectorCompareType::NLT_US: // NGT(Swapped operand) case VectorCompareType::NLT_UQ: { Ref Result = _VFCMPLT(Size, ElementSize, Src1, Src2); return _VNot(Size, ElementSize, Result); } case VectorCompareType::NLE_US: // NGE(Swapped operand) case VectorCompareType::NLE_UQ: { Ref Result = _VFCMPLE(Size, ElementSize, Src1, Src2); return _VNot(Size, ElementSize, Result); } case VectorCompareType::ORD_Q: case VectorCompareType::ORD_S: return _VFCMPORD(Size, ElementSize, Src1, Src2); case VectorCompareType::NGT_UQ: case VectorCompareType::NGT_US: { Ref Result = _VFCMPLT(Size, ElementSize, Src2, Src1); return _VNot(Size, ElementSize, Result); } case VectorCompareType::NGE_UQ: case VectorCompareType::NGE_US: { Ref Result = _VFCMPLE(Size, ElementSize, Src2, Src1); return _VNot(Size, ElementSize, Result); } case VectorCompareType::GT_OQ: case VectorCompareType::GT_OS: return _VFCMPLT(Size, ElementSize, Src2, Src1); case VectorCompareType::GE_OQ: case VectorCompareType::GE_OS: return _VFCMPLE(Size, ElementSize, Src2, Src1); case VectorCompareType::EQ_UQ: case VectorCompareType::EQ_US: { // If either of 
the sources are unordered, then returns true. Ref Src1_U = _VFCMPEQ(Size, ElementSize, Src1, Src1); Ref Src2_U = _VFCMPEQ(Size, ElementSize, Src2, Src2); auto Ordered = _VAnd(Size, ElementSize, Src1_U, Src2_U); Ref Compare_Ordered = _VFCMPEQ(Size, ElementSize, Src1, Src2); return _VOrn(Size, ElementSize, Compare_Ordered, Ordered); } case VectorCompareType::NEQ_OQ: case VectorCompareType::NEQ_OS: { // If either of the sources are unordered, then returns false. Ref Src1_U = _VFCMPEQ(Size, ElementSize, Src1, Src1); Ref Src2_U = _VFCMPEQ(Size, ElementSize, Src2, Src2); Ref Compare_Ordered = _VFCMPEQ(Size, ElementSize, Src1, Src2); Ref Result = _VAndn(Size, ElementSize, Src1_U, Compare_Ordered); return _VAnd(Size, ElementSize, Result, Src2_U); } case VectorCompareType::FALSE_OQ: case VectorCompareType::FALSE_OS: return LoadZeroVector(Size); case VectorCompareType::TRUE_UQ: case VectorCompareType::TRUE_US: return _VectorImm(Size, OpSize::i8Bit, -1, 0); } FEX_UNREACHABLE; } template void OpDispatchBuilder::VFCMPOp(OpcodeArgs) { // No need for zero-extending in the scalar case, since // all we need is an insert at the end of the operation. const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags); const uint8_t CompType = Op->Src[1].Data.Literal.Value; Ref Result = VFCMPOpImpl(OpSizeFromSrc(Op), ElementSize, Dest, Src, CompType & 0b111); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VFCMPOp(OpcodeArgs); template void OpDispatchBuilder::VFCMPOp(OpcodeArgs); template void OpDispatchBuilder::AVXVFCMPOp(OpcodeArgs) { // No need for zero-extending in the scalar case, since // all we need is an insert at the end of the operation. 
const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); const uint8_t CompType = Op->Src[2].Literal(); Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], DstSize, Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], SrcSize, Op->Flags); Ref Result = VFCMPOpImpl(OpSizeFromSrc(Op), ElementSize, Src1, Src2, CompType & 0b11111); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::AVXVFCMPOp(OpcodeArgs); template void OpDispatchBuilder::AVXVFCMPOp(OpcodeArgs); void OpDispatchBuilder::FXSaveOp(OpcodeArgs) { Ref Mem = MakeSegmentAddress(Op, Op->Dest); SaveX87State(Op, Mem); SaveSSEState(Mem); SaveMXCSRState(Mem); } void OpDispatchBuilder::XSaveOp(OpcodeArgs) { XSaveOpImpl(Op); } Ref OpDispatchBuilder::XSaveBase(X86Tables::DecodedOp Op) { return MakeSegmentAddress(Op, Op->Dest); } void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) { // NOTE: Mask should be EAX and EDX concatenated, but we only need to test // for features that are in the lower 32 bits, so EAX only is sufficient. 
const auto OpSize = GetGPROpSize(); const auto StoreIfFlagSet = [this, OpSize](uint32_t BitIndex, auto fn, uint32_t FieldSize = 1) { Ref Mask = LoadGPRRegister(X86State::REG_RAX); Ref BitFlag = _Bfe(OpSize, FieldSize, BitIndex, Mask); auto CondJump_ = CondJump(BitFlag, CondClass::NEQ); auto StoreBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); SetTrueJumpTarget(CondJump_, StoreBlock); SetCurrentCodeBlock(StoreBlock); StartNewBlock(); { fn(); } auto Jump_ = Jump(); auto NextJumpTarget = CreateNewCodeBlockAfter(StoreBlock); SetJumpTarget(Jump_, NextJumpTarget); SetFalseJumpTarget(CondJump_, NextJumpTarget); SetCurrentCodeBlock(NextJumpTarget); StartNewBlock(); }; // x87 { StoreIfFlagSet(0, [this, Op] { SaveX87State(Op, XSaveBase(Op)); }); } // SSE { StoreIfFlagSet(1, [this, Op] { SaveSSEState(XSaveBase(Op)); }); } // AVX if (CTX->HostFeatures.SupportsAVX) { StoreIfFlagSet(2, [this, Op] { std::invoke(SaveAVXStateFunc, this, XSaveBase(Op)); }); } // We need to save MXCSR and MXCSR_MASK if either SSE or AVX are requested to be saved { StoreIfFlagSet(1, [this, Op] { SaveMXCSRState(XSaveBase(Op)); }, 2); } // Update XSTATE_BV region of the XSAVE header { Ref Base = XSaveBase(Op); // NOTE: We currently only support the first 3 bits (x87, SSE, and AVX) Ref Mask = LoadGPRRegister(X86State::REG_RAX); Ref RequestedFeatures = _Bfe(OpSize, 3, 0, Mask); // XSTATE_BV section of the header is 8 bytes in size, but we only really // care about setting at most 3 bits in the first byte. We zero out the rest. 
_StoreMemGPR(OpSize::i64Bit, RequestedFeatures, Base, Constant(512), OpSize::i8Bit, MemOffsetType::SXTX, 1); } } void OpDispatchBuilder::SaveX87State(OpcodeArgs, Ref MemBase) { _SyncStackToSlow(); // Saves 512bytes to the memory location provided // Header changes depending on if REX.W is set or not if (Op->Flags & X86Tables::DecodeFlags::FLAG_REX_WIDENING) { // BYTE | 0 1 | 2 3 | 4 | 5 | 6 7 | 8 9 | a b | c d | e f | // ------------------------------------------ // 00 | FCW | FSW | FTW | | FOP | FIP | // 16 | FDP | MXCSR | MXCSR_MASK| } else { // BYTE | 0 1 | 2 3 | 4 | 5 | 6 7 | 8 9 | a b | c d | e f | // ------------------------------------------ // 00 | FCW | FSW | FTW | | FOP | FIP[31:0] | FCS | | // 16 | FDP[31:0] | FDS | | MXCSR | MXCSR_MASK| } { auto FCW = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, FCW)); _StoreMemGPR(OpSize::i16Bit, MemBase, FCW, OpSize::i16Bit); } { _StoreMemGPR(OpSize::i16Bit, ReconstructFSW_Helper(), MemBase, Constant(2), OpSize::i16Bit, MemOffsetType::SXTX, 1); } { // Abridged FTW auto FTW = _LoadContextGPR(OpSize::i8Bit, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); _StoreMemGPR(OpSize::i8Bit, FTW, MemBase, Constant(4), OpSize::i8Bit, MemOffsetType::SXTX, 1); } // BYTE | 0 1 | 2 3 | 4 | 5 | 6 7 | 8 9 | a b | c d | e f | // ------------------------------------------ // 32 | ST0/MM0 | // 48 | ST1/MM1 | // 64 | ST2/MM2 | // 80 | ST3/MM3 | // 96 | ST4/MM4 | // 112 | ST5/MM5 | // 128 | ST6/MM6 | // 144 | ST7/MM7 | // 160 | XMM0 // 173 | XMM1 // 192 | XMM2 // 208 | XMM3 // 224 | XMM4 // 240 | XMM5 // 256 | XMM6 // 272 | XMM7 // 288 | 64BitMode ? : XMM8 // 304 | 64BitMode ? : XMM9 // 320 | 64BitMode ? : XMM10 // 336 | 64BitMode ? : XMM11 // 352 | 64BitMode ? : XMM12 // 368 | 64BitMode ? : XMM13 // 384 | 64BitMode ? : XMM14 // 400 | 64BitMode ? 
: XMM15 // 416 | // 432 | // 448 | // 464 | Available // 480 | Available // 496 | Available // FCW: x87 FPU control word // FSW: x87 FPU status word // FTW: x87 FPU Tag word (Abridged) // FOP: x87 FPU opcode. Lower 11 bits of the opcode // FIP: x87 FPU instructyion pointer offset // FCS: x87 FPU instruction pointer selector. If CPUID_0000_0007_0000_00000:EBX[bit 13] = 1 then this is deprecated and stores as 0 // FDP: x87 FPU instruction operand (data) pointer offset // FDS: x87 FPU instruction operand (data) pointer selector. Same deprecation as FCS // MXCSR: If OSFXSR bit in CR4 is not set then this may not be saved // MXCSR_MASK: Mask for writes to the MXCSR register // If OSFXSR bit in CR4 is not set than FXSAVE /may/ not save the XMM registers // This is implementation dependent // // x87 registers are stored rotated depending on the current TOP. Ref Top = GetX87Top(); auto SevenConst = Constant(7); const auto LoadSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit; for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) { Ref data = _LoadContextFPRIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit)); if (ReducedPrecisionMode) { data = _F80CVTTo(data, OpSize::i64Bit); } _StoreMemFPR(OpSize::i128Bit, data, MemBase, Constant(16 * i + 32), OpSize::i8Bit, MemOffsetType::SXTX, 1); Top = _And(OpSize::i32Bit, Add(OpSize::i32Bit, Top, 1), SevenConst); } } void OpDispatchBuilder::SaveSSEState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { _StoreMemPairFPR(OpSize::i128Bit, LoadXMMRegister(i), LoadXMMRegister(i + 1), MemBase, i * 16 + 160); } } void OpDispatchBuilder::SaveMXCSRState(Ref MemBase) { // Store MXCSR and the mask for all bits. _StoreMemPairGPR(OpSize::i32Bit, GetMXCSR(), Constant(0xFFFF), MemBase, 24); } void OpDispatchBuilder::SaveAVXState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 
16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { Ref Upper0 = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, LoadXMMRegister(i + 0), 1); Ref Upper1 = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, LoadXMMRegister(i + 1), 1); _StoreMemPairFPR(OpSize::i128Bit, Upper0, Upper1, MemBase, i * 16 + 576); } } Ref OpDispatchBuilder::GetMXCSR() { Ref MXCSR = _LoadContextGPR(OpSize::i32Bit, offsetof(FEXCore::Core::CPUState, mxcsr)); // Mask out unsupported bits // Keeps FZ, RC, exception masks, and DAZ MXCSR = _And(OpSize::i32Bit, MXCSR, Constant(0xFFC0)); return MXCSR; } void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) { Ref Mem = MakeSegmentAddress(Op, Op->Src[0]); RestoreX87State(Mem); RestoreSSEState(Mem); Ref MXCSR = _LoadMemGPR(OpSize::i32Bit, Mem, Constant(24), OpSize::i32Bit, MemOffsetType::SXTX, 1); RestoreMXCSRState(MXCSR); } void OpDispatchBuilder::XRstorOpImpl(OpcodeArgs) { const auto OpSize = GetGPROpSize(); // If a bit in our XSTATE_BV is set, then we restore from that region of the XSAVE area, // otherwise, if not set, then we need to set the relevant data the bit corresponds to // to it's defined initial configuration. const auto RestoreIfFlagSetOrDefault = [this, Op, OpSize](uint32_t BitIndex, auto restore_fn, auto default_fn, uint32_t FieldSize = 1) { // Set up base address for the XSAVE region to restore from, and also read // the XSTATE_BV bit flags out of the XSTATE header. // // Note: we rematerialize Base/Mask in each block to avoid crossblock // liveness. 
Ref Base = XSaveBase(Op); Ref Mask = _LoadMemGPR(OpSize::i64Bit, Base, Constant(512), OpSize::i64Bit, MemOffsetType::SXTX, 1); Ref BitFlag = _Bfe(OpSize, FieldSize, BitIndex, Mask); auto CondJump_ = CondJump(BitFlag, CondClass::NEQ); auto RestoreBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); SetTrueJumpTarget(CondJump_, RestoreBlock); SetCurrentCodeBlock(RestoreBlock); StartNewBlock(); { restore_fn(); } auto RestoreExitJump = Jump(); auto DefaultBlock = CreateNewCodeBlockAfter(RestoreBlock); auto ExitBlock = CreateNewCodeBlockAfter(DefaultBlock); SetJumpTarget(RestoreExitJump, ExitBlock); SetFalseJumpTarget(CondJump_, DefaultBlock); SetCurrentCodeBlock(DefaultBlock); StartNewBlock(); { default_fn(); } auto DefaultExitJump = Jump(); SetJumpTarget(DefaultExitJump, ExitBlock); SetCurrentCodeBlock(ExitBlock); StartNewBlock(); }; // x87 { RestoreIfFlagSetOrDefault(0, [this, Op] { RestoreX87State(XSaveBase(Op)); }, [this, Op] { DefaultX87State(Op); }); } // SSE { RestoreIfFlagSetOrDefault(1, [this, Op] { RestoreSSEState(XSaveBase(Op)); }, [this] { DefaultSSEState(); }); } // AVX if (CTX->HostFeatures.SupportsAVX) { RestoreIfFlagSetOrDefault( 2, [this, Op] { std::invoke(RestoreAVXStateFunc, this, XSaveBase(Op)); }, [this] { std::invoke(DefaultAVXStateFunc, this); }); } { // We need to restore the MXCSR if either SSE or AVX are requested to be saved RestoreIfFlagSetOrDefault( 1, [this, Op] { Ref Base = XSaveBase(Op); Ref MXCSR = _LoadMemGPR(OpSize::i32Bit, Base, Constant(24), OpSize::i32Bit, MemOffsetType::SXTX, 1); RestoreMXCSRState(MXCSR); }, [] { /* Intentionally do nothing*/ }, 2); } } void OpDispatchBuilder::RestoreX87State(Ref MemBase) { _StackForceSlow(); auto NewFCW = _LoadMemGPR(OpSize::i16Bit, MemBase, OpSize::i16Bit); _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW)); { auto NewFSW = _LoadMemGPR(OpSize::i16Bit, MemBase, Constant(2), OpSize::i16Bit, MemOffsetType::SXTX, 1); ReconstructX87StateFromFSW_Helper(NewFSW); } { // 
Abridged FTW auto NewFTW = _LoadMemGPR(OpSize::i8Bit, MemBase, Constant(4), OpSize::i8Bit, MemOffsetType::SXTX, 1); _StoreContextGPR(OpSize::i8Bit, NewFTW, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); } for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; i += 2) { auto MMRegs = LoadMemPairFPR(OpSize::i128Bit, MemBase, i * 16 + 32); _StoreContextFPR(OpSize::i128Bit, MMRegs.Low, MMBaseOffset() + i * 16); _StoreContextFPR(OpSize::i128Bit, MMRegs.High, MMBaseOffset() + (i + 1) * 16); } } void OpDispatchBuilder::RestoreSSEState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { auto XMMRegs = LoadMemPairFPR(OpSize::i128Bit, MemBase, i * 16 + 160); StoreXMMRegister(i, XMMRegs.Low); StoreXMMRegister(i + 1, XMMRegs.High); } } void OpDispatchBuilder::RestoreMXCSRState(Ref MXCSR) { // Mask out unsupported bits MXCSR = _And(OpSize::i32Bit, MXCSR, Constant(0xFFC0)); _StoreContextGPR(OpSize::i32Bit, MXCSR, offsetof(FEXCore::Core::CPUState, mxcsr)); // We only support the rounding mode and FTZ bit being set Ref RoundingMode = _Bfe(OpSize::i32Bit, 3, 13, MXCSR); _SetRoundingMode(RoundingMode, true, MXCSR); } void OpDispatchBuilder::RestoreAVXState(Ref MemBase) { const auto NumRegs = Is64BitMode ? 16U : 8U; for (uint32_t i = 0; i < NumRegs; i += 2) { Ref XMMReg0 = LoadXMMRegister(i + 0); Ref XMMReg1 = LoadXMMRegister(i + 1); auto YMMHRegs = LoadMemPairFPR(OpSize::i128Bit, MemBase, i * 16 + 576); StoreXMMRegister(i + 0, _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, XMMReg0, YMMHRegs.Low)); StoreXMMRegister(i + 1, _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, XMMReg1, YMMHRegs.High)); } } void OpDispatchBuilder::DefaultX87State(OpcodeArgs) { // We can piggy-back on FNINIT's implementation, since // it performs the same behavior as required by XRSTOR for resetting flags FNINIT(Op); // On top of resetting the flags to a default state, we also need to clear // all of the ST0-7/MM0-7 registers to zero. 
Ref ZeroVector = LoadZeroVector(OpSize::i64Bit); for (uint32_t i = 0; i < Core::CPUState::NUM_MMS; ++i) { _StoreContextFPR(OpSize::i128Bit, ZeroVector, MMBaseOffset() + i * 16); } } void OpDispatchBuilder::DefaultSSEState() { const auto NumRegs = Is64BitMode ? 16U : 8U; Ref ZeroVector = LoadZeroVector(OpSize::i128Bit); for (uint32_t i = 0; i < NumRegs; ++i) { StoreXMMRegister(i, ZeroVector); } } void OpDispatchBuilder::DefaultAVXState() { const auto NumRegs = Is64BitMode ? 16U : 8U; for (uint32_t i = 0; i < NumRegs; i++) { Ref Reg = LoadXMMRegister(i); Ref Dst = _VMov(OpSize::i128Bit, Reg); StoreXMMRegister(i, Dst); } } Ref OpDispatchBuilder::PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm, bool IsAVX) { // For the 256-bit case we handle it as pairs of 128-bit halves. const auto DstSize = OpSizeFromDst(Op); const auto SanitizedDstSize = std::min(DstSize, OpSize::i128Bit); const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Index = Imm.Literal(); Ref Src2Node = LoadSourceFPR(Op, Src2, Op->Flags); if (Index == 0) { if (IsAVX && !Is256Bit) { // 128-bit AVX needs to zero the upper bits. 
return _VMov(OpSize::i128Bit, Src2Node); } else { return Src2Node; } } Ref Src1Node = LoadSourceFPR(Op, Src1, Op->Flags); if (Index >= (IR::OpSizeToSize(SanitizedDstSize) * 2)) { // If the immediate is greater than both vectors combined then it zeroes the vector return LoadZeroVector(DstSize); } Ref Low = _VExtr(SanitizedDstSize, OpSize::i8Bit, Src1Node, Src2Node, Index); if (!Is256Bit) { return Low; } Ref HighSrc1 = _VInsElement(DstSize, OpSize::i128Bit, 0, 1, Src1Node, Src1Node); Ref HighSrc2 = _VInsElement(DstSize, OpSize::i128Bit, 0, 1, Src2Node, Src2Node); Ref High = _VExtr(SanitizedDstSize, OpSize::i8Bit, HighSrc1, HighSrc2, Index); return _VInsElement(DstSize, OpSize::i128Bit, 1, 0, Low, High); } void OpDispatchBuilder::PAlignrOp(OpcodeArgs) { Ref Result = PALIGNROpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1], false); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPALIGNROp(OpcodeArgs) { Ref Result = PALIGNROpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2], true); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::UCOMISxOp(OpcodeArgs) { const auto SrcSize = Op->Src[0].IsGPR() ? 
GetGuestVectorLength() : ElementSize; Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Dest, GetGuestVectorLength(), Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); Comiss(ElementSize, Src1, Src2); } template void OpDispatchBuilder::UCOMISxOp(OpcodeArgs); template void OpDispatchBuilder::UCOMISxOp(OpcodeArgs); void OpDispatchBuilder::LDMXCSR(OpcodeArgs) { Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, OpSize::i32Bit, Op->Flags); RestoreMXCSRState(Dest); } void OpDispatchBuilder::STMXCSR(OpcodeArgs) { StoreResultGPR_WithOpSize(Op, Op->Dest, GetMXCSR(), OpSize::i32Bit); } template void OpDispatchBuilder::PACKUSOp(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = _VSQXTUNPair(OpSizeFromSrc(Op), ElementSize, Dest, Src); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PACKUSOp(OpcodeArgs); template void OpDispatchBuilder::PACKUSOp(OpcodeArgs); void OpDispatchBuilder::VPACKUSOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = _VSQXTUNPair(OpSizeFromSrc(Op), ElementSize, Src1, Src2); if (Is256Bit) { // We do a little cheeky 64-bit swapping to interleave the result. 
Ref Swapped = _VInsElement(DstSize, OpSize::i64Bit, 2, 1, Result, Result); Result = _VInsElement(DstSize, OpSize::i64Bit, 1, 2, Swapped, Result); } StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PACKSSOp(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = _VSQXTNPair(OpSizeFromSrc(Op), ElementSize, Dest, Src); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PACKSSOp(OpcodeArgs); template void OpDispatchBuilder::PACKSSOp(OpcodeArgs); void OpDispatchBuilder::VPACKSSOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = _VSQXTNPair(OpSizeFromSrc(Op), ElementSize, Src1, Src2); if (Is256Bit) { // We do a little cheeky 64-bit swapping to interleave the result. Ref Swapped = _VInsElement(DstSize, OpSize::i64Bit, 2, 1, Result, Result); Result = _VInsElement(DstSize, OpSize::i64Bit, 1, 2, Swapped, Result); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PMULLOpImpl(OpSize Size, IR::OpSize ElementSize, bool Signed, Ref Src1, Ref Src2) { if (Size == OpSize::i64Bit) { if (Signed) { return _VSMull(OpSize::i128Bit, ElementSize, Src1, Src2); } else { return _VUMull(OpSize::i128Bit, ElementSize, Src1, Src2); } } else { auto InsSrc1 = _VUnZip(Size, ElementSize, Src1, Src1); auto InsSrc2 = _VUnZip(Size, ElementSize, Src2, Src2); if (Signed) { return _VSMull(Size, ElementSize, InsSrc1, InsSrc2); } else { return _VUMull(Size, ElementSize, InsSrc1, InsSrc2); } } } template void OpDispatchBuilder::PMULLOp(OpcodeArgs) { static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Res = PMULLOpImpl(OpSizeFromSrc(Op), ElementSize, Signed, Src1, Src2); 
StoreResultFPR(Op, Res); } template void OpDispatchBuilder::PMULLOp(OpcodeArgs); template void OpDispatchBuilder::PMULLOp(OpcodeArgs); template void OpDispatchBuilder::VPMULLOp(OpcodeArgs) { static_assert(ElementSize == OpSize::i32Bit, "Currently only handles 32-bit -> 64-bit"); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PMULLOpImpl(OpSizeFromSrc(Op), ElementSize, Signed, Src1, Src2); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VPMULLOp(OpcodeArgs); template void OpDispatchBuilder::VPMULLOp(OpcodeArgs); template void OpDispatchBuilder::MOVQ2DQ(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); // This instruction is a bit special in that if the source is MMX then it zexts to 128bit if constexpr (ToXMM) { const auto Index = Op->Dest.Data.GPR.GPR - FEXCore::X86State::REG_XMM_0; Src = VZeroExtendOperand(OpSize::i128Bit, Op->Src[0], Src); StoreXMMRegister(Index, Src); } else { // This is simple, just store the result StoreResultFPR(Op, Src); } } template void OpDispatchBuilder::MOVQ2DQ(OpcodeArgs); template void OpDispatchBuilder::MOVQ2DQ(OpcodeArgs); Ref OpDispatchBuilder::ADDSUBPOpImpl(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2) { if (CTX->HostFeatures.SupportsFCMA) { if (ElementSize == OpSize::i32Bit) { auto Swizzle = _VRev64(Size, OpSize::i32Bit, Src2); return _VFCADD(Size, ElementSize, Src1, Swizzle, 90); } else { auto Swizzle = _VExtr(Size, OpSize::i8Bit, Src2, Src2, 8); return _VFCADD(Size, ElementSize, Src1, Swizzle, 90); } } else { auto ConstantEOR = LoadAndCacheNamedVectorConstant(Size, ElementSize == OpSize::i32Bit ? 
NAMED_VECTOR_PADDSUBPS_INVERT : NAMED_VECTOR_PADDSUBPD_INVERT); auto InvertedSource = _VXor(Size, ElementSize, Src2, ConstantEOR); return _VFAdd(Size, ElementSize, Src1, InvertedSource); } } template void OpDispatchBuilder::ADDSUBPOp(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = ADDSUBPOpImpl(OpSizeFromSrc(Op), ElementSize, Dest, Src); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::ADDSUBPOp(OpcodeArgs); template void OpDispatchBuilder::ADDSUBPOp(OpcodeArgs); template void OpDispatchBuilder::VADDSUBPOp(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = ADDSUBPOpImpl(OpSizeFromSrc(Op), ElementSize, Src1, Src2); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VADDSUBPOp(OpcodeArgs); template void OpDispatchBuilder::VADDSUBPOp(OpcodeArgs); void OpDispatchBuilder::PFNACCOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); auto DestUnzip = _VUnZip(Size, OpSize::i32Bit, Dest, Src); auto SrcUnzip = _VUnZip2(Size, OpSize::i32Bit, Dest, Src); auto Result = _VFSub(Size, OpSize::i32Bit, DestUnzip, SrcUnzip); StoreResultFPR(Op, Result); } void OpDispatchBuilder::PFPNACCOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref ResAdd {}; Ref ResSub {}; auto UpperSubDest = _VDupElement(Size, OpSize::i32Bit, Dest, 1); ResSub = _VFSub(OpSize::i32Bit, OpSize::i32Bit, Dest, UpperSubDest); ResAdd = _VFAddP(Size, OpSize::i32Bit, Src, Src); auto Result = _VInsElement(OpSize::i64Bit, OpSize::i32Bit, 1, 0, ResSub, ResAdd); StoreResultFPR(Op, Result); } void OpDispatchBuilder::PSWAPDOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); auto 
Result = _VRev64(Size, OpSize::i32Bit, Src); StoreResultFPR(Op, Result); } void OpDispatchBuilder::PI2FWOp(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); const auto Size = OpSizeFromDst(Op); // We now need to transpose the lower 16-bits of each element together // Only needing to move the upper element down in this case Src = _VUnZip(Size, OpSize::i16Bit, Src, Src); // Now we need to sign extend the 16bit value to 32-bit Src = _VSXTL(Size, OpSize::i16Bit, Src); // int32_t to float Src = _Vector_SToF(Size, OpSize::i32Bit, Src); StoreResultFPR_WithOpSize(Op, Op->Dest, Src, Size); } void OpDispatchBuilder::PF2IWOp(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); const auto Size = OpSizeFromDst(Op); // Float to int32_t Src = _Vector_FToZS(Size, OpSize::i32Bit, Src); // We now need to transpose the lower 16-bits of each element together // Only needing to move the upper element down in this case Src = _VUnZip(Size, OpSize::i16Bit, Src, Src); // Now we need to sign extend the 16bit value to 32-bit Src = _VSXTL(Size, OpSize::i16Bit, Src); StoreResultFPR_WithOpSize(Op, Op->Dest, Src, Size); } void OpDispatchBuilder::PMULHRWOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Res {}; // Implementation is more efficient for 8byte registers // Multiplies 4 16bit values in to 4 32bit values Res = _VSMull(Size << 1, OpSize::i16Bit, Dest, Src); // Load 0x0000_8000 in to each 32-bit element. 
Ref VConstant = _VectorImm(OpSize::i128Bit, OpSize::i32Bit, 0x80, 8); Res = _VAdd(Size << 1, OpSize::i32Bit, Res, VConstant); // Now shift and narrow to convert 32-bit values to 16bit, storing the top 16bits Res = _VUShrNI(Size << 1, OpSize::i32Bit, Res, 16); StoreResultFPR(Op, Res); } template void OpDispatchBuilder::VPFCMPOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, OpSizeFromDst(Op), Op->Flags); Ref Result {}; // This maps 1:1 to an AArch64 NEON Op // auto ALUOp = _VCMPGT(Size, 4, Dest, Src); switch (CompType) { case 0x00: // EQ Result = _VFCMPEQ(Size, OpSize::i32Bit, Dest, Src); break; case 0x01: // GE(Swapped operand) Result = _VFCMPLE(Size, OpSize::i32Bit, Src, Dest); break; case 0x02: // GT Result = _VFCMPGT(Size, OpSize::i32Bit, Dest, Src); break; default: LOGMAN_MSG_A_FMT("Unknown Comparison type: {}", CompType); break; } StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VPFCMPOp<0>(OpcodeArgs); template void OpDispatchBuilder::VPFCMPOp<1>(OpcodeArgs); template void OpDispatchBuilder::VPFCMPOp<2>(OpcodeArgs); Ref OpDispatchBuilder::PMADDWDOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { // This is a pretty curious operation // Does two MADD operations across 4 16bit signed integers and accumulates to 32bit integers in the destination // // x86 PMADDWD: xmm1, xmm2 // xmm1[31:0] = (xmm1[15:0] * xmm2[15:0]) + (xmm1[31:16] * xmm2[31:16]) // xmm1[63:32] = (xmm1[47:32] * xmm2[47:32]) + (xmm1[63:48] * xmm2[63:48]) // etc.. 
for larger registers if (Size == OpSize::i64Bit) { // MMX implementation can be slightly more optimal Size = Size >> 1; auto MullResult = _VSMull(Size, OpSize::i16Bit, Src1, Src2); return _VAddP(Size, OpSize::i32Bit, MullResult, MullResult); } auto Lower = _VSMull(Size, OpSize::i16Bit, Src1, Src2); auto Upper = _VSMull2(Size, OpSize::i16Bit, Src1, Src2); // [15:0 ] + [31:16], [32:47 ] + [63:48 ], [79:64] + [95:80], [111:96] + [127:112] return _VAddP(Size, OpSize::i32Bit, Lower, Upper); } void OpDispatchBuilder::PMADDWD(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PMADDWDOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPMADDWDOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PMADDWDOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PMADDUBSWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { if (Size == OpSize::i64Bit) { const auto MultSize = Size << 1; // 64bit is more efficient // Src1 is unsigned auto Src1_16b = _VUXTL(MultSize, OpSize::i8Bit, Src1); // [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] // Src2 is signed auto Src2_16b = _VSXTL(MultSize, OpSize::i8Bit, Src2); // [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] auto ResMul_L = _VSMull(MultSize, OpSize::i16Bit, Src1_16b, Src2_16b); auto ResMul_H = _VSMull2(MultSize, OpSize::i16Bit, Src1_16b, Src2_16b); // Now add pairwise across the vector auto ResAdd = _VAddP(MultSize, OpSize::i32Bit, ResMul_L, ResMul_H); // Add saturate back down to 16bit return _VSQXTN(MultSize, OpSize::i32Bit, ResAdd); } // V{U,S}XTL{,2}/ and VUnZip{,2} can be optimized in this solution to save about one instruction. // We can up-front zero extend and sign extend the elements in-place. 
// This means extracting even and odd elements up-front so the unzips aren't required. // Requires implementing IR ops for BIC (vector, immediate) although. // Src1 is unsigned auto Src1_16b_L = _VUXTL(Size, OpSize::i8Bit, Src1); // [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] auto Src2_16b_L = _VSXTL(Size, OpSize::i8Bit, Src2); // [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] auto ResMul_L = _VMul(Size, OpSize::i16Bit, Src1_16b_L, Src2_16b_L); // Src2 is signed auto Src1_16b_H = _VUXTL2(Size, OpSize::i8Bit, Src1); // Offset to +64bits [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] auto Src2_16b_H = _VSXTL2(Size, OpSize::i8Bit, Src2); // Offset to +64bits [7:0 ], [15:8], [23:16], [31:24], [39:32], [47:40], [55:48], [63:56] auto ResMul_L_H = _VMul(Size, OpSize::i16Bit, Src1_16b_H, Src2_16b_H); auto TmpZip1 = _VUnZip(Size, OpSize::i16Bit, ResMul_L, ResMul_L_H); auto TmpZip2 = _VUnZip2(Size, OpSize::i16Bit, ResMul_L, ResMul_L_H); return _VSQAdd(Size, OpSize::i16Bit, TmpZip1, TmpZip2); } void OpDispatchBuilder::PMADDUBSW(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPMADDUBSWOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PMADDUBSWOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PMULHWOpImpl(OpcodeArgs, bool Signed, Ref Src1, Ref Src2) { const auto Size = OpSizeFromSrc(Op); if (Signed) { return _VSMulH(Size, OpSize::i16Bit, Src1, Src2); } else { return _VUMulH(Size, OpSize::i16Bit, Src1, Src2); } } template void OpDispatchBuilder::PMULHW(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], 
Op->Flags); Ref Result = PMULHWOpImpl(Op, Signed, Dest, Src); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PMULHW(OpcodeArgs); template void OpDispatchBuilder::PMULHW(OpcodeArgs); template void OpDispatchBuilder::VPMULHWOp(OpcodeArgs) { const auto DstSize = GetDstSize(Op); const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; Ref Dest = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PMULHWOpImpl(Op, Signed, Dest, Src); if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VPMULHWOp(OpcodeArgs); template void OpDispatchBuilder::VPMULHWOp(OpcodeArgs); Ref OpDispatchBuilder::PMULHRSWOpImpl(OpSize Size, Ref Src1, Ref Src2) { Ref Res {}; if (Size == OpSize::i64Bit) { // Implementation is more efficient for 8byte registers Res = _VSMull(Size << 1, OpSize::i16Bit, Src1, Src2); Res = _VSShrI(Size << 1, OpSize::i32Bit, Res, 14); auto OneVector = _VectorImm(Size << 1, OpSize::i32Bit, 1); Res = _VAdd(Size << 1, OpSize::i32Bit, Res, OneVector); return _VUShrNI(Size << 1, OpSize::i32Bit, Res, 1); } else { // 128-bit and 256-bit are less efficient Ref ResultLow; Ref ResultHigh; ResultLow = _VSMull(Size, OpSize::i16Bit, Src1, Src2); ResultHigh = _VSMull2(Size, OpSize::i16Bit, Src1, Src2); ResultLow = _VSShrI(Size, OpSize::i32Bit, ResultLow, 14); ResultHigh = _VSShrI(Size, OpSize::i32Bit, ResultHigh, 14); auto OneVector = _VectorImm(Size, OpSize::i32Bit, 1); ResultLow = _VAdd(Size, OpSize::i32Bit, ResultLow, OneVector); ResultHigh = _VAdd(Size, OpSize::i32Bit, ResultHigh, OneVector); // Combine the results Res = _VUShrNI(Size, OpSize::i32Bit, ResultLow, 1); return _VUShrNI2(Size, OpSize::i32Bit, Res, ResultHigh, 1); } } void OpDispatchBuilder::PMULHRSW(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PMULHRSWOpImpl(OpSizeFromSrc(Op), Dest, Src); 
StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPMULHRSWOp(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PMULHRSWOpImpl(OpSizeFromSrc(Op), Dest, Src); StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::HSUBPOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2) { auto Even = _VUnZip(SrcSize, ElementSize, Src1, Src2); auto Odd = _VUnZip2(SrcSize, ElementSize, Src1, Src2); return _VFSub(SrcSize, ElementSize, Even, Odd); } template void OpDispatchBuilder::HSUBP(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = HSUBPOpImpl(OpSizeFromSrc(Op), ElementSize, Src1, Src2); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::HSUBP(OpcodeArgs); template void OpDispatchBuilder::HSUBP(OpcodeArgs); void OpDispatchBuilder::VHSUBPOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = HSUBPOpImpl(OpSizeFromSrc(Op), ElementSize, Src1, Src2); Ref Dest = Result; if (Is256Bit) { Dest = _VInsElement(DstSize, OpSize::i64Bit, 1, 2, Result, Result); Dest = _VInsElement(DstSize, OpSize::i64Bit, 2, 1, Dest, Result); } StoreResultFPR(Op, Dest); } Ref OpDispatchBuilder::PHSUBOpImpl(OpSize Size, Ref Src1, Ref Src2, IR::OpSize ElementSize) { auto Even = _VUnZip(Size, ElementSize, Src1, Src2); auto Odd = _VUnZip2(Size, ElementSize, Src1, Src2); return _VSub(Size, ElementSize, Even, Odd); } template void OpDispatchBuilder::PHSUB(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PHSUBOpImpl(OpSizeFromSrc(Op), Src1, Src2, ElementSize); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::PHSUB(OpcodeArgs); template void 
OpDispatchBuilder::PHSUB(OpcodeArgs); void OpDispatchBuilder::VPHSUBOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PHSUBOpImpl(OpSizeFromSrc(Op), Src1, Src2, ElementSize); if (Is256Bit) { Ref Inserted = _VInsElement(DstSize, OpSize::i64Bit, 1, 2, Result, Result); Result = _VInsElement(DstSize, OpSize::i64Bit, 2, 1, Inserted, Result); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::PHADDSOpImpl(OpSize Size, Ref Src1, Ref Src2) { const auto ElementSize = OpSize::i16Bit; auto Even = _VUnZip(Size, ElementSize, Src1, Src2); auto Odd = _VUnZip2(Size, ElementSize, Src1, Src2); // Saturate back down to the result return _VSQAdd(Size, ElementSize, Even, Odd); } void OpDispatchBuilder::PHADDS(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PHADDSOpImpl(OpSizeFromSrc(Op), Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPHADDSWOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto Is256Bit = SrcSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PHADDSOpImpl(OpSizeFromSrc(Op), Src1, Src2); Ref Dest = Result; if (Is256Bit) { Dest = _VInsElement(SrcSize, OpSize::i64Bit, 1, 2, Result, Result); Dest = _VInsElement(SrcSize, OpSize::i64Bit, 2, 1, Dest, Result); } StoreResultFPR(Op, Dest); } Ref OpDispatchBuilder::PHSUBSOpImpl(OpSize Size, Ref Src1, Ref Src2) { const auto ElementSize = OpSize::i16Bit; auto Even = _VUnZip(Size, ElementSize, Src1, Src2); auto Odd = _VUnZip2(Size, ElementSize, Src1, Src2); // Saturate back down to the result return _VSQSub(Size, ElementSize, Even, Odd); } void OpDispatchBuilder::PHSUBS(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Dest, 
Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PHSUBSOpImpl(OpSizeFromSrc(Op), Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPHSUBSWOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PHSUBSOpImpl(OpSizeFromSrc(Op), Src1, Src2); Ref Dest = Result; if (Is256Bit) { Dest = _VInsElement(DstSize, OpSize::i64Bit, 1, 2, Result, Result); Dest = _VInsElement(DstSize, OpSize::i64Bit, 2, 1, Dest, Result); } StoreResultFPR(Op, Dest); } Ref OpDispatchBuilder::PSADBWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2) { // The documentation is actually incorrect in how this instruction operates // It strongly implies that the `abs(dest[i] - src[i])` operates in 8bit space // but it actually operates in more than 8bit space // This can be seen with `abs(0 - 0xFF)` returning a different result depending // on bit length const auto Is128Bit = Size == OpSize::i128Bit; if (Size == OpSize::i64Bit) { auto AbsResult = _VUABDL(Size << 1, OpSize::i8Bit, Src1, Src2); // Now vector-wide add the results for each return _VAddV(Size << 1, OpSize::i16Bit, AbsResult); } auto AbsResult_Low = _VUABDL(Size, OpSize::i8Bit, Src1, Src2); auto AbsResult_High = _VUABDL2(Size, OpSize::i8Bit, Src1, Src2); Ref Result_Low = _VAddV(OpSize::i128Bit, OpSize::i16Bit, AbsResult_Low); Ref Result_High = _VAddV(OpSize::i128Bit, OpSize::i16Bit, AbsResult_High); auto Low = _VZip(Size, OpSize::i64Bit, Result_Low, Result_High); if (Is128Bit) { return Low; } Ref HighSrc1 = _VDupElement(Size, OpSize::i128Bit, AbsResult_Low, 1); Ref HighSrc2 = _VDupElement(Size, OpSize::i128Bit, AbsResult_High, 1); Ref HighResult_Low = _VAddV(OpSize::i128Bit, OpSize::i16Bit, HighSrc1); Ref HighResult_High = _VAddV(OpSize::i128Bit, OpSize::i16Bit, HighSrc2); Ref High = _VInsElement(Size, OpSize::i64Bit, 1, 0, HighResult_Low, 
HighResult_High); Ref Full = _VInsElement(Size, OpSize::i128Bit, 1, 0, Low, High); Ref Tmp = _VInsElement(Size, OpSize::i64Bit, 2, 1, Full, Full); return _VInsElement(Size, OpSize::i64Bit, 1, 2, Tmp, Full); } void OpDispatchBuilder::PSADBW(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = PSADBWOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPSADBWOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = PSADBWOpImpl(Size, Src1, Src2); StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed) { const auto DstSize = OpSizeFromDst(Op); const auto GetSrc = [&] { if (Op->Src[0].IsGPR()) { return LoadSourceFPR_WithOpSize(Op, Op->Src[0], DstSize, Op->Flags); } else { // For memory operands the 256-bit variant loads twice the size specified in the table. const auto Is256Bit = DstSize == OpSize::i256Bit; const auto SrcSize = OpSizeFromSrc(Op); const auto LoadSize = Is256Bit ? 
IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) * 2) : SrcSize; return LoadSourceFPR_WithOpSize(Op, Op->Src[0], LoadSize, Op->Flags); } }; Ref Src = GetSrc(); Ref Result {Src}; for (auto CurrentElementSize = ElementSize; CurrentElementSize != DstElementSize; CurrentElementSize = CurrentElementSize << 1) { if (Signed) { Result = _VSXTL(DstSize, CurrentElementSize, Result); } else { Result = _VUXTL(DstSize, CurrentElementSize, Result); } } return Result; } template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs) { Ref Result = ExtendVectorElementsImpl(Op, ElementSize, DstElementSize, Signed); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); template void OpDispatchBuilder::ExtendVectorElements(OpcodeArgs); Ref OpDispatchBuilder::VectorRoundImpl(OpSize Size, IR::OpSize ElementSize, Ref Src, uint64_t Mode) { return _Vector_FToI(Size, ElementSize, Src, TranslateRoundType(Mode)); } template void OpDispatchBuilder::VectorRound(OpcodeArgs) { // No need to zero extend the vector in the event we have a // scalar source, especially since it's only inserted into another vector. 
const auto SrcSize = OpSizeFromSrc(Op);
  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);

  // The rounding mode comes from the literal operand.
  const uint64_t Mode = Op->Src[1].Literal();
  Src = VectorRoundImpl(OpSizeFromDst(Op), ElementSize, Src, Mode);

  StoreResultFPR(Op, Src);
}

template void OpDispatchBuilder::VectorRound(OpcodeArgs);
template void OpDispatchBuilder::VectorRound(OpcodeArgs);

// Rounds the elements of Src[0] using the mode held in the literal Src[1] and stores the result.
template void OpDispatchBuilder::AVXVectorRound(OpcodeArgs) {
  const auto Mode = Op->Src[1].Literal();

  // No need to zero extend the vector in the event we have a
  // scalar source, especially since it's only inserted into another vector.
  const auto SrcSize = OpSizeFromSrc(Op);
  Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);

  Ref Result = VectorRoundImpl(OpSizeFromDst(Op), ElementSize, Src, Mode);

  StoreResultFPR(Op, Result);
}

template void OpDispatchBuilder::AVXVectorRound(OpcodeArgs);
template void OpDispatchBuilder::AVXVectorRound(OpcodeArgs);

// Blends Src1 and Src2 element-wise under an immediate selector.
//   Selector bit N set   -> element N of the result comes from Src2.
//   Selector bit N clear -> element N of the result comes from Src1.
// 32-bit elements use the low 4 selector bits, 64-bit elements the low 2 bits, and any
// other element size (the 16-bit PBLENDW-style path) all 8 bits.
// Each selector value is mapped to the cheapest op sequence available; the remaining
// ones fall back to a table lookup (_VTBX1) driven by a named vector constant.
Ref OpDispatchBuilder::VectorBlend(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t Selector) {
  if (ElementSize == OpSize::i32Bit) {
    Selector &= 0b1111;
    switch (Selector) {
    case 0b0000:
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src1[127:96]
      // Copy
      return Src1;
    case 0b0001:
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src1[127:96]
      return _VInsElement(Size, ElementSize, 0, 0, Src1, Src2);
    case 0b0010:
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src1[127:96]
      return _VInsElement(Size, ElementSize, 1, 1, Src1, Src2);
    case 0b0011:
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src1[127:96]
      return _VInsElement(Size, OpSize::i64Bit, 0, 0, Src1, Src2);
    case 0b0100:
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src1[127:96]
      return _VInsElement(Size, ElementSize, 2, 2, Src1, Src2);
    case 0b0101: {
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src1[127:96]
      // Rotate the elements of the incoming source so they end up in the correct location.
      // Then trn2 keeps the destination results in the expected location.
      auto Temp = _VRev64(Size, OpSize::i32Bit, Src2);
      return _VTrn2(Size, ElementSize, Temp, Src1);
    }
    case 0b0110: {
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src1[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_0110B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b0111: {
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src1[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_0111B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b1000:
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src2[127:96]
      return _VInsElement(Size, ElementSize, 3, 3, Src1, Src2);
    case 0b1001: {
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src2[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_1001B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b1010: {
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src2[127:96]
      // Rotate the elements of the incoming destination so they end up in the correct location.
      // Then trn2 keeps the source results in the expected location.
      auto Temp = _VRev64(Size, OpSize::i32Bit, Src1);
      return _VTrn2(Size, ElementSize, Temp, Src2);
    }
    case 0b1011: {
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src1[95:64]
      // Dest[127:96] = Src2[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_1011B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b1100:
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src2[127:96]
      return _VInsElement(Size, OpSize::i64Bit, 1, 1, Src1, Src2);
    case 0b1101: {
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src1[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src2[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_1101B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b1110: {
      // Dest[31:0]   = Src1[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src2[127:96]
      auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_BLENDPS_1110B);
      return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
    }
    case 0b1111:
      // Dest[31:0]   = Src2[31:0]
      // Dest[63:32]  = Src2[63:32]
      // Dest[95:64]  = Src2[95:64]
      // Dest[127:96] = Src2[127:96]
      // Copy
      return Src2;
    default: break;
    }
  } else if (ElementSize == OpSize::i64Bit) {
    Selector &= 0b11;
    switch (Selector) {
    case 0b00:
      // No-op
      return Src1;
    case 0b01:
      // Dest[63:0]   = Src2[63:0]
      // Dest[127:64] = Src1[127:64]
      return _VInsElement(Size, ElementSize, 0, 0, Src1, Src2);
    case 0b10:
      // Dest[63:0]   = Src1[63:0]
      // Dest[127:64] = Src2[127:64]
      return _VInsElement(Size, ElementSize, 1, 1, Src1, Src2);
    case 0b11:
      // Copy
      return Src2;
    }
  } else {
    ///< Zero instruction copies
    switch (Selector) {
    case 0b0000'0000: return Src1;
    case 0b1111'1111: return Src2;
    default: break;
    }

    ///< Single instruction implementation
    switch (Selector) {
    case 0b0000'0001:
    case 0b0000'0010:
    case 0b0000'0100:
    case 0b0000'1000:
    case 0b0001'0000:
    case 0b0010'0000:
    case 0b0100'0000:
    case 0b1000'0000: {
      // Single 16-bit element insert.
      // Exactly one selector bit is set; its position is the element index.
      const auto Element = FEXCore::ilog2(Selector);
      return _VInsElement(Size, ElementSize, Element, Element, Src1, Src2);
    }
    case 0b1111'1110:
    case 0b1111'1101:
    case 0b1111'1011:
    case 0b1111'0111:
    case 0b1110'1111:
    case 0b1101'1111:
    case 0b1011'1111:
    case 0b0111'1111: {
      // Single 16-bit element insert, inverted
      // Exactly one selector bit is clear: start from Src2 and insert that one element from Src1.
      uint8_t SelectorInvert = ~Selector;
      const auto Element = FEXCore::ilog2(SelectorInvert);
      return _VInsElement(Size, ElementSize, Element, Element, Src2, Src1);
    }
    case 0b0000'0011:
    case 0b0000'1100:
    case 0b0011'0000:
    case 0b1100'0000: {
      // Single 32-bit element insert.
      // An aligned pair of selector bits covers one 32-bit element.
      const auto Element = std::countr_zero(Selector) / 2;
      return _VInsElement(Size, OpSize::i32Bit, Element, Element, Src1, Src2);
    }
    case 0b1111'1100:
    case 0b1111'0011:
    case 0b1100'1111:
    case 0b0011'1111: {
      // Single 32-bit element insert, inverted
      uint8_t SelectorInvert = ~Selector;
      const auto Element = std::countr_zero(SelectorInvert) / 2;
      return _VInsElement(Size, OpSize::i32Bit, Element, Element, Src2, Src1);
    }
    case 0b0000'1111:
    case 0b1111'0000: {
      // Single 64-bit element insert.
      // An aligned group of four selector bits covers one 64-bit element.
      const auto Element = std::countr_zero(Selector) / 4;
      return _VInsElement(Size, OpSize::i64Bit, Element, Element, Src1, Src2);
    }
    default: break;
    }

    ///< Two instruction implementation
    switch (Selector) {
    ///< Fancy double VExtr
    case 0b0'0'0'0'0'1'1'1: {
      auto Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src2, Src1, 6);
      return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 10);
    }
    case 0b0'0'0'1'1'1'1'1: {
      auto Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src2, Src1, 10);
      return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 6);
    }
    case 0b1'1'1'0'0'0'0'0: {
      auto Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src2, 10);
      return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 6);
    }
    case 0b1'1'1'1'1'0'0'0: {
      auto Tmp = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src2, 6);
      return _VExtr(OpSize::i128Bit, OpSize::i8Bit, Tmp, Tmp, 10);
    }
    default: break;
    }

    // TODO: There are some of these swizzles that can be more optimal.
    // NamedConstant + VTBX1 is quite quick already.
    // Implement more if it becomes relevant.
    // The selector indexes the PBLENDW constant table in steps of 16.
    auto ConstantSwizzle = LoadAndCacheIndexedNamedVectorConstant(Size, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PBLENDW, Selector * 16);
    return _VTBX1(Size, Src1, Src2, ConstantSwizzle);
  }

  FEX_UNREACHABLE;
}

// Immediate blend, two-operand form: the destination register is the first blend input,
// Src[0] the second, and the literal Src[1] the selector. Always blends at 128-bit.
template void OpDispatchBuilder::VectorBlend(OpcodeArgs) {
  uint8_t Select = Op->Src[1].Literal();

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  Dest = VectorBlend(OpSize::i128Bit, ElementSize, Dest, Src, Select);
  StoreResultFPR(Op, Dest);
}

template void OpDispatchBuilder::VectorBlend(OpcodeArgs);
template void OpDispatchBuilder::VectorBlend(OpcodeArgs);
template void OpDispatchBuilder::VectorBlend(OpcodeArgs);

// Variable blend whose per-element mask is taken from XMM register 0.
void OpDispatchBuilder::VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) {
  const auto Size = OpSizeFromSrc(Op);

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  auto Mask = LoadXMMRegister(0);

  // Each element is selected by the high bit of that element size
  // Dest[ElementIdx] = Xmm0[ElementIndex][HighBit] ? Src : Dest;
  //
  // To emulate this on AArch64
  // Arithmetic shift right by the element size, then use BSL to select the registers
  Mask = _VSShrI(Size, ElementSize, Mask, IR::OpSizeAsBits(ElementSize) - 1);

  auto Result = _VBSL(Size, Mask, Src, Dest);

  StoreResultFPR(Op, Result);
}

// Variable blend, three-operand form: the mask register index is encoded in the literal Src[2].
void OpDispatchBuilder::AVXVectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize) {
  const auto SrcSize = OpSizeFromSrc(Op);
  const auto ElementSizeBits = IR::OpSizeAsBits(ElementSize);

  Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
  Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags);

  // Mask register is encoded within bits [7:4] of the selector
  const auto Src3Selector = Op->Src[2].Literal();
  Ref Mask = LoadXMMRegister((Src3Selector >> 4) & 0b1111);

  // Same technique as VectorVariableBlend: arithmetic shift right by (element bits - 1), then BSL.
  Ref Shifted = _VSShrI(SrcSize, ElementSize, Mask, ElementSizeBits - 1);
  Ref Result = _VBSL(SrcSize, Shifted, Src2, Src1);

  StoreResultFPR(Op, Result);
}

// Computes the {V,}PTEST flags: Z is derived from (Dest AND Src) and C from the
// and-not of the two operands; nothing is written to a vector register here.
void OpDispatchBuilder::PTestOpImpl(OpSize Size, Ref Dest, Ref Src) {
  Ref Test1 = _VAnd(Size, OpSize::i8Bit, Dest, Src);
  Ref Test2 = _VAndn(Size, OpSize::i8Bit, Src, Dest);

  // Element size must be less than 32-bit for the sign bit tricks.
  Test1 = _VUMaxV(Size, OpSize::i16Bit, Test1);
  Test2 = _VUMaxV(Size, OpSize::i16Bit, Test2);

  Test1 = _VExtractToGPR(Size, OpSize::i16Bit, Test1, 0);
  Test2 = _VExtractToGPR(Size, OpSize::i16Bit, Test2, 0);

  Test2 = To01(OpSize::i64Bit, Test2);

  // Careful, these flags are different between {V,}PTEST and VTESTP{S,D}
  // Set ZF according to Test1. SF will be zeroed since we do a 32-bit test on
  // the results of a 16-bit value from the UMaxV, so the 32-bit sign bit is
  // cleared even if the 16-bit scalars were negative.
SetNZ_ZeroCV(OpSize::i32Bit, Test1); SetCFInverted(Test2); ZeroPF_AF(); } void OpDispatchBuilder::PTestOp(OpcodeArgs) { Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); PTestOpImpl(OpSizeFromSrc(Op), Dest, Src); } void OpDispatchBuilder::VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2) { LOGMAN_THROW_A_FMT(ElementSize >= IR::OpSize::i8Bit && ElementSize <= IR::OpSize::i64Bit, "Invalid size"); const auto ElementSizeInBits = IR::OpSizeAsBits(ElementSize); const auto MaskConstant = uint64_t {1} << (ElementSizeInBits - 1); Ref Mask = _VDupFromGPR(SrcSize, ElementSize, Constant(MaskConstant)); Ref AndTest = _VAnd(SrcSize, OpSize::i8Bit, Src2, Src1); Ref AndNotTest = _VAndn(SrcSize, OpSize::i8Bit, Src2, Src1); Ref MaskedAnd = _VAnd(SrcSize, OpSize::i8Bit, AndTest, Mask); Ref MaskedAndNot = _VAnd(SrcSize, OpSize::i8Bit, AndNotTest, Mask); Ref MaxAnd = _VUMaxV(SrcSize, OpSize::i16Bit, MaskedAnd); Ref MaxAndNot = _VUMaxV(SrcSize, OpSize::i16Bit, MaskedAndNot); Ref AndGPR = _VExtractToGPR(SrcSize, OpSize::i16Bit, MaxAnd, 0); Ref AndNotGPR = _VExtractToGPR(SrcSize, OpSize::i16Bit, MaxAndNot, 0); Ref CFInv = To01(OpSize::i64Bit, AndNotGPR); // As in PTest, this sets Z appropriately while zeroing the rest of NZCV. 
SetNZ_ZeroCV(OpSize::i32Bit, AndGPR); SetCFInverted(CFInv); ZeroPF_AF(); } template void OpDispatchBuilder::VTESTPOp(OpcodeArgs) { Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); VTESTOpImpl(OpSizeFromSrc(Op), ElementSize, Src1, Src2); } template void OpDispatchBuilder::VTESTPOp(OpcodeArgs); template void OpDispatchBuilder::VTESTPOp(OpcodeArgs); Ref OpDispatchBuilder::PHMINPOSUWOpImpl(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); // Setup a vector swizzle // Initially load a 64-bit mask of immediates // Then zero-extend that to 128-bit mask with the immediates in the lower 16-bits of each element auto ConstantSwizzle = LoadAndCacheNamedVectorConstant(Size, FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_INCREMENTAL_U16_INDEX); // We now need to zip the vector sources together to become two uint32x4_t vectors // Upper: // [127:96]: ([127:112] << 16) | (7) // [95:64] : ([111:96] << 16) | (6) // [63:32] : ([95:80] << 16) | (5) // [31:0] : ([79:64] << 16) | (4) // Lower: // [127:96]: ([63:48] << 16) | (3) // [95:64] : ([47:32] << 16) | (2) // [63:32] : ([31:16] << 16) | (1) // [31:0] : ([15:0] << 16) | (0) auto ZipLower = _VZip(Size, OpSize::i16Bit, ConstantSwizzle, Src); auto ZipUpper = _VZip2(Size, OpSize::i16Bit, ConstantSwizzle, Src); // The elements are now 32-bit between two vectors. 
auto MinBetween = _VUMin(Size, OpSize::i32Bit, ZipLower, ZipUpper);

  // Now do a horizontal vector minimum
  auto Min = _VUMinV(Size, OpSize::i32Bit, MinBetween);

  // We now have a value in the bottom 32-bits in the order of:
  // [31:0]: (Src[MinIndex] << 16) | MinIndex
  // This instruction wants it in the form of:
  // [31:0]: (MinIndex << 16) | Src[MinIndex]
  // Rev32 does this for us
  return _VRev32(Size, OpSize::i16Bit, Min);
}

// Computes PHMINPOSUW through PHMINPOSUWOpImpl and stores the result.
void OpDispatchBuilder::PHMINPOSUWOp(OpcodeArgs) {
  Ref Result = PHMINPOSUWOpImpl(Op);
  StoreResultFPR(Op, Result);
}

// Shared dot-product implementation (DPPS when ElementSize is 32-bit, DPPD otherwise).
//   DstSize: size of the result vector.
//   Src1, Src2: the two input vectors.
//   Mask: immediate; bits [7:4] select which element products take part in the sum,
//         bits [3:0] select which result elements receive the sum (the others are zero).
//         Only the low 2 bits of each nibble are used for 64-bit elements.
// Returns the zero vector when either nibble selects nothing.
Ref OpDispatchBuilder::DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t Mask, IR::OpSize ElementSize) {
  const auto SizeMask = [ElementSize]() {
    if (ElementSize == OpSize::i32Bit) {
      return 0b1111;
    }
    return 0b11;
  }();

  const uint8_t SrcMask = (Mask >> 4) & SizeMask;
  const uint8_t DstMask = Mask & SizeMask;

  const auto NamedIndexMask = [ElementSize]() {
    if (ElementSize == OpSize::i32Bit) {
      return FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPS_MASK;
    }

    return FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPD_MASK;
  }();

  Ref ZeroVec = LoadZeroVector(DstSize);
  if (SrcMask == 0 || DstMask == 0) {
    // What are you even doing here? Go away.
    return ZeroVec;
  }

  // First step is to do an FMUL
  Ref Temp = _VFMul(DstSize, ElementSize, Src1, Src2);

  // Now mask results based on IndexMask.
  // Skipped when every element is selected; otherwise the source mask indexes the
  // constant table in steps of 16.
  if (SrcMask != SizeMask) {
    auto InputMask = LoadAndCacheIndexedNamedVectorConstant(DstSize, NamedIndexMask, SrcMask * 16);
    Temp = _VAnd(DstSize, ElementSize, Temp, InputMask);
  }

  // Now do a float reduction
  Temp = _VFAddV(DstSize, ElementSize, Temp);

  // Now using the destination mask we choose where the result ends up
  // It can duplicate and zero results
  if (ElementSize == OpSize::i64Bit) {
    switch (DstMask) {
    case 0b01:
      // Dest[63:0]   = Result
      // Dest[127:64] = Zero
      return _VZip(DstSize, ElementSize, Temp, ZeroVec);
    case 0b10:
      // Dest[63:0]   = Zero
      // Dest[127:64] = Result
      return _VZip(DstSize, ElementSize, ZeroVec, Temp);
    case 0b11:
      // Broadcast
      // Dest[63:0]   = Result
      // Dest[127:64] = Result
      return _VDupElement(DstSize, ElementSize, Temp, 0);
    case 0:
    default: LOGMAN_MSG_A_FMT("Unsupported");
    }
  } else {
    // Generic fallback: insert the result element by element into a zero vector.
    auto BadPath = [&]() {
      Ref Result = ZeroVec;

      for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
        const auto Bit = 1U << (i % 4);

        if ((DstMask & Bit) != 0) {
          Result = _VInsElement(DstSize, ElementSize, i, 0, Result, Temp);
        }
      }

      return Result;
    };
    switch (DstMask) {
    case 0b0001:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Zero
      // Dest[127:96] = Zero
      return _VZip(DstSize, ElementSize, Temp, ZeroVec);
    case 0b0010:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Result
      // Dest[95:64]  = Zero
      // Dest[127:96] = Zero
      return _VZip(DstSize >> 1, ElementSize, ZeroVec, Temp);
    case 0b0011:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Result
      // Dest[95:64]  = Zero
      // Dest[127:96] = Zero
      return _VDupElement(DstSize >> 1, ElementSize, Temp, 0);
    case 0b0100:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Result
      // Dest[127:96] = Zero
      return _VZip(DstSize, OpSize::i64Bit, ZeroVec, Temp);
    case 0b0101:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Result
      // Dest[127:96] = Zero
      return _VZip(DstSize, OpSize::i64Bit, Temp, Temp);
    case 0b0110:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Result
      // Dest[95:64]  = Result
      // Dest[127:96] = Zero
      return BadPath();
    case 0b0111:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Result
      // Dest[95:64]  = Result
      // Dest[127:96] = Zero
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VInsElement(DstSize, ElementSize, 3, 0, Temp, ZeroVec);
    case 0b1000:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Zero
      // Dest[127:96] = Result
      return _VExtr(DstSize, OpSize::i8Bit, Temp, ZeroVec, 4);
    case 0b1001:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Zero
      // Dest[127:96] = Result
      return BadPath();
    case 0b1010:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Result
      // Dest[95:64]  = Zero
      // Dest[127:96] = Result
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VZip(DstSize, OpSize::i32Bit, ZeroVec, Temp);
    case 0b1011:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Result
      // Dest[95:64]  = Zero
      // Dest[127:96] = Result
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VInsElement(DstSize, ElementSize, 2, 0, Temp, ZeroVec);
    case 0b1100:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Result
      // Dest[127:96] = Result
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VZip(DstSize, OpSize::i64Bit, ZeroVec, Temp);
    case 0b1101:
      // Dest[31:0]   = Result
      // Dest[63:32]  = Zero
      // Dest[95:64]  = Result
      // Dest[127:96] = Result
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VInsElement(DstSize, ElementSize, 1, 0, Temp, ZeroVec);
    case 0b1110:
      // Dest[31:0]   = Zero
      // Dest[63:32]  = Result
      // Dest[95:64]  = Result
      // Dest[127:96] = Result
      Temp = _VDupElement(DstSize, ElementSize, Temp, 0);
      return _VInsElement(DstSize, ElementSize, 0, 0, Temp, ZeroVec);
    case 0b1111:
      // Broadcast
      // Dest[31:0]   = Result
      // Dest[63:32]  = Result
      // Dest[95:64]  = Result
      // Dest[127:96] = Result
      return _VDupElement(DstSize, ElementSize, Temp, 0);
    case 0:
    default: LOGMAN_MSG_A_FMT("Unsupported");
    }
  }
  FEX_UNREACHABLE;
}

// Two-operand dot product: the destination register is also the first operand and
// the literal Src[1] is the mask.
template void OpDispatchBuilder::DPPOp(OpcodeArgs) {
  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  Ref Result = DPPOpImpl(OpSizeFromDst(Op), Dest, Src, Op->Src[1].Literal(), ElementSize);
  StoreResultFPR(Op, Result);
}

template void OpDispatchBuilder::DPPOp(OpcodeArgs);
template void OpDispatchBuilder::DPPOp(OpcodeArgs);

// 32-bit float dot product built from element inserts and pairwise adds; VDPPOp uses
// it for the 256-bit case. The mask bit tested for element i is bit (i % 4), so the
// same 4-bit source and destination masks apply to every group of four elements.
Ref OpDispatchBuilder::VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2,
                                   const X86Tables::DecodedOperand& Imm) {
  constexpr auto ElementSize = OpSize::i32Bit;
  const uint8_t Mask = Imm.Literal();
  const uint8_t SrcMask = Mask >> 4;
  const uint8_t DstMask = Mask & 0xF;

  const auto DstSize = OpSizeFromDst(Op);

  Ref Src1V = LoadSourceFPR(Op, Src1, Op->Flags);
  Ref Src2V = LoadSourceFPR(Op, Src2, Op->Flags);

  Ref ZeroVec = LoadZeroVector(DstSize);

  // First step is to do an FMUL
  Ref Temp = _VFMul(DstSize, ElementSize, Src1V, Src2V);

  // Now we zero out elements based on src mask
  for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
    const auto Bit = 1U << (i % 4);

    if ((SrcMask & Bit) == 0) {
      Temp = _VInsElement(DstSize, ElementSize, i, 0, Temp, ZeroVec);
    }
  }

  // Now we need to do a horizontal add of the elements
  // We only have pairwise float add so this needs to be done in steps
  Temp = _VFAddP(DstSize, ElementSize, Temp, ZeroVec);

  // ElementSize is a compile-time constant of 32-bit here, so this branch is always taken.
  if (ElementSize == OpSize::i32Bit) {
    // For 32-bit float we need one more step to add all four results together
    Temp = _VFAddP(DstSize, ElementSize, Temp, ZeroVec);
  }

  // Now using the destination mask we choose where the result ends up
  // It can duplicate and zero results
  Ref Result = ZeroVec;

  for (size_t i = 0; i < IR::NumElements(DstSize, ElementSize); ++i) {
    const auto Bit = 1U << (i % 4);

    if ((DstMask & Bit) != 0) {
      Result = _VInsElement(DstSize, ElementSize, i, 0, Result, Temp);
    }
  }

  return Result;
}

// Three-operand dot product: dispatches between the 256-bit DPPS implementation and
// the shared 128-bit DPPOpImpl.
template void OpDispatchBuilder::VDPPOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);

  Ref Result {};
  if (ElementSize == OpSize::i32Bit && DstSize == OpSize::i256Bit) {
    // 256-bit DPPS isn't handled by the 128-bit solution.
Result = VDPPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]); } else { Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Result = DPPOpImpl(DstSize, Src1, Src2, Op->Src[2].Literal(), ElementSize); } // We don't need to emit a _VMov to clear the upper lane, since DPPOpImpl uses a zero vector // to construct the results, so the upper lane will always be cleared for the 128-bit version. StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VDPPOp(OpcodeArgs); template void OpDispatchBuilder::VDPPOp(OpcodeArgs); Ref OpDispatchBuilder::MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select) { const auto LaneHelper = [&, this](uint32_t Selector_Src1, uint32_t Selector_Src2, Ref Src1, Ref Src2) { // Src2 will grab a 32bit element and duplicate it across the 128bits Ref DupSrc = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, Src2, Selector_Src2); // Src1/Dest needs a bunch of magic // Shift right by selected bytes // This will give us Dest[15:0], and Dest[79:64] Ref Dest1 = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src1, Selector_Src1 + 0); // This will give us Dest[31:16], and Dest[95:80] Ref Dest2 = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src1, Selector_Src1 + 1); // This will give us Dest[47:32], and Dest[111:96] Ref Dest3 = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src1, Selector_Src1 + 2); // This will give us Dest[63:48], and Dest[127:112] Ref Dest4 = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src1, Src1, Selector_Src1 + 3); // For each shifted section, we now have two 32-bit values per vector that can be used // Dest1.S[0] and Dest1.S[1] = Bytes - 0,1,2,3:4,5,6,7 // Dest2.S[0] and Dest2.S[1] = Bytes - 1,2,3,4:5,6,7,8 // Dest3.S[0] and Dest3.S[1] = Bytes - 2,3,4,5:6,7,8,9 // Dest4.S[0] and Dest4.S[1] = Bytes - 3,4,5,6:7,8,9,10 Dest1 = _VUABDL(OpSize::i128Bit, OpSize::i8Bit, Dest1, DupSrc); Dest2 = _VUABDL(OpSize::i128Bit, OpSize::i8Bit, Dest2, DupSrc); Dest3 = 
_VUABDL(OpSize::i128Bit, OpSize::i8Bit, Dest3, DupSrc); Dest4 = _VUABDL(OpSize::i128Bit, OpSize::i8Bit, Dest4, DupSrc); // Dest[1,2,3,4] Now contains the data prior to combining // Temp[0,1,2,3] for each step // Each destination now has 16bit x 8 elements in it that were the absolute difference for each byte // Needs each to be 16bit to store the next step // Next stage is to sum pairwise // Dest1: // ADDP Dest3, Dest1: TmpCombine1 // ADDP Dest4, Dest2: TmpCombine2 // TmpCombine1.8H[0] = Dest1.8H[0] + Dest1.8H[1]; // TmpCombine1.8H[1] = Dest1.8H[2] + Dest1.8H[3]; // TmpCombine1.8H[2] = Dest1.8H[4] + Dest1.8H[5]; // TmpCombine1.8H[3] = Dest1.8H[6] + Dest1.8H[7]; // TmpCombine1.8H[4] = Dest3.8H[0] + Dest3.8H[1]; // TmpCombine1.8H[5] = Dest3.8H[2] + Dest3.8H[3]; // TmpCombine1.8H[6] = Dest3.8H[4] + Dest3.8H[5]; // TmpCombine1.8H[7] = Dest3.8H[6] + Dest3.8H[7]; // auto TmpCombine1 = _VAddP(OpSize::i128Bit, OpSize::i16Bit, Dest1, Dest3); auto TmpCombine2 = _VAddP(OpSize::i128Bit, OpSize::i16Bit, Dest2, Dest4); // TmpTranspose1: // VTrn TmpCombine1, TmpCombine2: TmpTranspose1 // Transposes Even and odd elements so we can use vaddp for final results. 
auto TmpTranspose1 = _VTrn(OpSize::i128Bit, OpSize::i32Bit, TmpCombine1, TmpCombine2); auto TmpTranspose2 = _VTrn2(OpSize::i128Bit, OpSize::i32Bit, TmpCombine1, TmpCombine2); // ADDP TmpTranspose1, TmpTranspose2: FinalCombine // FinalCombine.8H[0] = TmpTranspose1.8H[0] + TmpTranspose1.8H[1] // FinalCombine.8H[1] = TmpTranspose1.8H[2] + TmpTranspose1.8H[3] // FinalCombine.8H[2] = TmpTranspose1.8H[4] + TmpTranspose1.8H[5] // FinalCombine.8H[3] = TmpTranspose1.8H[6] + TmpTranspose1.8H[7] // FinalCombine.8H[4] = TmpTranspose2.8H[0] + TmpTranspose2.8H[1] // FinalCombine.8H[5] = TmpTranspose2.8H[2] + TmpTranspose2.8H[3] // FinalCombine.8H[6] = TmpTranspose2.8H[4] + TmpTranspose2.8H[5] // FinalCombine.8H[7] = TmpTranspose2.8H[6] + TmpTranspose2.8H[7] return _VAddP(OpSize::i128Bit, OpSize::i16Bit, TmpTranspose1, TmpTranspose2); }; const auto Is128Bit = SrcSize == OpSize::i128Bit; // Src1 needs to be in byte offset const uint8_t Select_Src1_Low = ((Select & 0b100) >> 2) * 32 / 8; const uint8_t Select_Src2_Low = Select & 0b11; Ref Lower = LaneHelper(Select_Src1_Low, Select_Src2_Low, Src1, Src2); if (Is128Bit) { return Lower; } const uint8_t Select_Src1_High = ((Select & 0b100000) >> 5) * 32 / 8; const uint8_t Select_Src2_High = (Select & 0b11000) >> 3; Ref UpperSrc1 = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Src1, 1); Ref UpperSrc2 = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Src2, 1); Ref Upper = LaneHelper(Select_Src1_High, Select_Src2_High, UpperSrc1, UpperSrc2); return _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Lower, Upper); } void OpDispatchBuilder::MPSADBWOp(OpcodeArgs) { const uint8_t Select = Op->Src[1].Literal(); const auto SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VMPSADBWOp(OpcodeArgs) { const uint8_t Select = Op->Src[2].Literal(); const auto 
SrcSize = OpSizeFromSrc(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = MPSADBWOpImpl(SrcSize, Src1, Src2, Select); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VINSERTOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], OpSize::i128Bit, Op->Flags); const auto Selector = Op->Src[2].Literal() & 1; Ref Result = _VInsElement(DstSize, OpSize::i128Bit, Selector, 0, Src1, Src2); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VCVTPH2PSOp(OpcodeArgs) { // In the event that a memory operand is used as the source operand, // the access width will always be half the size of the destination vector width // (i.e. 128-bit vector -> 64-bit mem, 256-bit vector -> 128-bit mem) const auto DstSize = OpSizeFromDst(Op); const auto SrcLoadSize = Op->Src[0].IsGPR() ? DstSize : IR::SizeToOpSize(IR::OpSizeToSize(DstSize) / 2); Ref Src = LoadSourceFPR_WithOpSize(Op, Op->Src[0], SrcLoadSize, Op->Flags); Ref Result = _Vector_FToF(DstSize, OpSize::i32Bit, Src, OpSize::i16Bit); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VCVTPS2PHOp(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto StoreSize = Op->Dest.IsGPR() ? OpSize::i128Bit : IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) / 2); const auto Imm8 = Op->Src[1].Literal(); const auto UseMXCSR = (Imm8 & 0b100) != 0; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = nullptr; if (UseMXCSR) { Result = _Vector_FToF(SrcSize, OpSize::i16Bit, Src, OpSize::i32Bit); } else { // No ARM float conversion instructions allow passing in // a rounding mode as an immediate. All of them depend on // the RM field in the FPCR. And so! We have to do some ugly // rounding mode shuffling. 
const auto NewRMode = Imm8 & 0b11; Ref SavedFPCR = _PushRoundingMode(NewRMode); Result = _Vector_FToF(SrcSize, OpSize::i16Bit, Src, OpSize::i32Bit); _PopRoundingMode(SavedFPCR); } // We need to eliminate upper junk if we're storing into a register with // a 256-bit source (VCVTPS2PH's destination for registers is an XMM). if (Op->Src[0].IsGPR() && SrcSize == OpSize::i256Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR_WithOpSize(Op, Op->Dest, Result, StoreSize); } void OpDispatchBuilder::VPERM2Op(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); const auto Selector = Op->Src[2].Literal(); Ref Result = LoadZeroVector(DstSize); const auto SelectElement = [&](uint64_t Index, uint64_t SelectorIdx) { switch (SelectorIdx) { case 0: case 1: return _VInsElement(DstSize, OpSize::i128Bit, Index, SelectorIdx, Result, Src1); case 2: case 3: default: return _VInsElement(DstSize, OpSize::i128Bit, Index, SelectorIdx - 2, Result, Src2); } }; if ((Selector & 0b00001000) == 0) { Result = SelectElement(0, Selector & 0b11); } if ((Selector & 0b10000000) == 0) { Result = SelectElement(1, (Selector >> 4) & 0b11); } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::VPERMDIndices(OpSize DstSize, Ref Indices, Ref IndexMask, Ref Repeating3210) { // Get rid of any junk unrelated to the relevant selector index bits (bits [2:0]) Ref SanitizedIndices = _VAnd(DstSize, OpSize::i8Bit, Indices, IndexMask); // Build up the broadcasted index mask. e.g. On x86-64, the selector index // is always in the lower 3 bits of a 32-bit element. However, in order to // build up a vector we can use with the ARMv8 TBL instruction, we need the // selector index for each particular element to be within each byte of the // 32-bit element. // // We can do this by TRN-ing the selector index vector twice. Once using byte elements // then once more using half-word elements. 
// // The first pass creates the half-word elements, and then the second pass uses those // halfword elements to place the indices in the top part of the 32-bit element. // // e.g. Consider a selector vector with indices in 32-bit elements like: // // ╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗╔═══════════╗ // ║ 4 ║║ 1 ║║ 2 ║║ 6 ║║ 7 ║║ 0 ║║ 3 ║║ 5 ║ // ╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝╚═══════════╝ // // TRNing once using byte elements by itself will create a vector with 8-bit elements like: // ╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗ // ║ 0 ║║ 0 ║║ 4 ║║ 4 ║║ 0 ║║ 0 ║║ 1 ║║ 1 ║║ 0 ║║ 0 ║║ 2 ║║ 2 ║║ 0 ║║ 0 ║║ 6 ║║ 6 ║║ 0 ║║ 0 ║║ 7 ║║ 7 ║║ 0 ║║ 0 ║║ 0 ║║ 0 ║║ 0 ║║ 0 ║║ 3 ║║ 3 ║║ 0 ║║ 0 ║║ 5 ║║ 5 ║ // ╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝ // // TRNing once using half-word elements by itself will then transform the vector into: // ╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗╔═══╗ // ║ 4 ║║ 4 ║║ 4 ║║ 4 ║║ 1 ║║ 1 ║║ 1 ║║ 1 ║║ 2 ║║ 2 ║║ 2 ║║ 2 ║║ 6 ║║ 6 ║║ 6 ║║ 6 ║║ 7 ║║ 7 ║║ 7 ║║ 7 ║║ 0 ║║ 0 ║║ 0 ║║ 0 ║║ 3 ║║ 3 ║║ 3 ║║ 3 ║║ 5 ║║ 5 ║║ 5 ║║ 5 ║ // ╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝╚═══╝ // // Cool! We now have everything we need to take this further. 
Ref IndexTrn1 = _VTrn(DstSize, OpSize::i8Bit, SanitizedIndices, SanitizedIndices); Ref IndexTrn2 = _VTrn(DstSize, OpSize::i16Bit, IndexTrn1, IndexTrn1); // Now that we have the indices set up, now we need to multiply each // element by 4 to convert the elements into byte indices rather than // 32-bit word indices. // // e.g. We turn our vector into: // ╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗ // ║ 16 ║║ 16 ║║ 16 ║║ 16 ║║ 4 ║║ 4 ║║ 4 ║║ 4 ║║ 8 ║║ 8 ║║ 8 ║║ 8 ║║ 24 ║║ 24 ║║ 24 ║║ 24 ║║ 28 ║║ 28 ║║ 28 ║║ 28 ║║ 0 ║║ 0 ║║ 00 ║║ 0 ║║ 12 ║║ 12 ║║ 12 ║║ 12 ║║ 20 ║║ 20 ║║ 20 ║║ 20 ║ // ╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝ // Ref ShiftedIndices = _VShlI(DstSize, OpSize::i8Bit, IndexTrn2, 2); // Now we need to add a byte vector containing [3, 2, 1, 0] repeating for the // entire length of it, to the index register, so that we specify the bytes // that make up the entire word in the source register. // // e.g. Our vector finally looks like so: // // ╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗╔════╗ // ║ 19 ║║ 18 ║║ 17 ║║ 16 ║║ 7 ║║ 6 ║║ 5 ║║ 4 ║║ 11 ║║ 10 ║║ 9 ║║ 8 ║║ 27 ║║ 26 ║║ 25 ║║ 24 ║║ 31 ║║ 30 ║║ 29 ║║ 28 ║║ 3 ║║ 2 ║║ 01 ║║ 0 ║║ 15 ║║ 14 ║║ 13 ║║ 12 ║║ 23 ║║ 22 ║║ 21 ║║ 20 ║ // ╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝╚════╝ // // Which finally lets us permute the source vector and be done with everything. 
return _VAdd(DstSize, OpSize::i8Bit, ShiftedIndices, Repeating3210); } void OpDispatchBuilder::VPERMDOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Ref Indices = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[1], Op->Flags); // Get rid of any junk unrelated to the relevant selector index bits (bits [2:0]) Ref IndexMask = _VectorImm(DstSize, OpSize::i32Bit, 0b111); Ref AddConst = Constant(0x03020100); Ref Repeating3210 = _VDupFromGPR(DstSize, OpSize::i32Bit, AddConst); Ref FinalIndices = VPERMDIndices(OpSizeFromDst(Op), Indices, IndexMask, Repeating3210); // Now lets finally shuffle this bad boy around. Ref Result = _VTBL1(DstSize, Src, FinalIndices); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPERMQOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); const auto Selector = Op->Src[1].Literal(); Ref Result {}; // If we're just broadcasting one element in particular across the vector // then this can be done fairly simply without any individual inserts. 
if (Selector == 0x00 || Selector == 0x55 || Selector == 0xAA || Selector == 0xFF) { const auto Index = Selector & 0b11; Result = _VDupElement(DstSize, OpSize::i64Bit, Src, Index); } else { Result = LoadZeroVector(DstSize); for (size_t i = 0; i < IR::NumElements(DstSize, IR::OpSize::i64Bit); i++) { const auto SrcIndex = (Selector >> (i * 2)) & 0b11; Result = _VInsElement(DstSize, OpSize::i64Bit, i, SrcIndex, Result, Src); } } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::VBLENDOpImpl(IR::OpSize VecSize, IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector) { const std::array Sources {Src1, Src2}; Ref Result = ZeroRegister; const auto NumElements = IR::NumElements(VecSize, ElementSize); for (int i = 0; i < NumElements; i++) { const auto SelectorIndex = (Selector >> i) & 1; Result = _VInsElement(VecSize, ElementSize, i, i, Result, Sources[SelectorIndex]); } return Result; } void OpDispatchBuilder::VBLENDPDOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); if (Selector == 0) { Ref Result = Is256Bit ? Src1 : _VMov(OpSize::i128Bit, Src1); StoreResultFPR(Op, Result); return; } // Only the first four bits of the 8-bit immediate are used, so only check them. if (((Selector & 0b11) == 0b11 && !Is256Bit) || (Selector & 0b1111) == 0b1111) { Ref Result = Is256Bit ? 
Src2 : _VMov(OpSize::i128Bit, Src2); StoreResultFPR(Op, Result); return; } const auto ZeroRegister = LoadZeroVector(DstSize); Ref Result = VBLENDOpImpl(DstSize, OpSize::i64Bit, Src1, Src2, ZeroRegister, Selector); StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPBLENDDOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); // Each bit in the selector chooses between Src1 and Src2. // If a bit is set, then we select it's corresponding 32-bit element from Src2 // If a bit is not set, then we select it's corresponding 32-bit element from Src1 // Cases where we can exit out early, since the selector is indicating a copy // of an entire input vector. Unlikely to occur, since it's slower than // just an equivalent vector move instruction. but just in case something // silly is happening, we have your back. if (Selector == 0) { Ref Result = Is256Bit ? Src1 : _VMov(OpSize::i128Bit, Src1); StoreResultFPR(Op, Result); return; } if (Selector == 0xFF && Is256Bit) { StoreResultFPR(Op, Src2); return; } // The only bits we care about from the 8-bit immediate for 128-bit operations // are the first four bits. 
We do a bitwise check here to catch cases where // silliness is going on and the upper bits are being set even when they'll // be ignored if ((Selector & 0xF) == 0xF && !Is256Bit) { StoreResultFPR(Op, _VMov(OpSize::i128Bit, Src2)); return; } const auto ZeroRegister = LoadZeroVector(DstSize); Ref Result = VBLENDOpImpl(DstSize, OpSize::i32Bit, Src1, Src2, ZeroRegister, Selector); if (!Is256Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VPBLENDWOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto Is128Bit = DstSize == OpSize::i128Bit; const auto Selector = Op->Src[2].Literal(); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); if (Selector == 0) { Ref Result = Is128Bit ? _VMov(OpSize::i128Bit, Src1) : Src1; StoreResultFPR(Op, Result); return; } if (Selector == 0xFF) { Ref Result = Is128Bit ? _VMov(OpSize::i128Bit, Src2) : Src2; StoreResultFPR(Op, Result); return; } // 256-bit VPBLENDW acts as if the 8-bit selector values were also applied // to the upper bits, so we can just replicate the bits by forming a 16-bit // imm for the helper function to use. const auto NewSelector = Selector << 8 | Selector; const auto ZeroRegister = LoadZeroVector(DstSize); Ref Result = VBLENDOpImpl(DstSize, OpSize::i16Bit, Src1, Src2, ZeroRegister, NewSelector); if (Is128Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VZEROOp(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); const auto IsVZEROALL = DstSize == OpSize::i256Bit; const auto NumRegs = Is64BitMode ? 16U : 8U; if (IsVZEROALL) { // NOTE: Despite the name being VZEROALL, this will still only ever // zero out up to the first 16 registers (even on AVX-512, where we have 32 registers) for (uint32_t i = 0; i < NumRegs; i++) { // Explicitly not caching named vector zero. This ensures that every register gets movi #0.0 directly. 
Ref ZeroVector = LoadUncachedZeroVector(DstSize); StoreXMMRegister(i, ZeroVector); } } else { // Likewise, VZEROUPPER will only ever zero only up to the first 16 registers for (uint32_t i = 0; i < NumRegs; i++) { Ref Reg = LoadXMMRegister(i); Ref Dst = _VMov(OpSize::i128Bit, Reg); StoreXMMRegister(i, Dst); } } } void OpDispatchBuilder::VPERMILImmOp(OpcodeArgs, IR::OpSize ElementSize) { const auto DstSize = OpSizeFromDst(Op); const auto Is256Bit = DstSize == OpSize::i256Bit; const auto Selector = Op->Src[1].Literal() & 0xFF; Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Result = LoadZeroVector(DstSize); if (ElementSize == OpSize::i64Bit) { Result = _VInsElement(DstSize, ElementSize, 0, Selector & 0b0001, Result, Src); Result = _VInsElement(DstSize, ElementSize, 1, (Selector & 0b0010) >> 1, Result, Src); if (Is256Bit) { Result = _VInsElement(DstSize, ElementSize, 2, ((Selector & 0b0100) >> 2) + 2, Result, Src); Result = _VInsElement(DstSize, ElementSize, 3, ((Selector & 0b1000) >> 3) + 2, Result, Src); } } else { Result = _VInsElement(DstSize, ElementSize, 0, Selector & 0b00000011, Result, Src); Result = _VInsElement(DstSize, ElementSize, 1, (Selector & 0b00001100) >> 2, Result, Src); Result = _VInsElement(DstSize, ElementSize, 2, (Selector & 0b00110000) >> 4, Result, Src); Result = _VInsElement(DstSize, ElementSize, 3, (Selector & 0b11000000) >> 6, Result, Src); if (Is256Bit) { Result = _VInsElement(DstSize, ElementSize, 4, (Selector & 0b00000011) + 4, Result, Src); Result = _VInsElement(DstSize, ElementSize, 5, ((Selector & 0b00001100) >> 2) + 4, Result, Src); Result = _VInsElement(DstSize, ElementSize, 6, ((Selector & 0b00110000) >> 4) + 4, Result, Src); Result = _VInsElement(DstSize, ElementSize, 7, ((Selector & 0b11000000) >> 6) + 4, Result, Src); } } StoreResultFPR(Op, Result); } Ref OpDispatchBuilder::VPERMILRegOpImpl(OpSize DstSize, IR::OpSize ElementSize, Ref Src, Ref Indices) { // NOTE: See implementation of VPERMD for the gist of what we do to 
make this work. // // The only difference here is that we need to add 16 to the upper lane // before doing the final addition to build up the indices for TBL. const auto Is256Bit = DstSize == OpSize::i256Bit; auto IsPD = ElementSize == OpSize::i64Bit; if (IsPD) { // VPERMILPD stores the selector in the second bit, rather than the // first bit of each element in the index vector. So move it over by one. Indices = _VUShrI(DstSize, ElementSize, Indices, 1); } // Sanitize indices first const auto ShiftAmount = 0b11 >> static_cast(IsPD); Ref IndexMask = _VectorImm(DstSize, ElementSize, ShiftAmount); Ref SanitizedIndices = _VAnd(DstSize, OpSize::i8Bit, Indices, IndexMask); Ref IndexTrn1 = _VTrn(DstSize, OpSize::i8Bit, SanitizedIndices, SanitizedIndices); Ref IndexTrn2 = _VTrn(DstSize, OpSize::i16Bit, IndexTrn1, IndexTrn1); Ref IndexTrn3 = IndexTrn2; if (IsPD) { IndexTrn3 = _VTrn(DstSize, OpSize::i32Bit, IndexTrn2, IndexTrn2); } auto IndexShift = IsPD ? 3 : 2; Ref ShiftedIndices = _VShlI(DstSize, OpSize::i8Bit, IndexTrn3, IndexShift); uint64_t VConstant = IsPD ? 
0x0706050403020100 : 0x03020100; Ref VectorConst = _VDupFromGPR(DstSize, ElementSize, Constant(VConstant)); Ref FinalIndices {}; if (Is256Bit) { const auto ZeroRegister = LoadZeroVector(DstSize); Ref Vector16 = _VInsElement(DstSize, OpSize::i128Bit, 1, 0, ZeroRegister, _VectorImm(DstSize, OpSize::i8Bit, 16)); Ref IndexOffsets = _VAdd(DstSize, OpSize::i8Bit, VectorConst, Vector16); FinalIndices = _VAdd(DstSize, OpSize::i8Bit, IndexOffsets, ShiftedIndices); } else { FinalIndices = _VAdd(DstSize, OpSize::i8Bit, VectorConst, ShiftedIndices); } return _VTBL1(DstSize, Src, FinalIndices); } template void OpDispatchBuilder::VPERMILRegOp(OpcodeArgs) { Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Indices = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Result = VPERMILRegOpImpl(OpSizeFromDst(Op), ElementSize, Src, Indices); StoreResultFPR(Op, Result); } template void OpDispatchBuilder::VPERMILRegOp(OpcodeArgs); template void OpDispatchBuilder::VPERMILRegOp(OpcodeArgs); void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask) { const uint16_t Control = Op->Src[1].Literal(); // NOTE: Unlike most other SSE/AVX instructions, the SSE4.2 string and text // instructions do *not* require memory operands to be aligned on a 16 byte // boundary (see "Other Exceptions" descriptions for the relevant // instructions in the Intel Software Development Manual). // // So, we specify Src2 as having an alignment of 1 to indicate this. Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Dest, OpSize::i128Bit, Op->Flags); Ref Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], OpSize::i128Bit, Op->Flags, {.Align = OpSize::i8Bit}); Ref IntermediateResult {}; if (IsExplicit) { // Will be 4 in the absence of a REX.W bit and 8 in the presence of a REX.W bit. 
// // While the control bit immediate for the instruction itself is only ever 8 bits // in size, we use it as a 16-bit value so that we can use the 8th bit to signify // whether or not RAX and RDX should be interpreted as a 64-bit value. const auto SrcSize = OpSizeFromSrc(Op); const auto Is64Bit = SrcSize == OpSize::i64Bit; const auto NewControl = uint16_t(Control | (uint16_t(Is64Bit) << 8)); Ref SrcRAX = LoadGPRRegister(X86State::REG_RAX); Ref SrcRDX = LoadGPRRegister(X86State::REG_RDX); IntermediateResult = _VPCMPESTRX(Src1, Src2, SrcRAX, SrcRDX, NewControl); } else { IntermediateResult = _VPCMPISTRX(Src1, Src2, Control); } Ref ZeroConst = Constant(0); if (IsMask) { // For the masked variant of the instructions, if control[6] is set, then we // need to expand the intermediate result into a byte or word mask (depending // on data size specified in control[1]) along the entire length of XMM0, // where set bits in the intermediate result set the corresponding entry // in XMM0 to all 1s and unset bits set the corresponding entry to all 0s. // // If control[6] is not set, then we just store the intermediate result as-is // into the least significant bits of XMM0 and zero extend it. const auto IsExpandedMask = (Control & 0b0100'0000) != 0; if (IsExpandedMask) { // We need to iterate over the intermediate result and // expand the mask into XMM0 elements. const auto ElementSize = 1U << (Control & 1); const auto NumElements = 16U >> (Control & 1); Ref Result = LoadZeroVector(OpSize::i128Bit); for (uint32_t i = 0; i < NumElements; i++) { Ref SignBit = _Sbfe(OpSize::i64Bit, 1, i, IntermediateResult); Result = _VInsGPR(OpSize::i128Bit, IR::SizeToOpSize(ElementSize), i, Result, SignBit); } StoreXMMRegister(0, Result); } else { // We insert the intermediate result as-is. 
StoreXMMRegister(0, _VCastFromGPR(OpSize::i128Bit, OpSize::i16Bit, IntermediateResult)); } } else { // For the indexed variant of the instructions, if control[6] is set, then we // store the index of the most significant bit into ECX. If it's not set, // then we store the least significant bit. const auto UseMSBIndex = (Control & 0b0100'0000) != 0; Ref ResultNoFlags = _Bfe(OpSize::i32Bit, 16, 0, IntermediateResult); Ref IfZero = Constant(16 >> (Control & 1)); Ref IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags); Ref Result = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero); // Store the result, it is already zero-extended to 64-bit implicitly. StoreGPRRegister(X86State::REG_RCX, Result); } // Set all of the necessary flags. NZCV stored in bits 28...31 like the hw op. SetNZCV(IntermediateResult); CFInverted = false; ZeroPF_AF(); } void OpDispatchBuilder::VPCMPESTRIOp(OpcodeArgs) { PCMPXSTRXOpImpl(Op, true, false); } void OpDispatchBuilder::VPCMPESTRMOp(OpcodeArgs) { PCMPXSTRXOpImpl(Op, true, true); } void OpDispatchBuilder::VPCMPISTRIOp(OpcodeArgs) { PCMPXSTRXOpImpl(Op, false, false); } void OpDispatchBuilder::VPCMPISTRMOp(OpcodeArgs) { PCMPXSTRXOpImpl(Op, false, true); } void OpDispatchBuilder::VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) { const auto Size = OpSizeFromDst(Op); const auto Is256Bit = Size == OpSize::i256Bit; const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? 
OpSize::i64Bit : OpSize::i32Bit; Ref Dest = LoadSourceFPR_WithOpSize(Op, Op->Dest, Size, Op->Flags); Ref Src1 = LoadSourceFPR_WithOpSize(Op, Op->Src[0], Size, Op->Flags); Ref Src2 {}; if (Op->Src[1].IsGPR()) { Src2 = LoadSourceFPR_WithOpSize(Op, Op->Src[1], Size, Op->Flags); } else { Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); } Ref Sources[3] = { Dest, Src1, Src2, }; DeriveOp(FMAResult, IROp, _VFMLA(Size, ElementSize, Sources[Src1Idx - 1], Sources[Src2Idx - 1], Sources[AddendIdx - 1])); Ref Result = FMAResult; if (Scalar) { // Special case, scalar inserts in to the low bits of the destination. Result = _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Dest, Result); } if (!Is256Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } void OpDispatchBuilder::VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) { const auto Size = OpSizeFromDst(Op); const auto Is256Bit = Size == OpSize::i256Bit; const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit; Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src1 = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Ref Src2 = LoadSourceFPR(Op, Op->Src[1], Op->Flags); Ref Sources[3] = { Dest, Src1, Src2, }; Ref ConstantEOR {}; if (AddSub) { ConstantEOR = LoadAndCacheNamedVectorConstant(Size, ElementSize == OpSize::i32Bit ? NAMED_VECTOR_PADDSUBPS_INVERT : NAMED_VECTOR_PADDSUBPD_INVERT); } else { ConstantEOR = LoadAndCacheNamedVectorConstant(Size, ElementSize == OpSize::i32Bit ? 
NAMED_VECTOR_PSUBADDPS_INVERT : NAMED_VECTOR_PSUBADDPD_INVERT); } auto InvertedSourc = _VXor(Size, ElementSize, Sources[AddendIdx - 1], ConstantEOR); Ref Result = _VFMLA(Size, ElementSize, Sources[Src1Idx - 1], Sources[Src2Idx - 1], InvertedSourc); if (!Is256Bit) { Result = _VMov(OpSize::i128Bit, Result); } StoreResultFPR(Op, Result); } OpDispatchBuilder::RefVSIB OpDispatchBuilder::LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags) { const bool IsVSIB = (Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0; LOGMAN_THROW_A_FMT((Operand.IsSIB() || Operand.IsSIBRelocation()) && IsVSIB, "Trying to load VSIB for something that isn't the correct " "type!"); // VSIB is a very special case which has a ton of encoded data. // Get it in a format we can reason about. const auto Index_gpr = Operand.Data.SIB.Index; const auto Base_gpr = Operand.Data.SIB.Base; LOGMAN_THROW_A_FMT(Index_gpr >= FEXCore::X86State::REG_XMM_0 && Index_gpr <= FEXCore::X86State::REG_XMM_15, "must be AVX reg"); LOGMAN_THROW_A_FMT(Base_gpr == FEXCore::X86State::REG_INVALID || (Base_gpr >= FEXCore::X86State::REG_RAX && Base_gpr <= FEXCore::X86State::REG_R15), "Base must be a GPR."); const auto Index_XMM_gpr = Index_gpr - X86State::REG_XMM_0; OpDispatchBuilder::RefVSIB A { .Low = LoadXMMRegister(Index_XMM_gpr), .BaseAddr = Base_gpr != FEXCore::X86State::REG_INVALID ? 
LoadGPRRegister(Base_gpr, OpSize::i64Bit, 0, false) : nullptr,
    .Scale = Operand.Data.SIB.Scale,
  };

  // A relocated SIB folds the entrypoint-relative offset into the base
  // address; otherwise the offset is kept as a plain displacement.
  if (Operand.IsSIBRelocation()) {
    auto EPOffset = _EntrypointOffset(OpSize::i64Bit, Operand.Data.SIB.Offset);
    if (A.BaseAddr) {
      A.BaseAddr = Add(OpSize::i64Bit, EPOffset, A.BaseAddr);
    } else {
      A.BaseAddr = EPOffset;
    }
  } else {
    A.Displacement = static_cast(Operand.Data.SIB.Offset);
  }

  return A;
}

// Masked vector gather.
//
// Operands as used below: Op->Dest is both the merge source and the result,
// Op->Src[0] is the VSIB memory operand, Op->Src[1] is the mask register
// (which is cleared after the gather).
template void OpDispatchBuilder::VPGATHER(OpcodeArgs) {
  LOGMAN_THROW_A_FMT(AddrElementSize == OpSize::i32Bit || AddrElementSize == OpSize::i64Bit, "Unknown address element size");

  const auto Size = OpSizeFromDst(Op);
  const auto Is128Bit = Size == OpSize::i128Bit;
  const auto GPRSize = GetGPROpSize();
  // With the address-size flag set, addresses are half the GPR size.
  auto AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? (GPRSize >> 1) : GPRSize;

  ///< Element size is determined by W flag.
  const OpSize ElementLoadSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;

  // We only need the high address register if the number of data elements is more than what the low half can consume.
  // But also the number of address elements is clamped by the destination size as well.
  const size_t NumDataElements = IR::NumElements(Size, ElementLoadSize);
  const size_t NumAddrElementBytes = std::min(IR::OpSizeToSize(Size), (NumDataElements * IR::OpSizeToSize(AddrElementSize)));
  const bool Needs128BitHighAddrBytes = NumAddrElementBytes > IR::OpSizeToSize(OpSize::i128Bit);

  auto VSIB = LoadVSIB(Op, Op->Src[0], Op->Flags);

  // The single-op gather is only used when the scale is 1 or equal to the
  // address element size, and address and data elements are the same size.
  const bool SupportsSVELoad = (VSIB.Scale == 1 || VSIB.Scale == IR::OpSizeToSize(AddrElementSize)) && (AddrElementSize == ElementLoadSize);

  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Mask = LoadSourceFPR(Op, Op->Src[1], Op->Flags);

  Ref Result {};
  if (!SupportsSVELoad) {
    // We need to go down the fallback path in the case that we don't hit the backend's SVE mode.
    // The fallback helper works on 128-bit halves, so split every operand
    // into a low and a high half first.
    RefPair Dest128 {
      .Low = Dest,
      .High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Dest, 1),
    };

    RefPair Mask128 {
      .Low = Mask,
      .High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, Mask, 1),
    };

    RefVSIB VSIB128 = VSIB;
    VSIB128.High = Invalid();

    if (Needs128BitHighAddrBytes) {
      if (Is128Bit) {
        ///< A bit careful for the VSIB index register duplicating.
        VSIB128.High = VSIB128.Low;
      } else {
        VSIB128.High = _VDupElement(OpSize::i256Bit, OpSize::i128Bit, VSIB128.Low, 1);
      }
    }

    auto Result128 = AVX128_VPGatherImpl(Op, Size, ElementLoadSize, AddrElementSize, Dest128, Mask128, VSIB128);
    // The registers are currently split, need to merge them.
    Result = _VInsElement(OpSize::i256Bit, OpSize::i128Bit, 1, 0, Result128.Low, Result128.High);
  } else {
    ///< Calculate the full operation.
    ///< BaseAddr doesn't need to exist, calculate that here.
    Ref BaseAddr = VSIB.BaseAddr;
    if (BaseAddr && VSIB.Displacement) {
      BaseAddr = Add(OpSize::i64Bit, BaseAddr, VSIB.Displacement);
    } else if (VSIB.Displacement) {
      BaseAddr = Constant(VSIB.Displacement);
    } else if (!BaseAddr) {
      BaseAddr = Invalid();
    }

    Result = _VLoadVectorGatherMasked(Size, ElementLoadSize, Dest, Mask, BaseAddr, VSIB.Low, Invalid(), AddrElementSize, VSIB.Scale, 0, 0, AddrSize);
  }

  if (Is128Bit) {
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // Special case for the 128-bit gather load using 64-bit address indexes with 32-bit results.
      // Only loads two 32-bit elements in to the lower 64-bits of the first destination.
      // Bits [255:64] all become zero.
      Result = _VMov(OpSize::i64Bit, Result);
    } else {
      Result = _VMov(OpSize::i128Bit, Result);
    }
  } else {
    if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
      // If we only fetched 128-bits worth of data then the upper-result is all zero.
      Result = _VMov(OpSize::i128Bit, Result);
    }
  }

  StoreResultFPR(Op, Result);

  ///< Assume non-faulting behaviour and clear the mask register.
auto Zero = LoadZeroVector(Size); StoreResultFPR_WithOpSize(Op, Op->Src[1], Zero, Size); } template void OpDispatchBuilder::VPGATHER(OpcodeArgs); template void OpDispatchBuilder::VPGATHER(OpcodeArgs); void OpDispatchBuilder::Extrq_imm(OpcodeArgs) { const uint8_t MaskWidth = Op->Src[1].Literal() & 0x3F; const uint8_t Shift = (Op->Src[1].Literal() >> 8) & 0x3F; Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Result = Dest; if (Shift > 0) { Result = _VUShrI(OpSize::i64Bit, OpSize::i64Bit, Dest, Shift); } const uint64_t Mask = ~0ULL >> (MaskWidth == 0 ? 0 : (64 - MaskWidth)); const Ref MaskVector = _VCastFromGPR(OpSize::i128Bit, OpSize::i64Bit, _Constant(Mask)); Result = _VAnd(OpSize::i128Bit, OpSize::i64Bit, Result, MaskVector); StoreResultFPR(Op, Result); } void OpDispatchBuilder::Insertq_imm(OpcodeArgs) { const uint8_t MaskWidth = Op->Src[1].Literal() & 0x3F; const uint8_t Shift = (Op->Src[1].Literal() >> 8) & 0x3F; Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags); Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags); const uint64_t Mask = ~0ULL >> (MaskWidth == 0 ? 0 : (64 - MaskWidth)); Ref MaskVector = _VCastFromGPR(OpSize::i128Bit, OpSize::i64Bit, _Constant(Mask)); // Mask incoming source. Src = _VAnd(OpSize::i64Bit, OpSize::i64Bit, Src, MaskVector); // If shifting then shift source and mask in to the correct location. if (Shift) { Src = _VShlI(OpSize::i64Bit, OpSize::i64Bit, Src, Shift); MaskVector = _VShlI(OpSize::i128Bit, OpSize::i64Bit, MaskVector, Shift); } // Negate the mask. 
MaskVector = _VNot(OpSize::i64Bit, OpSize::i64Bit, MaskVector);
  // Clear the destination field, then merge the positioned source bits in.
  Dest = _VAnd(OpSize::i64Bit, OpSize::i64Bit, Dest, MaskVector);
  const Ref Result = _VOr(OpSize::i64Bit, OpSize::i64Bit, Dest, Src);

  StoreResultFPR(Op, Result);
}

// EXTRQ with a register selector: same field extraction as the immediate form, but the width
// (bits [5:0]) and right-shift amount (bits [13:8]) are read at runtime from the low 64 bits of Src.
void OpDispatchBuilder::Extrq(OpcodeArgs) {
  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  const Ref ElementMask = _VectorImm(OpSize::i64Bit, OpSize::i64Bit, 0x3F);

  // Moves the bit width from element 0 of a vector to a GPR, builds the mask there with
  // _MaskGenerateFromBitWidth, and moves that mask back in to a vector.
  auto GenerateMask = [this](Ref VectorWidthInBits) -> Ref {
    const Ref VectorWidth = _VExtractToGPR(OpSize::i64Bit, OpSize::i64Bit, VectorWidthInBits, 0);
    return _VCastFromGPR(OpSize::i128Bit, OpSize::i64Bit, _MaskGenerateFromBitWidth(VectorWidth));
  };

  // Bits[5:0] = Mask width in bits
  const Ref MaskWidthBits = _VAnd(OpSize::i64Bit, OpSize::i64Bit, Src, ElementMask);

  // Bits[13:8] = Shift right in bits
  const Ref ShiftBits = _VAnd(OpSize::i64Bit, OpSize::i64Bit, _VUShrI(OpSize::i64Bit, OpSize::i64Bit, Src, 8), ElementMask);

  // First shift in to the correct position.
  Ref Result = _VUShr(OpSize::i64Bit, OpSize::i64Bit, Dest, ShiftBits, false);
  // Then keep only the requested number of low bits.
  Result = _VAnd(OpSize::i128Bit, OpSize::i64Bit, Result, GenerateMask(MaskWidthBits));
  StoreResultFPR(Op, Result);
}

// INSERTQ with a register selector: the width (bits [5:0]) and left-shift amount (bits [13:8])
// are read at runtime from the upper 64-bit element of Src; the data to insert comes from Src itself.
void OpDispatchBuilder::Insertq(OpcodeArgs) {
  Ref Dest = LoadSourceFPR(Op, Op->Dest, Op->Flags);
  Ref Src = LoadSourceFPR(Op, Op->Src[0], Op->Flags);

  // Broadcast element 1 of Src so the selector is available in element 0.
  auto SelectorBits = _VDupElement(OpSize::i128Bit, OpSize::i64Bit, Src, 1);

  const Ref ElementMask = _VectorImm(OpSize::i64Bit, OpSize::i64Bit, 0x3F);

  // Moves the bit width from element 0 of a vector to a GPR, builds the mask there with
  // _MaskGenerateFromBitWidth, and moves that mask back in to a vector.
  auto GenerateMask = [this](Ref VectorWidthInBits) -> Ref {
    const Ref VectorWidth = _VExtractToGPR(OpSize::i64Bit, OpSize::i64Bit, VectorWidthInBits, 0);
    return _VCastFromGPR(OpSize::i128Bit, OpSize::i64Bit, _MaskGenerateFromBitWidth(VectorWidth));
  };

  // Bits[5:0] = Mask width in bits
  const Ref MaskWidthBits = _VAnd(OpSize::i64Bit, OpSize::i64Bit, SelectorBits, ElementMask);

  // Bits[13:8] = Shift left in bits
  const Ref ShiftBits = _VAnd(OpSize::i64Bit, OpSize::i64Bit, _VUShrI(OpSize::i64Bit, OpSize::i64Bit, SelectorBits, 8), ElementMask);

  // Extract the source data and put in to the correct location
  const Ref SrcMask = GenerateMask(MaskWidthBits);
  Ref SrcData = _VAnd(OpSize::i128Bit, OpSize::i64Bit, Src, SrcMask);
  SrcData = _VUShl(OpSize::i128Bit, OpSize::i64Bit, SrcData, ShiftBits, false);

  // Generate a destination mask
  const Ref DstMask = _VNot(OpSize::i64Bit, OpSize::i64Bit, _VUShl(OpSize::i128Bit, OpSize::i64Bit, SrcMask, ShiftBits, false));

  // Clear the destination field, then merge the positioned source bits in.
  Ref Result = _VAnd(OpSize::i64Bit, OpSize::i64Bit, Dest, DstMask);
  Result = _VOr(OpSize::i64Bit, OpSize::i64Bit, Result, SrcData);

  StoreResultFPR(Op, Result);
}

} // namespace FEXCore::IR
================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-to-ir, opcodes|dispatcher-implementations
desc: Handles x86/64 x87 to IR
$end_info$
*/

#include "Interface/Core/OpcodeDispatcher.h"
#include
"Interface/Core/X86Tables/X86Tables.h"
#include "Interface/IR/IR.h"
#include "Interface/Core/Addressing.h"

#include
#include
#include
#include
#include
#include
#include

namespace FEXCore::IR {
class OrderedNode;

#define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op

// Loads the x87 stack TOP value from its byte slot inside CPUState::flags.
Ref OpDispatchBuilder::GetX87Top() {
  // Yes, we are storing 3 bits in a single flag register.
  // Deal with it
  return _LoadContextGPR(OpSize::i8Bit, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC);
}

// Converts a full tag word (2 bits per register) in to the abridged form (1 bit per register,
// set when the pair is not Empty) and stores it to CPUState::AbridgedFTW.
void OpDispatchBuilder::SetX87FTW(Ref FTW) {
  _StackForceSlow(); // Invalidate x87 FTW register cache

  // For the output, we want a 1-bit for each pair not equal to 11 (Empty).
  static_assert(static_cast(FPState::X87Tag::Empty) == 0b11);

  // Make even bits 1 if the pair is equal to 11, and 0 otherwise.
  FTW = _AndShift(OpSize::i32Bit, FTW, FTW, ShiftType::LSR, 1);

  // Invert FTW and clear the odd bits. Even bits are 1 if the pair
  // is not equal to 11, and odd bits are 0.
  FTW = _Andn(OpSize::i32Bit, Constant(0x55555555), FTW);

  // All that's left is to compact away the odd bits. That is a Morton
  // deinterleave operation, which has a standard solution. See
  // https://stackoverflow.com/questions/3137266/how-to-de-interleave-bits-unmortonizing
  FTW = _And(OpSize::i32Bit, _Orlshr(OpSize::i32Bit, FTW, FTW, 1), Constant(0x33333333));
  FTW = _And(OpSize::i32Bit, _Orlshr(OpSize::i32Bit, FTW, FTW, 2), Constant(0x0f0f0f0f));
  FTW = _Orlshr(OpSize::i32Bit, FTW, FTW, 4);

  // ...and that's it. StoreContext implicitly does the final masking.
_StoreContextGPR(OpSize::i8Bit, FTW, offsetof(FEXCore::Core::CPUState, AbridgedFTW));
}

// Stores the x87 stack TOP value to its byte slot inside CPUState::flags.
void OpDispatchBuilder::SetX87Top(Ref Value) {
  _StoreContextGPR(OpSize::i8Bit, Value, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC);
}

// Float LoaD operation with memory operand
// 32-bit and 64-bit sources are converted to 80-bit; any other Width is pushed unconverted.
void OpDispatchBuilder::FLD(OpcodeArgs, IR::OpSize Width) {
  Ref Data = LoadSourceFPR_WithOpSize(Op, Op->Src[0], Width, Op->Flags);
  Ref ConvertedData = Data;
  // Convert to 80bit float
  if (Width == OpSize::i32Bit || Width == OpSize::i64Bit) {
    ConvertedData = _F80CVTTo(Data, Width);
  }
  // Both the converted and the original value are handed to the stack push.
  _PushStack(ConvertedData, Data, Width);
}

// Float LoaD operation with a stack register operand
// The stack index is taken from the low three bits of the opcode.
void OpDispatchBuilder::FLDFromStack(OpcodeArgs) {
  _CopyPushStack(Op->OP & 7);
}

// Loads an 80-bit packed BCD value from memory, converts it with _F80BCDLoad and pushes the result.
void OpDispatchBuilder::FBLD(OpcodeArgs) {
  // Read from memory
  Ref Data = LoadSourceFPR_WithOpSize(Op, Op->Src[0], OpSize::f80Bit, Op->Flags);
  Ref ConvertedData = _F80BCDLoad(Data);
  _PushStack(ConvertedData, Invalid(), OpSize::iInvalid);
}

// Converts the top of stack with _F80BCDStore, writes the 80-bit result to the destination and pops.
void OpDispatchBuilder::FBSTP(OpcodeArgs) {
  Ref converted = _F80BCDStore(_ReadStackValue(0));
  StoreResultFPR_WithOpSize(Op, Op->Dest, converted, OpSize::f80Bit, OpSize::i8Bit);
  _PopStackDestroy();
}

// Pushes the named vector constant K on to the x87 stack as an 80-bit value.
void OpDispatchBuilder::FLD_Const(OpcodeArgs, NamedVectorConstant K) {
  // Update TOP
  Ref Data = LoadAndCacheNamedVectorConstant(OpSize::i128Bit, K);
  _PushStack(Data, Data, OpSize::f80Bit);
}

// Integer load: reads a signed integer from memory, builds the equivalent 80-bit float with
// integer operations (sign, biased exponent, left-justified mantissa) and pushes it.
void OpDispatchBuilder::FILD(OpcodeArgs) {
  const auto ReadWidth = OpSizeFromSrc(Op);
  // Read from memory
  Ref Data = LoadSourceGPR_WithOpSize(Op, Op->Src[0], ReadWidth, Op->Flags);

  // Sign extend to 64bits
  if (ReadWidth != OpSize::i64Bit) {
    Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data);
  }

  // We're about to clobber flags to grab the sign, so save NZCV.
  SaveNZCV();

  // Extract sign and make integer absolute
  auto zero = Constant(0);
  _SubNZCV(OpSize::i64Bit, Data, zero);
  // 0x8000 is the sign bit position within the upper 16 bits of the 80-bit value.
  auto sign = _NZCVSelect(OpSize::i64Bit, CondClass::SLT, Constant(0x8000), zero);
  auto absolute = _Neg(OpSize::i64Bit, Data, CondClass::MI);

  // left justify the absolute integer
  auto shift = Sub(OpSize::i64Bit, Constant(63), _FindMSB(IR::OpSize::i64Bit, absolute));
  auto shifted = _Lshl(OpSize::i64Bit, absolute, shift);

  // 0x3fff is the exponent bias; the exponent is reduced by however far the value was shifted.
  auto adjusted_exponent = Sub(OpSize::i64Bit, Constant(0x3fff + 63), shift);
  // A zero input gets a zero exponent instead of the computed one.
  auto zeroed_exponent = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, absolute, zero, zero, adjusted_exponent);
  auto upper = _Or(OpSize::i64Bit, sign, zeroed_exponent);

  // Low 64 bits hold the mantissa, the upper element holds sign and exponent.
  Ref ConvertedData = _VLoadTwoGPRs(shifted, upper);
  _PushStack(ConvertedData, Invalid(), OpSize::iInvalid);
}

// Float store to memory. Width is the size written; the stack value is read as 64-bit in
// ReducedPrecisionMode and 80-bit otherwise. Pops when the instruction has FLAGS_POP set.
void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
  LOGMAN_THROW_A_FMT(Width == OpSize::i32Bit || Width == OpSize::i64Bit || Width == OpSize::f80Bit, "Invalid store width for FST");

  const auto SourceSize = ReducedPrecisionMode ?
OpSize::i64Bit : OpSize::f80Bit;
  AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

  A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Width);
  _StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale);

  if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
    _PopStackDestroy();
  }
}

// Float store to another stack register, selected by the low three bits of the opcode.
// Pops when the instruction has FLAGS_POP set.
void OpDispatchBuilder::FSTToStack(OpcodeArgs) {
  const uint8_t Offset = Op->OP & 7;

  // Storing st0 to st0 needs no copy.
  if (Offset != 0) {
    _StoreStackToStack(Offset);
  }

  if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
    _PopStackDestroy();
  }
}

// Store integer to memory (possibly with truncation)
void OpDispatchBuilder::FIST(OpcodeArgs, bool Truncate) {
  const auto Size = OpSizeFromSrc(Op);
  Ref Data = _ReadStackValue(0);

  // For 16-bit integers, we need to manually check for overflow
  // since _F80CVTInt doesn't handle 16-bit overflow detection properly
  if (Size == OpSize::i16Bit) {
    // Extract the 80-bit float value to check for special cases
    // Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
Ref Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Data, 1);
    Ref Exponent = _And(OpSize::i64Bit, Upper, Constant(0x7fff));

    // Check for NaN/Infinity: exponent = 0x7fff
    // NOTE(review): if _TestNZ sets flags from (Exponent & 0x7fff), EQ is taken when the exponent is
    // zero rather than when it equals 0x7fff. An exponent of 0x7fff is already covered by the
    // UGE 0x400e check below. Confirm the intended condition.
    SaveNZCV();
    _TestNZ(OpSize::i64Bit, Exponent, Constant(0x7fff));
    Ref IsSpecial = _NZCVSelect01(CondClass::EQ);

    // For overflow detection, check if exponent indicates a value >= 2^15
    // Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
    SubWithFlags(OpSize::i64Bit, Exponent, 0x400e);
    Ref IsOverflow = _NZCVSelect01(CondClass::UGE);

    // Set Invalid Operation flag if overflow or special value
    Ref InvalidFlag = _Or(OpSize::i64Bit, IsSpecial, IsOverflow);
    SetRFLAG(InvalidFlag);
  }

  Data = _F80CVTInt(Size, Data, Truncate);

  StoreResultGPR_WithOpSize(Op, Op->Dest, Data, Size, OpSize::i8Bit);

  if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
    _PopStackDestroy();
  }
}

// x87 add. With no explicit source the operands are st0 and st(i), with i taken from the low
// three bits of the opcode and ResInST0 selecting the argument order given to _F80AddStack.
// Otherwise a memory operand (integer or float, of size Width) is converted to 80-bit and added to st0.
void OpDispatchBuilder::FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
  if (Op->Src[0].IsNone()) { // Implicit argument case
    auto Offset = Op->OP & 7;
    auto St0 = 0;
    if (ResInST0 == OpResult::RES_STI) {
      _F80AddStack(Offset, St0);
    } else {
      _F80AddStack(St0, Offset);
    }
    if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
      _PopStackDestroy();
    }
    return;
  }

  LOGMAN_THROW_A_FMT(Width != OpSize::f80Bit, "No 80-bit floats from memory");

  // We have one memory argument
  Ref Arg {};
  if (Integer) {
    Arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
    Arg = _F80CVTToInt(Arg, Width);
  } else {
    Arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
    Arg = _F80CVTTo(Arg, Width);
  }

  // top of stack is at offset zero
  _F80AddValue(0, Arg);
}

// x87 multiply. Same operand selection as FADD: stack/stack when there is no explicit source,
// otherwise st0 is multiplied by a converted memory operand.
void OpDispatchBuilder::FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
  if (Op->Src[0].IsNone()) { // Implicit argument case
    auto offset = Op->OP & 7;
    auto st0 = 0;
    if (ResInST0 == OpResult::RES_STI) {
      _F80MulStack(offset, st0);
    } else {
      _F80MulStack(st0, offset);
    }
    if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
    }
    return;
  }

  LOGMAN_THROW_A_FMT(Width != OpSize::f80Bit, "No 80-bit floats from memory");

  // We have one memory argument
  Ref arg {};
  if (Integer) {
    arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
    arg = _F80CVTToInt(arg, Width);
  } else {
    arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
    arg = _F80CVTTo(arg, Width);
  }

  // top of stack is at offset zero
  _F80MulValue(0, arg);

  if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
    _PopStackDestroy();
  }
}

// x87 divide. Reverse swaps the operand roles. In the stack/stack form ResInST0 selects which
// stack slot receives the result; in the memory form the result goes to st0.
void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) {
  if (Op->Src[0].IsNone()) {
    const uint8_t Offset = Op->OP & 7;
    const uint8_t St0 = 0;
    const uint8_t Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0;

    // The operand order flips when exactly one of "reverse" and "result in st(i)" holds.
    if (Reverse ^ (ResInST0 == OpResult::RES_STI)) {
      _F80DivStack(Result, Offset, St0);
    } else {
      _F80DivStack(Result, St0, Offset);
    }

    if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
      _PopStackDestroy();
    }
    return;
  }

  LOGMAN_THROW_A_FMT(Width != OpSize::f80Bit, "No 80-bit floats from memory");

  // We have one memory argument
  Ref arg {};
  if (Integer) {
    arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
    arg = _F80CVTToInt(arg, Width);
  } else {
    arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
    arg = _F80CVTTo(arg, Width);
  }

  // top of stack is at offset zero
  if (Reverse) {
    _F80DivRValue(arg, 0);
  } else {
    _F80DivValue(0, arg);
  }

  if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
    _PopStackDestroy();
  }
}

// x87 subtract. Mirrors FDIV: Reverse swaps the operand roles, and in the stack/stack form
// ResInST0 selects which stack slot receives the result.
void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) {
  if (Op->Src[0].IsNone()) {
    const auto Offset = Op->OP & 7;
    const auto St0 = 0;
    const auto Result = (ResInST0 == OpResult::RES_STI) ?
Offset : St0;

    // The operand order flips when exactly one of "reverse" and "result in st(i)" holds.
    if (Reverse ^ (ResInST0 == OpResult::RES_STI)) {
      _F80SubStack(Result, Offset, St0);
    } else {
      _F80SubStack(Result, St0, Offset);
    }

    if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
      _PopStackDestroy();
    }
    return;
  }

  LOGMAN_THROW_A_FMT(Width != OpSize::f80Bit, "No 80-bit floats from memory");

  // We have one memory argument
  Ref Arg {};
  if (Integer) {
    Arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
    Arg = _F80CVTToInt(Arg, Width);
  } else {
    Arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
    Arg = _F80CVTTo(Arg, Width);
  }

  // top of stack is at offset zero
  if (Reverse) {
    _F80SubRValue(Arg, 0);
  } else {
    _F80SubValue(0, Arg);
  }

  if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
    _PopStackDestroy();
  }
}

// Expands CPUState::AbridgedFTW (1 bit per register) in to a 16-bit tag word (2 bits per register).
Ref OpDispatchBuilder::GetX87FTW_Helper() {
  // AbridgedFTWIndex has 1-bit per slot (8 slots). Duplicate each bit to get
  // 2-bits per slot (16-bit result). Duplicating bits is equivalent to
  // Morton interleaving a number with itself. To interleave efficiently two
  // bytes, we use the well-known bit twiddling algorithm:
  //
  // https://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
  Ref X = _LoadContextGPR(OpSize::i8Bit, offsetof(FEXCore::Core::CPUState, AbridgedFTW));
  X = _Orlshl(OpSize::i32Bit, X, X, 4);
  X = _And(OpSize::i32Bit, X, Constant(0x0f0f0f0f));
  X = _Orlshl(OpSize::i32Bit, X, X, 2);
  X = _And(OpSize::i32Bit, X, Constant(0x33333333));
  X = _Orlshl(OpSize::i32Bit, X, X, 1);
  X = _And(OpSize::i32Bit, X, Constant(0x55555555));
  X = _Orlshl(OpSize::i32Bit, X, X, 1);

  // The above sequence sets valid to 11 and empty to 00, so invert to finalize.
static_assert(static_cast(FPState::X87Tag::Valid) == 0b00);
  static_assert(static_cast(FPState::X87Tag::Empty) == 0b11);
  return _Xor(OpSize::i32Bit, X, Constant(0xffff));
}

// Stores the x87 environment to memory as seven fields of the operand size.
// FCW, FSW and FTW are written from the tracked state; the instruction and data pointer fields
// are written as zero.
void OpDispatchBuilder::X87FNSTENV(OpcodeArgs) {
  // 14 bytes for 16bit
  // 2 Bytes : FCW
  // 2 Bytes : FSW
  // 2 bytes : FTW
  // 2 bytes : Instruction offset
  // 2 bytes : Instruction CS selector
  // 2 bytes : Data offset
  // 2 bytes : Data selector

  // 28 bytes for 32bit
  // 4 bytes : FCW
  // 4 bytes : FSW
  // 4 bytes : FTW
  // 4 bytes : Instruction pointer
  // 2 bytes : Instruction pointer selector
  // 2 bytes : Opcode
  // 4 bytes : data pointer offset
  // 4 bytes : data pointer selector

  // Before we store anything we need to sync our stack to the registers.
  _SyncStackToSlow();

  const auto Size = OpSizeFromSrc(Op);
  // Only the address is wanted here, not the data behind it.
  Ref Mem = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false});
  Mem = AppendSegmentOffset(Mem, Op->Flags);

  {
    // FCW
    auto FCW = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, FCW));
    _StoreMemGPR(Size, Mem, FCW, Size);
  }

  {
    // FSW
    _StoreMemGPR(Size, ReconstructFSW_Helper(), Mem, Constant(IR::OpSizeToSize(Size) * 1), Size, MemOffsetType::SXTX, 1);
  }

  auto ZeroConst = Constant(0);

  {
    // FTW
    _StoreMemGPR(Size, GetX87FTW_Helper(), Mem, Constant(IR::OpSizeToSize(Size) * 2), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Instruction Offset
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 3), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Instruction CS selector (+ Opcode)
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 4), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Data pointer offset
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 5), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Data pointer selector
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 6), Size, MemOffsetType::SXTX, 1);
  }
}

// Unpacks a status word in to the tracked state: TOP (3 bits at bit 11), C0 (bit 8), C1 (bit 9),
// C2 (bit 10), C3 (bit 14) and IE (bit 0). Returns the extracted TOP value.
Ref OpDispatchBuilder::ReconstructX87StateFromFSW_Helper(Ref FSW) {
  auto Top = _Bfe(OpSize::i32Bit, 3, 11, FSW);
  SetX87Top(Top);

  auto C0 = _Bfe(OpSize::i32Bit, 1,
8, FSW);
  auto C1 = _Bfe(OpSize::i32Bit, 1, 9, FSW);
  auto C2 = _Bfe(OpSize::i32Bit, 1, 10, FSW);
  auto C3 = _Bfe(OpSize::i32Bit, 1, 14, FSW);
  auto IE = _Bfe(OpSize::i32Bit, 1, 0, FSW);

  SetRFLAG(C0);
  SetRFLAG(C1);
  SetRFLAG(C2);
  SetRFLAG(C3);
  SetRFLAG(IE);
  return Top;
}

// Loads the x87 environment from memory: FCW from the first field (always read as 16 bits),
// then FSW and FTW from the second and third fields of the operand size.
void OpDispatchBuilder::X87LDENV(OpcodeArgs) {
  _StackForceSlow();

  const auto Size = OpSizeFromSrc(Op);
  // Only the address is wanted here, not the data behind it.
  Ref Mem = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.LoadData = false});
  Mem = AppendSegmentOffset(Mem, Op->Flags);

  auto NewFCW = _LoadMemGPR(OpSize::i16Bit, Mem, OpSize::i16Bit);
  _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

  // FSW
  Ref MemLocation = Add(OpSize::i64Bit, Mem, IR::OpSizeToSize(Size) * 1);
  auto NewFSW = _LoadMemGPR(Size, MemLocation, Size);
  ReconstructX87StateFromFSW_Helper(NewFSW);

  {
    // FTW
    Ref MemLocation = Add(OpSize::i64Bit, Mem, IR::OpSizeToSize(Size) * 2);
    SetX87FTW(_LoadMemGPR(Size, MemLocation, Size));
  }
}

// Saves the full x87 state to memory: the seven environment fields followed by the eight stack
// registers at a 10-byte stride starting from TOP, and then reinitializes the FPU via FNINIT.
void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {
  _SyncStackToSlow();

  // 14 bytes for 16bit
  // 2 Bytes : FCW
  // 2 Bytes : FSW
  // 2 bytes : FTW
  // 2 bytes : Instruction offset
  // 2 bytes : Instruction CS selector
  // 2 bytes : Data offset
  // 2 bytes : Data selector

  // 28 bytes for 32bit
  // 4 bytes : FCW
  // 4 bytes : FSW
  // 4 bytes : FTW
  // 4 bytes : Instruction pointer
  // 2 bytes : instruction pointer selector
  // 2 bytes : Opcode
  // 4 bytes : data pointer offset
  // 4 bytes : data pointer selector

  const auto Size = OpSizeFromDst(Op);
  Ref Mem = MakeSegmentAddress(Op, Op->Dest);
  Ref Top = GetX87Top();

  {
    // FCW
    auto FCW = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, FCW));
    _StoreMemGPR(Size, Mem, FCW, Size);
  }

  {
    // FSW
    _StoreMemGPR(Size, ReconstructFSW_Helper(), Mem, Constant(IR::OpSizeToSize(Size) * 1), Size, MemOffsetType::SXTX, 1);
  }

  auto ZeroConst = Constant(0);

  {
    // FTW
    _StoreMemGPR(Size, GetX87FTW_Helper(), Mem, Constant(IR::OpSizeToSize(Size) * 2), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Instruction Offset
    _StoreMemGPR(Size, ZeroConst, Mem,
Constant(IR::OpSizeToSize(Size) * 3), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Instruction CS selector (+ Opcode)
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 4), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Data pointer offset
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 5), Size, MemOffsetType::SXTX, 1);
  }

  {
    // Data pointer selector
    _StoreMemGPR(Size, ZeroConst, Mem, Constant(IR::OpSizeToSize(Size) * 6), Size, MemOffsetType::SXTX, 1);
  }

  auto SevenConst = Constant(7);
  const auto LoadSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
  // The first seven registers are written with 16-byte stores at a 10-byte stride, so each store
  // overlaps the start of the next slot, which is then overwritten by the following store.
  for (int i = 0; i < 7; ++i) {
    Ref data = _LoadContextFPRIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit));
    if (ReducedPrecisionMode) {
      // Registers are held as 64-bit floats in this mode; convert to 80-bit for the memory image.
      data = _F80CVTTo(data, OpSize::i64Bit);
    }
    _StoreMemFPR(OpSize::i128Bit, data, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MemOffsetType::SXTX, 1);
    // Advance to the next physical register, wrapping at eight.
    Top = _And(OpSize::i32Bit, Add(OpSize::i32Bit, Top, 1), SevenConst);
  }

  // The final st(7) needs a bit of special handling here
  // (presumably so the store does not write past the end of the save area; it writes exactly 10 bytes)
  Ref data = _LoadContextFPRIndexed(Top, LoadSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit));
  if (ReducedPrecisionMode) {
    data = _F80CVTTo(data, OpSize::i64Bit);
  }
  // ST7 broken in to two parts
  // Lower 64bits [63:0]
  // upper 16 bits [79:64]
  _StoreMemFPR(OpSize::i64Bit, data, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10)), OpSize::i8Bit, MemOffsetType::SXTX, 1);
  // Bring 16-bit element 4 (bits [79:64]) down so it can be stored as a 16-bit value.
  auto topBytes = _VDupElement(OpSize::i128Bit, OpSize::i16Bit, data, 4);
  _StoreMemFPR(OpSize::i16Bit, topBytes, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (7 * 10) + 8), OpSize::i8Bit, MemOffsetType::SXTX, 1);

  // reset to default
  FNINIT(Op);
}

// Restores the full x87 state from memory: FCW, FSW and FTW from the environment fields, then the
// eight stack registers at a 10-byte stride, written to the register file starting from the
// restored TOP.
void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
  _StackForceSlow();
  const auto Size = OpSizeFromSrc(Op);
  Ref Mem = MakeSegmentAddress(Op, Op->Src[0]);

  auto NewFCW = _LoadMemGPR(OpSize::i16Bit, Mem, OpSize::i16Bit);
  _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
  if (ReducedPrecisionMode) {
    // ignore the rounding precision, we're always 64-bit in F64.
    // extract rounding mode
    // (two bits of FCW starting at bit 10)
    Ref roundingMode = NewFCW;
    auto roundShift = Constant(10);
    auto roundMask = Constant(3);
    roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
    roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
    _SetRoundingMode(roundingMode, false, roundingMode);
  }

  auto NewFSW = _LoadMemGPR(Size, Mem, Constant(IR::OpSizeToSize(Size) * 1), Size, MemOffsetType::SXTX, 1);
  Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW);
  {
    // FTW
    SetX87FTW(_LoadMemGPR(Size, Mem, Constant(IR::OpSizeToSize(Size) * 2), Size, MemOffsetType::SXTX, 1));
  }

  auto SevenConst = Constant(7);
  // Mask that keeps the low 80 bits of a 128-bit value.
  auto low = Constant(~0ULL);
  auto high = Constant(0xFFFF);
  Ref Mask = _VLoadTwoGPRs(low, high);
  const auto StoreSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
  // The first seven registers are read with 16-byte loads at a 10-byte stride; the extra bytes
  // belong to the next slot and are masked away.
  for (int i = 0; i < 7; ++i) {
    Ref Reg = _LoadMemFPR(OpSize::i128Bit, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (10 * i)), OpSize::i8Bit, MemOffsetType::SXTX, 1);
    // Mask off the top bits
    Reg = _VAnd(OpSize::i128Bit, OpSize::i128Bit, Reg, Mask);
    if (ReducedPrecisionMode) {
      // Convert to double precision
      Reg = _F80CVT(OpSize::i64Bit, Reg);
    }
    _StoreContextFPRIndexed(Reg, Top, StoreSize, MMBaseOffset(), IR::OpSizeToSize(OpSize::i128Bit));

    // Advance to the next physical register, wrapping at eight.
    Top = _And(OpSize::i32Bit, Add(OpSize::i32Bit, Top, 1), SevenConst);
  }

  // The final st(7) needs a bit of special handling here
  // (presumably so the load does not read past the end of the save area; it reads exactly 10 bytes)
  // ST7 broken in to two parts
  // Lower 64bits [63:0]
  // upper 16 bits [79:64]
  Ref Reg = _LoadMemFPR(OpSize::i64Bit, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7)), OpSize::i8Bit, MemOffsetType::SXTX, 1);
  Ref RegHigh = _LoadMemFPR(OpSize::i16Bit, Mem, Constant((IR::OpSizeToSize(Size) * 7) + (10 * 7) + 8), OpSize::i8Bit, MemOffsetType::SXTX, 1);
  // Place the 16-bit upper part in to 16-bit element 4 of the register value.
  Reg = _VInsElement(OpSize::i128Bit, OpSize::i16Bit, 4, 0, Reg, RegHigh);
  if (ReducedPrecisionMode) {
    Reg = _F80CVT(OpSize::i64Bit, Reg); // Convert to double precision
  }
  _StoreContextFPRIndexed(Reg, Top, StoreSize, MMBaseOffset(),
IR::OpSizeToSize(OpSize::i128Bit));
}

// Load / Store Control Word
// Stores the 16-bit FCW held in CPUState to the destination operand.
void OpDispatchBuilder::X87FSTCW(OpcodeArgs) {
  auto FCW = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, FCW));
  StoreResultGPR(Op, FCW);
}

// Loads a new FCW from the source operand in to CPUState.
void OpDispatchBuilder::X87FLDCW(OpcodeArgs) {
  // FIXME: Because loading control flags will affect several instructions in fast path, we might have
  // to switch for now to slow mode whenever these are manually changed.
  // Remove the next line and try DF_04.asm in fast path.
  _StackForceSlow();
  Ref NewFCW = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
  _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
}

// Exchanges st0 with st(i), where i is the low three bits of the opcode, and writes a zero flag value.
void OpDispatchBuilder::FXCH(OpcodeArgs) {
  uint8_t Offset = Op->OP & 7;
  // fxch st0, st0 is for us essentially a nop
  if (Offset != 0) {
    _F80StackXchange(Offset);
  }
  SetRFLAG(Constant(0));
}

// FYL2X / FYL2XP1. For the P1 variant, one is added to the top of stack before the shared
// _F80FYL2XStack operation is emitted.
void OpDispatchBuilder::X87FYL2X(OpcodeArgs, bool IsFYL2XP1) {
  if (IsFYL2XP1) {
    // create an add between top of stack and 1.
    Ref One = ReducedPrecisionMode ?
_VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, Constant(0x3FF0000000000000)) :
                LoadAndCacheNamedVectorConstant(OpSize::i128Bit, NamedVectorConstant::NAMED_VECTOR_X87_ONE);
    _F80AddValue(0, One);
  }

  _F80FYL2XStack();
}

// x87 compare. Compares st0 against st(i) (no explicit source) or against a converted memory
// operand, then publishes the result either to the x87 condition codes (FLAGS_X87) or to the
// CPU flags. PopTwice pops two stack entries; otherwise one entry is popped when the instruction
// has FLAGS_POP set.
void OpDispatchBuilder::FCOMI(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::FCOMIFlags WhichFlags, bool PopTwice) {
  Ref arg {};
  Ref b {};

  Ref Res {};
  if (Op->Src[0].IsNone()) {
    // Implicit arg
    uint8_t Offset = Op->OP & 7;
    Res = _F80CmpStack(Offset);
  } else {
    if (Width == OpSize::i16Bit || Width == OpSize::i32Bit || Width == OpSize::i64Bit) {
      // Memory arg
      if (Integer) {
        arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
        b = _F80CVTToInt(arg, Width);
      } else {
        arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags);
        b = _F80CVTTo(arg, Width);
      }
    } else {
      FEX_UNREACHABLE;
    }
    Res = _F80CmpValue(b);
  }

  // Pull the individual comparison bits out of the packed result.
  Ref HostFlag_CF = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_LT, Res);
  Ref HostFlag_ZF = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_EQ, Res);
  Ref HostFlag_Unordered = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_UNORDERED, Res);
  // An unordered result also sets both the "less than" and "equal" outputs.
  HostFlag_CF = _Or(OpSize::i32Bit, HostFlag_CF, HostFlag_Unordered);
  HostFlag_ZF = _Or(OpSize::i32Bit, HostFlag_ZF, HostFlag_Unordered);

  if (WhichFlags == FCOMIFlags::FLAGS_X87) {
    SetRFLAG(HostFlag_CF);
    SetRFLAG(Constant(0));
    SetRFLAG(HostFlag_Unordered);
    SetRFLAG(HostFlag_ZF);
  } else {
    // OF, SF, AF, PF all undefined
    SetCFDirect(HostFlag_CF);
    SetRFLAG(HostFlag_ZF);

    // PF is stored inverted, so invert from the host flag.
    // TODO: This could perhaps be optimized?
    auto PF = _Xor(OpSize::i32Bit, HostFlag_Unordered, Constant(1));
    SetRFLAG(PF);
  }

  // Set Invalid Operation flag when unordered (NaN comparison)
  SetRFLAG(HostFlag_Unordered);

  if (PopTwice) {
    _PopStackDestroy();
    _PopStackDestroy();
  } else if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
    _PopStackDestroy();
  }
}

// Tests st0 with _F80StackTest and publishes the comparison result through the same flag
// sequence FCOMI uses for FLAGS_X87.
void OpDispatchBuilder::FTST(OpcodeArgs) {
  Ref Res = _F80StackTest(0);

  // Pull the individual comparison bits out of the packed result.
  Ref HostFlag_CF = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_LT, Res);
  Ref HostFlag_ZF = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_EQ, Res);
  Ref HostFlag_Unordered = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_UNORDERED, Res);
  // An unordered result also sets both the "less than" and "equal" outputs.
  HostFlag_CF = _Or(OpSize::i32Bit, HostFlag_CF, HostFlag_Unordered);
  HostFlag_ZF = _Or(OpSize::i32Bit, HostFlag_ZF, HostFlag_Unordered);

  SetRFLAG(HostFlag_CF);
  SetRFLAG(Constant(0));
  SetRFLAG(HostFlag_Unordered);
  SetRFLAG(HostFlag_ZF);

  // Set Invalid Operation flag when unordered (NaN comparison)
  SetRFLAG(HostFlag_Unordered);
}

// Emits the stack-based x87 IR operation selected by IROp. When ZeroC2 is set, a zero flag value
// is written afterwards.
void OpDispatchBuilder::X87OpHelper(OpcodeArgs, FEXCore::IR::IROps IROp, bool ZeroC2) {
  DeriveOp(Result, IROp, _F80SCALEStack());

  if (ZeroC2) {
    SetRFLAG(Constant(0));
  }
}

// Increments (Inc == true) or decrements the x87 stack top.
void OpDispatchBuilder::X87ModifySTP(OpcodeArgs, bool Inc) {
  if (Inc) {
    _IncStackTop();
  } else {
    _DecStackTop();
  }
}

// Operations dealing with loading and storing environment pieces

// Reconstruct as a constant the Status Word of the FPU.
// We only track stack top, each of the code conditions (C flags) and the IE flag.
// Top is 3 bits at bit 11.
// C0 is 1 bit at bit 8.
// C1 is 1 bit at bit 9.
// C2 is 1 bit at bit 10.
// C3 is 1 bit at bit 14.
// IE is 1 bit at bit 0.
// Optionally we can pass a pre calculated value for Top, otherwise we calculate it
// during the function runtime.
Ref OpDispatchBuilder::ReconstructFSW_Helper(Ref T) {
  // Start with the top value
  auto Top = T ?
T : GetX87Top();
  Ref FSW = _Lshl(OpSize::i64Bit, Top, Constant(11));

  // We must construct the FSW from our various bits
  auto C0 = GetRFLAG(FEXCore::X86State::X87FLAG_C0_LOC);
  FSW = _Orlshl(OpSize::i64Bit, FSW, C0, 8);

  auto C1 = GetRFLAG(FEXCore::X86State::X87FLAG_C1_LOC);
  FSW = _Orlshl(OpSize::i64Bit, FSW, C1, 9);

  auto C2 = GetRFLAG(FEXCore::X86State::X87FLAG_C2_LOC);
  FSW = _Orlshl(OpSize::i64Bit, FSW, C2, 10);

  auto C3 = GetRFLAG(FEXCore::X86State::X87FLAG_C3_LOC);
  FSW = _Orlshl(OpSize::i64Bit, FSW, C3, 14);

  // IE sits at bit 0, so no shift is needed.
  auto IE = GetRFLAG(FEXCore::X86State::X87FLAG_IE_LOC);
  FSW = _Or(OpSize::i64Bit, FSW, IE);

  return FSW;
}

// Store Status Word
// There's no load Status Word instruction but you can load it through frstor
// or fldenv.
void OpDispatchBuilder::X87FNSTSW(OpcodeArgs) {
  // The value returned by the stack sync is passed on as the pre-calculated TOP.
  Ref TopValue = _SyncStackToSlow();
  Ref StatusWord = ReconstructFSW_Helper(TopValue);
  StoreResultGPR(Op, StatusWord);
}

void OpDispatchBuilder::FNCLEX(OpcodeArgs) {
  // Clear the exception flag bit
  SetRFLAG(_Constant(0));
}

// Resets the x87 state: FCW to 0x037F, TOP and the abridged tag word to zero, the simulated
// stack reinitialized, and five flag values written as zero. In ReducedPrecisionMode the
// rounding mode is also reset to zero.
void OpDispatchBuilder::FNINIT(OpcodeArgs) {
  _SyncStackToSlow(); // Invalidate x87 register caches

  auto Zero = Constant(0);

  if (ReducedPrecisionMode) {
    _SetRoundingMode(Zero, false, Zero);
  }

  // Init FCW to 0x037F
  auto NewFCW = Constant(0x037F);
  _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

  // Set top to zero
  SetX87Top(Zero);
  // Tags all get marked as invalid
  _StoreContextGPR(OpSize::i8Bit, Zero, offsetof(FEXCore::Core::CPUState, AbridgedFTW));

  // Reinits the simulated stack
  _InitStack();

  SetRFLAG(Zero);
  SetRFLAG(Zero);
  SetRFLAG(Zero);
  SetRFLAG(Zero);
  SetRFLAG(Zero);
}

// Invalidates the stack register selected by the low three bits of the opcode.
void OpDispatchBuilder::X87FFREE(OpcodeArgs) {
  _InvalidateStack(Op->OP & 7);
}

void OpDispatchBuilder::X87EMMS(OpcodeArgs) {
  // Tags all get set to 0b11
  _InvalidateStack(0xff);
}

// Conditional move of st(i) in to st0. The opcode (with the register bits masked off) selects
// the condition code; the condition is expanded to an all-ones/all-zeros vector that drives a
// bitwise select on the stack.
void OpDispatchBuilder::X87FCMOV(OpcodeArgs) {
  // The condition below reads CPU flags, so any deferred flag calculation must be resolved first.
  CalculateDeferredFlags();

  uint16_t Opcode = Op->OP & 0b1111'1111'1000;
  uint8_t CC = 0;

  switch (Opcode) {
  case 0x3'C0:
    CC = 0x3; // JNC
    break;
  case 0x2'C0:
    CC =
0x2; // JC
    break;
  case 0x2'C8:
    CC = 0x4; // JE
    break;
  case 0x3'C8:
    CC = 0x5; // JNE
    break;
  case 0x2'D0:
    CC = 0x6; // JNA
    break;
  case 0x3'D0:
    CC = 0x7; // JA
    break;
  case 0x2'D8:
    CC = 0xA; // JP
    break;
  case 0x3'D8:
    CC = 0xB; // JNP
    break;
  default: LOGMAN_MSG_A_FMT("Unhandled FCMOV op: 0x{:x}", Opcode); break;
  }

  Ref VecCond = _VDupFromGPR(OpSize::i128Bit, OpSize::i64Bit, SelectCC0All1(CC));
  _F80VBSLStack(OpSize::i128Bit, VecCond, Op->OP & 7, 0);
}

// Examines st0: publishes its sign bit and a classification. Only two classes are reported here,
// driven by whether the top of stack has a valid tag.
void OpDispatchBuilder::X87FXAM(OpcodeArgs) {
  auto a = _ReadStackValue(0);
  // Fetch the 64-bit element holding the sign: element 0 for a 64-bit float, element 1 for 80-bit.
  Ref Result = ReducedPrecisionMode ? _VExtractToGPR(OpSize::i64Bit, OpSize::i64Bit, a, 0) :
                                      _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, a, 1);

  // Extract the sign bit
  Result = ReducedPrecisionMode ? _Bfe(OpSize::i64Bit, 1, 63, Result) : _Bfe(OpSize::i64Bit, 1, 15, Result);
  SetRFLAG(Result);

  // Claim this is a normal number
  // We don't support anything else
  auto TopValid = _StackValidTag(0);

  // In the case of top being invalid then C3:C2:C0 is 0b101
  auto C3 = Select01(OpSize::i32Bit, CondClass::NEQ, TopValid, Constant(1));

  auto C2 = TopValid;
  auto C0 = C3; // Mirror C3 until something other than zero is supported
  SetRFLAG(C0);
  SetRFLAG(C2);
  SetRFLAG(C3);
}

// Replaces st0 with its exponent and then pushes its significand, so the significand ends up on
// top of the stack with the exponent below it.
void OpDispatchBuilder::X87FXTRACT(OpcodeArgs) {
  auto Top = _ReadStackValue(0);

  _PopStackDestroy();
  auto Exp = _F80XTRACT_EXP(Top);
  auto Sig = _F80XTRACT_SIG(Top);
  _PushStack(Exp, Invalid(), OpSize::iInvalid);
  _PushStack(Sig, Invalid(), OpSize::iInvalid);
}

} // namespace FEXCore::IR
================================================
FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-to-ir, opcodes|dispatcher-implementations
desc: Handles x86/64 x87 to IR
$end_info$
*/

#include "Interface/Core/OpcodeDispatcher.h"
#include "Interface/Core/X86Tables/X86Tables.h"
#include "Interface/IR/IR.h"

#include
#include
#include
#include
#include
#include

namespace FEXCore::IR {
class
OrderedNode; #define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) { _StackForceSlow(); const auto Size = OpSizeFromSrc(Op); Ref Mem = MakeSegmentAddress(Op, Op->Src[0]); auto NewFCW = _LoadMemGPR(OpSize::i16Bit, Mem, OpSize::i16Bit); // ignore the rounding precision, we're always 64-bit in F64. // extract rounding mode Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW); _SetRoundingMode(roundingMode, false, roundingMode); _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW)); auto NewFSW = _LoadMemGPR(Size, Mem, Constant(IR::OpSizeToSize(Size)), Size, MemOffsetType::SXTX, 1); ReconstructX87StateFromFSW_Helper(NewFSW); { // FTW SetX87FTW(_LoadMemGPR(Size, Mem, Constant(IR::OpSizeToSize(Size) * 2), Size, MemOffsetType::SXTX, 1)); } } void OpDispatchBuilder::X87FLDCWF64(OpcodeArgs) { _StackForceSlow(); Ref NewFCW = LoadSourceGPR(Op, Op->Src[0], Op->Flags); // ignore the rounding precision, we're always 64-bit in F64. 
// extract rounding mode Ref roundingMode = _Bfe(OpSize::i32Bit, 3, 10, NewFCW); _SetRoundingMode(roundingMode, false, roundingMode); _StoreContextGPR(OpSize::i16Bit, NewFCW, offsetof(FEXCore::Core::CPUState, FCW)); } // F64 ops // Float load op with memory operand void OpDispatchBuilder::FLDF64(OpcodeArgs, IR::OpSize Width) { Ref Data = LoadSourceFPR_WithOpSize(Op, Op->Src[0], Width, Op->Flags); // Convert to 64bit float Ref ConvertedData = Data; if (Width == OpSize::i32Bit) { ConvertedData = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, Data); } else if (Width == OpSize::f80Bit) { ConvertedData = _F80CVT(OpSize::i64Bit, Data); } _PushStack(ConvertedData, Data, Width); } void OpDispatchBuilder::FBLDF64(OpcodeArgs) { // Read from memory Ref Data = LoadSourceFPR_WithOpSize(Op, Op->Src[0], OpSize::f80Bit, Op->Flags); Ref ConvertedData = _F80BCDLoad(Data); ConvertedData = _F80CVT(OpSize::i64Bit, ConvertedData); _PushStack(ConvertedData, Invalid(), OpSize::iInvalid); } void OpDispatchBuilder::FBSTPF64(OpcodeArgs) { Ref converted = _F80CVTTo(_ReadStackValue(0), OpSize::i64Bit); converted = _F80BCDStore(converted); StoreResultFPR_WithOpSize(Op, Op->Dest, converted, OpSize::f80Bit, OpSize::i8Bit); _PopStackDestroy(); } void OpDispatchBuilder::FLDF64_Const(OpcodeArgs, uint64_t Num) { auto Data = _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, Constant(Num)); _PushStack(Data, Data, OpSize::i64Bit); } void OpDispatchBuilder::FILDF64(OpcodeArgs) { const auto ReadWidth = OpSizeFromSrc(Op); // Read from memory Ref Data = LoadSourceGPR_WithOpSize(Op, Op->Src[0], ReadWidth, Op->Flags); if (ReadWidth == OpSize::i16Bit) { Data = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(ReadWidth), 0, Data); } auto ConvertedData = _Float_FromGPR_S(OpSize::i64Bit, ReadWidth == OpSize::i32Bit ? 
OpSize::i32Bit : OpSize::i64Bit, Data); _PushStack(ConvertedData, Invalid(), OpSize::iInvalid); } void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) { const auto Size = OpSizeFromSrc(Op); Ref data = _ReadStackValue(0); if (Truncate) { data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); } else { data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data); } StoreResultGPR_WithOpSize(Op, Op->Dest, data, Size, OpSize::i8Bit); if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) { _PopStackDestroy(); } } void OpDispatchBuilder::FADDF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { // Implicit argument case auto Offset = Op->OP & 7; auto St0 = 0; if (ResInST0 == OpResult::RES_STI) { _F80AddStack(Offset, St0); } else { _F80AddStack(St0, Offset); } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } return; } // We have one memory argument Ref arg {}; if (Integer) { arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags); if (Width == OpSize::i16Bit) { arg = _Sbfe(OpSize::i64Bit, 16, 0, arg); } arg = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? 
OpSize::i64Bit : OpSize::i32Bit, arg); } else if (Width == OpSize::i32Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); arg = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg); } else if (Width == OpSize::i64Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } else { FEX_UNREACHABLE; } // top of stack is at offset zero _F80AddValue(0, arg); } // FIXME: following is very similar to FADDF64 void OpDispatchBuilder::FMULF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { // Implicit argument case auto offset = Op->OP & 7; auto st0 = 0; if (ResInST0 == OpResult::RES_STI) { _F80MulStack(offset, st0); } else { _F80MulStack(st0, offset); } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } return; } // We have one memory argument Ref arg {}; if (Integer) { arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags); if (Width == OpSize::i16Bit) { arg = _Sbfe(OpSize::i64Bit, 16, 0, arg); } arg = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? 
OpSize::i64Bit : OpSize::i32Bit, arg); } else if (Width == OpSize::i32Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); arg = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg); } else if (Width == OpSize::i64Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } else { FEX_UNREACHABLE; } // top of stack is at offset zero _F80MulValue(0, arg); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } } void OpDispatchBuilder::FDIVF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { const auto offset = Op->OP & 7; const auto st0 = 0; if (Reverse) { if (ResInST0 == OpResult::RES_STI) { _F80DivStack(offset, st0, offset); } else { _F80DivStack(st0, offset, st0); } } else { if (ResInST0 == OpResult::RES_STI) { _F80DivStack(offset, offset, st0); } else { _F80DivStack(st0, st0, offset); } } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } return; } // We have one memory argument Ref Arg {}; if (Width == OpSize::i16Bit || Width == OpSize::i32Bit || Width == OpSize::i64Bit) { if (Integer) { Arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags); if (Width == OpSize::i16Bit) { Arg = _Sbfe(OpSize::i64Bit, 16, 0, Arg); } Arg = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? 
OpSize::i64Bit : OpSize::i32Bit, Arg); } else if (Width == OpSize::i32Bit) { Arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); Arg = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, Arg); } else if (Width == OpSize::i64Bit) { Arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } } else { FEX_UNREACHABLE; } // top of stack is at offset zero if (Reverse) { _F80DivRValue(Arg, 0); } else { _F80DivValue(0, Arg); } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } } void OpDispatchBuilder::FSUBF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { const auto Offset = Op->OP & 7; const auto St0 = 0; if (Reverse) { if (ResInST0 == OpResult::RES_STI) { _F80SubStack(Offset, St0, Offset); } else { _F80SubStack(St0, Offset, St0); } } else { if (ResInST0 == OpResult::RES_STI) { _F80SubStack(Offset, Offset, St0); } else { _F80SubStack(St0, St0, Offset); } } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } return; } // We have one memory argument Ref arg {}; if (Width == OpSize::i16Bit || Width == OpSize::i32Bit || Width == OpSize::i64Bit) { if (Integer) { arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags); if (Width == OpSize::i16Bit) { arg = _Sbfe(OpSize::i64Bit, 16, 0, arg); } arg = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, arg); } else if (Width == OpSize::i32Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); arg = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg); } else if (Width == OpSize::i64Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } } else { FEX_UNREACHABLE; } // top of stack is at offset zero if (Reverse) { _F80SubRValue(arg, 0); } else { _F80SubValue(0, arg); } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } } void OpDispatchBuilder::FTSTF64(OpcodeArgs) { // We are going to clobber NZCV, make sure it's in a GPR first. 
SaveNZCV(); // Now we do our comparison. _F80StackTest(0); ConvertNZCVToX87(); } void OpDispatchBuilder::FCOMIF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::FCOMIFlags WhichFlags, bool PopTwice) { Ref arg {}; Ref b {}; if (Op->Src[0].IsNone()) { // Implicit arg uint8_t offset = Op->OP & 7; b = _ReadStackValue(offset); } else if (Width == OpSize::i16Bit || Width == OpSize::i32Bit || Width == OpSize::i64Bit) { // Memory arg if (Integer) { arg = LoadSourceGPR(Op, Op->Src[0], Op->Flags); if (Width == OpSize::i16Bit) { arg = _Sbfe(OpSize::i64Bit, 16, 0, arg); } b = _Float_FromGPR_S(OpSize::i64Bit, Width == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, arg); } else if (Width == OpSize::i32Bit) { arg = LoadSourceFPR(Op, Op->Src[0], Op->Flags); b = _Float_FToF(OpSize::i64Bit, OpSize::i32Bit, arg); } else if (Width == OpSize::i64Bit) { b = LoadSourceFPR(Op, Op->Src[0], Op->Flags); } } else { FEX_UNREACHABLE; } if (WhichFlags == FCOMIFlags::FLAGS_X87) { // We are going to clobber NZCV, make sure it's in a GPR first. SaveNZCV(); _F80CmpValue(b); ConvertNZCVToX87(); } else { HandleNZCVWrite(); _F80CmpValue(b); ComissFlags(true /* InvalidateAF */); } if (PopTwice) { _PopStackDestroy(); _PopStackDestroy(); } else if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) { _PopStackDestroy(); } } void OpDispatchBuilder::X87FXTRACTF64(OpcodeArgs) { // Split node into SIG and EXP while handling the special zero case. // i.e. if val == 0.0, then sig = 0.0, exp = -inf // if val == -0.0, then sig = -0.0, exp = -inf // otherwise we just extract the 64-bit sig and exp as normal. 
Ref Node = _ReadStackValue(0); Ref Gpr = _VExtractToGPR(OpSize::i64Bit, OpSize::i64Bit, Node, 0); // zero case Ref ExpZV = _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, Constant(0xfff0'0000'0000'0000UL)); Ref SigZV = Node; // non zero case Ref ExpNZ = _Bfe(OpSize::i64Bit, 11, 52, Gpr); ExpNZ = Sub(OpSize::i64Bit, ExpNZ, Constant(1023)); Ref ExpNZV = _Float_FromGPR_S(OpSize::i64Bit, OpSize::i64Bit, ExpNZ); Ref SigNZ = _And(OpSize::i64Bit, Gpr, Constant(0x800f'ffff'ffff'ffffLL)); SigNZ = _Or(OpSize::i64Bit, SigNZ, Constant(0x3ff0'0000'0000'0000LL)); Ref SigNZV = _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, SigNZ); // Comparison and select to push onto stack SaveNZCV(); _TestNZ(OpSize::i64Bit, Gpr, Constant(0x7fff'ffff'ffff'ffffUL)); Ref Sig = _NZCVSelectV(OpSize::i64Bit, CondClass::EQ, SigZV, SigNZV); Ref Exp = _NZCVSelectV(OpSize::i64Bit, CondClass::EQ, ExpZV, ExpNZV); _PopStackDestroy(); _PushStack(Exp, Invalid(), OpSize::iInvalid); _PushStack(Sig, Invalid(), OpSize::iInvalid); } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-to-ir, opcodes|dispatcher-implementations desc: Handles x86/64 ops to IR, no-pf opt, local-flags opt $end_info$ */ #include "FEXCore/Core/HostFeatures.h" #include "FEXCore/Utils/Telemetry.h" #include "Interface/Context/Context.h" #include "Interface/Core/OpcodeDispatcher.h" #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore::IR { using X86Tables::OpToIndex; #define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op void OpDispatchBuilder::SyscallOp(OpcodeArgs, bool IsSyscallInst) { constexpr size_t SyscallArgs = 7; using SyscallArray = std::array; size_t 
NumArguments {}; const SyscallArray* GPRIndexes {}; static constexpr SyscallArray GPRIndexes_64 = { FEXCore::X86State::REG_RAX, FEXCore::X86State::REG_RDI, FEXCore::X86State::REG_RSI, FEXCore::X86State::REG_RDX, FEXCore::X86State::REG_R10, FEXCore::X86State::REG_R8, FEXCore::X86State::REG_R9, }; static constexpr SyscallArray GPRIndexes_32 = { FEXCore::X86State::REG_RAX, FEXCore::X86State::REG_RBX, FEXCore::X86State::REG_RCX, FEXCore::X86State::REG_RDX, FEXCore::X86State::REG_RSI, FEXCore::X86State::REG_RDI, FEXCore::X86State::REG_RBP, }; const auto OSABI = CTX->SyscallHandler->GetOSABI(); if (OSABI == FEXCore::HLE::SyscallOSABI::OS_LINUX64) { NumArguments = GPRIndexes_64.size(); GPRIndexes = &GPRIndexes_64; } else if (OSABI == FEXCore::HLE::SyscallOSABI::OS_LINUX32) { NumArguments = GPRIndexes_32.size(); GPRIndexes = &GPRIndexes_32; } else if (OSABI == FEXCore::HLE::SyscallOSABI::OS_GENERIC) { // All registers will be spilled before the syscall and filled afterwards so no JIT-side argument handling is necessary. NumArguments = 0; GPRIndexes = nullptr; } else { ERROR_AND_DIE_FMT("Unhandled OSABI syscall"); } // Calculate flags early. CalculateDeferredFlags(); const auto GPRSize = GetGPROpSize(); auto NewRIP = GetRelocatedPC(Op, -Op->InstSize); _StoreContextGPR(GPRSize, NewRIP, offsetof(FEXCore::Core::CPUState, rip)); Ref Arguments[SyscallArgs] { InvalidNode, InvalidNode, InvalidNode, InvalidNode, InvalidNode, InvalidNode, InvalidNode, }; for (size_t i = 0; i < NumArguments; ++i) { Arguments[i] = LoadGPRRegister(GPRIndexes->at(i)); } if (IsSyscallInst) { // If this is the `Syscall` instruction rather than `int 0x80` then we need to do some additional work. // RCX = RIP after this instruction // R11 = EFlags // Calculate flags. 
CalculateDeferredFlags(); auto RFLAG = GetPackedRFLAG(); StoreGPRRegister(X86State::REG_R11, RFLAG, OpSize::i64Bit); auto RIPAfterInst = GetRelocatedPC(Op); StoreGPRRegister(X86State::REG_RCX, RIPAfterInst, OpSize::i64Bit); } FlushRegisterCache(); auto SyscallOp = _Syscall(Arguments[0], Arguments[1], Arguments[2], Arguments[3], Arguments[4], Arguments[5], Arguments[6]); // Generic ABI doesn't store result in RAX. if (OSABI != FEXCore::HLE::SyscallOSABI::OS_GENERIC) { StoreGPRRegister(X86State::REG_RAX, SyscallOp); } if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_BLOCK_END) { // RIP could have been updated after coming back from the Syscall. NewRIP = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, rip)); ExitFunction(NewRIP); } } void OpDispatchBuilder::ThunkOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); uint8_t* sha256 = (uint8_t*)(Op->PC + 2); if (Is64BitMode) { // x86-64 ABI puts the function argument in RDI Thunk(LoadGPRRegister(X86State::REG_RDI), *reinterpret_cast(sha256)); } else { // x86 fastcall ABI puts the function argument in ECX Thunk(LoadGPRRegister(X86State::REG_RCX), *reinterpret_cast(sha256)); } auto NewRIP = Pop(GPRSize); // Store the new RIP ExitFunction(NewRIP, BranchHint::Return); BlockSetRIP = true; } void OpDispatchBuilder::LEAOp(OpcodeArgs) { // LEA specifically ignores segment prefixes const auto SrcSize = OpSizeFromSrc(Op); const auto OpAddr = X86Tables::DecodeFlags::GetOpAddr(Op->Flags, 0); OpSize DstSize {}; if (Is64BitMode) { DstSize = OpAddr == X86Tables::DecodeFlags::FLAG_OPERAND_SIZE_LAST ? OpSize::i16Bit : OpAddr == X86Tables::DecodeFlags::FLAG_WIDENING_SIZE_LAST ? OpSize::i64Bit : OpSize::i32Bit; } else { DstSize = OpAddr == X86Tables::DecodeFlags::FLAG_OPERAND_SIZE_LAST ? 
OpSize::i16Bit : OpSize::i32Bit; } auto Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags, {.LoadData = false, .AllowUpperGarbage = SrcSize > DstSize}); StoreResultGPR_WithOpSize(Op, Op->Dest, Src, DstSize); } void OpDispatchBuilder::NOPOp(OpcodeArgs) {} void OpDispatchBuilder::RETOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP)); Ref NewRIP = Pop(GPRSize, SP); if (Op->OP == 0xC2) { auto Offset = LoadSourceGPR(Op, Op->Src[0], Op->Flags); SP = Add(GPRSize, SP, Offset); } // Store the new stack pointer StoreGPRRegister(X86State::REG_RSP, SP); // Store the new RIP ExitFunction(NewRIP, BranchHint::Return); BlockSetRIP = true; } /* stack contains: Size of each member is 64-bit, 32-bit, or 16-bit depending on operating size RIP CS EFLAGS RSP SS */ void OpDispatchBuilder::IRETOp(OpcodeArgs) { // Operand Size override unsupported! if ((Op->Flags & X86Tables::DecodeFlags::FLAG_OPERAND_SIZE) != 0) { LogMan::Msg::EFmt("IRET only implemented for 64bit and 32bit sizes"); DecodeFailure = true; return; } const auto GPRSize = GetGPROpSize(); Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP)); // RIP (64/32/16 bits) auto NewRIP = Pop(GPRSize, SP); // CS (lower 16 used) auto NewSegmentCS = Pop(GPRSize, SP); _StoreContextGPR(OpSize::i16Bit, NewSegmentCS, offsetof(FEXCore::Core::CPUState, cs_idx)); UpdatePrefixFromSegment(NewSegmentCS, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX); // eflags (lower 16 used) SetPackedRFLAG(false, Pop(GPRSize, SP)); if (Is64BitMode) { // RSP and SS only happen in 64-bit mode or if this is a CPL mode jump! 
// FEX doesn't support a CPL mode switch, so don't need to worry about this on 32-bit StoreGPRRegister(X86State::REG_RSP, Pop(GPRSize, SP)); // ss auto NewSegmentSS = Pop(GPRSize, SP); _StoreContextGPR(OpSize::i16Bit, NewSegmentSS, offsetof(FEXCore::Core::CPUState, ss_idx)); UpdatePrefixFromSegment(NewSegmentSS, FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX); } else { // Store the stack in 32-bit mode StoreGPRRegister(X86State::REG_RSP, SP); } ExitFunction(NewRIP); BlockSetRIP = true; } void OpDispatchBuilder::CallbackReturnOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); // Store the new RIP _CallbackReturn(); auto NewRIP = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, rip)); // This ExitFunction won't actually get hit but needs to exist ExitFunction(NewRIP); BlockSetRIP = true; } void OpDispatchBuilder::SecondaryALUOp(OpcodeArgs) { FEXCore::IR::IROps IROp, AtomicIROp; #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_1) << 6) | (prefix) << 3 | (Reg)) switch (Op->OP) { case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 0): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 0): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 0): IROp = FEXCore::IR::IROps::OP_ADD; AtomicIROp = FEXCore::IR::IROps::OP_ATOMICFETCHADD; break; case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 1): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 1): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 1): IROp = FEXCore::IR::IROps::OP_OR; AtomicIROp = FEXCore::IR::IROps::OP_ATOMICFETCHOR; break; case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 4): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 4): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 4): IROp = FEXCore::IR::IROps::OP_ANDWITHFLAGS; AtomicIROp = FEXCore::IR::IROps::OP_ATOMICFETCHAND; break; case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 5): case 
OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 5): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 5): IROp = FEXCore::IR::IROps::OP_SUB; AtomicIROp = FEXCore::IR::IROps::OP_ATOMICFETCHSUB; break; case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x80), 6): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x81), 6): case OPD(FEXCore::X86Tables::TYPE_GROUP_1, OpToIndex(0x83), 6): IROp = FEXCore::IR::IROps::OP_XOR; AtomicIROp = FEXCore::IR::IROps::OP_ATOMICFETCHXOR; break; default: IROp = FEXCore::IR::IROps::OP_LAST; AtomicIROp = FEXCore::IR::IROps::OP_LAST; LogMan::Msg::EFmt("Unknown ALU Op: 0x{:x}", Op->OP); DecodeFailure = true; return; }; #undef OPD ALUOp(Op, IROp, AtomicIROp, 1); } void OpDispatchBuilder::ADCOp(OpcodeArgs, uint32_t SrcIndex) { // Calculate flags early. CalculateDeferredFlags(); Ref Src = LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromDst(Op); const auto OpSize = std::max(OpSize::i32Bit, Size); Ref Before {}; if (DestIsLockedMem(Op)) { auto ALUOp = IncrementByCarry(OpSize, Src); HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); Before = _AtomicFetchAdd(Size, ALUOp, DestMem); } else { Before = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } Ref Result; if (!DestIsLockedMem(Op) && Op->Src[SrcIndex].IsLiteral() && Op->Src[SrcIndex].Literal() == 0 && Size >= OpSize::i32Bit) { HandleNZCV_RMW(); RectifyCarryInvert(true); Result = _AdcZeroWithFlags(OpSize, Before); SetRFLAG(Before); CalculatePF(Result); CFInverted = false; } else { Result = CalculateFlags_ADC(Size, Before, Src); } if (!DestIsLockedMem(Op)) { StoreResultGPR(Op, Result); } } void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) { // Calculate flags early. 
CalculateDeferredFlags(); Ref Src = LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromDst(Op); const auto OpSize = std::max(OpSize::i32Bit, Size); Ref Result {}; Ref Before {}; if (DestIsLockedMem(Op)) { HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); auto SrcPlusCF = IncrementByCarry(OpSize, Src); Before = _AtomicFetchSub(Size, SrcPlusCF, DestMem); } else { Before = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } Result = CalculateFlags_SBB(Size, Before, Src); if (!DestIsLockedMem(Op)) { StoreResultGPR(Op, Result); } } void OpDispatchBuilder::SALCOp(OpcodeArgs) { CalculateDeferredFlags(); auto Result = NZCVSelect(OpSize::i32Bit, CondClass::UGE /* CF = 1 */, _InlineConstant(0xffffffff), _InlineConstant(0)); StoreResultGPR(Op, Result); } void OpDispatchBuilder::PUSHOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Push(Size, LoadSourceGPR(Op, Op->Src[0], Op->Flags)); } void OpDispatchBuilder::PUSHREGOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Push(Size, LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true})); } void OpDispatchBuilder::PUSHAOp(OpcodeArgs) { // 32bit only const auto Size = OpSizeFromSrc(Op); Ref OldSP = _Copy(LoadGPRRegister(X86State::REG_RSP)); Push(Size, LoadGPRRegister(X86State::REG_RAX)); Push(Size, LoadGPRRegister(X86State::REG_RCX)); Push(Size, LoadGPRRegister(X86State::REG_RDX)); Push(Size, LoadGPRRegister(X86State::REG_RBX)); Push(Size, OldSP); Push(Size, LoadGPRRegister(X86State::REG_RBP)); Push(Size, LoadGPRRegister(X86State::REG_RSI)); Push(Size, LoadGPRRegister(X86State::REG_RDI)); } void OpDispatchBuilder::PUSHSegmentOp(OpcodeArgs, uint32_t SegmentReg) { const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); Ref Src {}; if (!Is64BitMode) { switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, es_idx)); 
break; } case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, cs_idx)); break; } case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, ss_idx)); break; } case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, ds_idx)); break; } case FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, fs_idx)); break; } case FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX: { Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, gs_idx)); break; } default: FEX_UNREACHABLE; } } else { switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, es_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, cs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, ss_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, ds_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, fs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX: Src = _LoadContextGPR(SrcSize, offsetof(FEXCore::Core::CPUState, gs_cached)); break; default: FEX_UNREACHABLE; } } // Store our value to the new stack location // AMD hardware zexts segment selector to 32bit // Intel hardware inserts segment selector Push(DstSize, Src); } void OpDispatchBuilder::POPOp(OpcodeArgs) { Ref Value = Pop(OpSizeFromSrc(Op)); StoreResultGPR(Op, Value); } void OpDispatchBuilder::POPAOp(OpcodeArgs) { // 32bit only const auto Size = OpSizeFromSrc(Op); Ref 
SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP)); StoreGPRRegister(X86State::REG_RDI, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RSI, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size); // Skip loading RSP because it'll be correct at the end SP = _RMWHandle(Add(OpSize::i64Bit, SP, IR::OpSizeToSize(Size))); StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RCX, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RAX, Pop(Size, SP), Size); // Store the new stack pointer StoreGPRRegister(X86State::REG_RSP, SP); } void OpDispatchBuilder::POPSegmentOp(OpcodeArgs, uint32_t SegmentReg) { const auto SrcSize = OpSizeFromSrc(Op); const auto DstSize = OpSizeFromDst(Op); auto NewSegment = Pop(SrcSize); switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, es_idx)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, cs_idx)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: // Unset the 'active' bit in the packed TF, skipping the single step exception after this instruction SetRFLAG(_And(OpSize::i32Bit, GetRFLAG(FEXCore::X86State::RFLAG_TF_RAW_LOC), Constant(1))); _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, ss_idx)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, ds_idx)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX: _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, fs_idx)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX: _StoreContextGPR(DstSize, NewSegment, offsetof(FEXCore::Core::CPUState, gs_idx)); break; default: break; // Do nothing } UpdatePrefixFromSegment(NewSegment, SegmentReg); } 
void OpDispatchBuilder::LEAVEOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); const auto OperandSize = (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_OPERAND_SIZE) ? OpSize::i16Bit : GPRSize; // First we move RBP in to RSP and then behave effectively like a pop auto SP = _RMWHandle(LoadGPRRegister(X86State::REG_RBP)); auto NewGPR = Pop(OperandSize, SP); // Store the new stack pointer StoreGPRRegister(X86State::REG_RSP, SP, OperandSize); // Store what we loaded to RBP StoreGPRRegister(X86State::REG_RBP, NewGPR, OperandSize); } void OpDispatchBuilder::CALLOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); BlockSetRIP = true; // Call instruction only uses up to 32-bit signed displacement const int64_t TargetOffset = Op->Src[0].Literal(); const auto ConstantPC = GetRelocatedPC(Op); // Push the return address. Push(GPRSize, ConstantPC); if (TargetOffset != 0) { // Store the RIP const uint64_t NextRIP = Op->PC + Op->InstSize; ExitRelocatedPC(Op, TargetOffset, BranchHint::Call, ConstantPC, [&]() { auto CallReturnJumpTarget = JumpTargets.find(NextRIP); if (CallReturnJumpTarget != JumpTargets.end() && CallReturnJumpTarget->second.IsEntryPoint) { return CallReturnJumpTarget->second.BlockEntry; } return InvalidNode; }()); } else { NeedsBlockEnd = true; } } void OpDispatchBuilder::CALLAbsoluteOp(OpcodeArgs) { BlockSetRIP = true; const auto Size = OpSizeFromSrc(Op); Ref JMPPCOffset = LoadSourceGPR(Op, Op->Src[0], Op->Flags); // Push the return address. 
auto ConstantPC = GetRelocatedPC(Op); Push(Size, ConstantPC); // Store the RIP const uint64_t NextRIP = Op->PC + Op->InstSize; ExitFunction(JMPPCOffset, BranchHint::Call, ConstantPC, [&]() { auto CallReturnJumpTarget = JumpTargets.find(NextRIP); if (CallReturnJumpTarget != JumpTargets.end() && CallReturnJumpTarget->second.IsEntryPoint) { return CallReturnJumpTarget->second.BlockEntry; } return InvalidNode; }()); } std::optional OpDispatchBuilder::DecodeNZCVCondition(uint8_t OP) { switch (OP) { case 0x0: { // JO - Jump if OF == 1 return CondClass::FU; } case 0x1: { // JNO - Jump if OF == 0 return CondClass::FNU; } case 0x2: { // JC - Jump if CF == 1 return CFInverted ? CondClass::ULT : CondClass::UGE; } case 0x3: { // JNC - Jump if CF == 0 return CFInverted ? CondClass::UGE : CondClass::ULT; } case 0x4: { // JE - Jump if ZF == 1 return CondClass::EQ; } case 0x5: { // JNE - Jump if ZF == 0 return CondClass::NEQ; } case 0x6: { // JNA - Jump if CF == 1 || ZF == 1 // With CF, we want (C == 0 || Z == 1). By De Morgan's, that's // equivalent to !(C == 1 && Z == 0). That's .ls RectifyCarryInvert(true); return CondClass::ULE; } case 0x7: { // JA - Jump if CF == 0 && ZF == 0 // With CF inverted, we want (C == 1 && Z == 0). That's .hi RectifyCarryInvert(true); return CondClass::UGT; } case 0x8: { // JS - Jump if SF == 1 return CondClass::MI; } case 0x9: { // JNS - Jump if SF == 0 return CondClass::PL; } case 0xC: { // SF <> OF return CondClass::SLT; } case 0xD: { // SF = OF return CondClass::SGE; } case 0xE: { // ZF = 1 || SF <> OF return CondClass::SLE; } case 0xF: { // ZF = 0 && SF = OF return CondClass::SGT; } default: // Other conditions do not map directly, caller gets to deal with it. 
return std::nullopt; } } static bool ParityJumpIsJP(uint8_t OP) { LOGMAN_THROW_A_FMT(OP == 0xA || OP == 0xB, "JP or JNP"); return OP == 0xA; } Ref OpDispatchBuilder::SelectCC0All1(uint8_t OP) { if (auto Cond = DecodeNZCVCondition(OP); Cond) { // Use raw select since DecodeNZCVCondition handles the carry invert return _NZCVSelect(OpSize::i64Bit, *Cond, _InlineConstant(~0ULL), _InlineConstant(0)); } else { // Raw value contains inverted PF in bottom bit return _Sbfe(OpSize::i64Bit, 1, 0, LoadPFRaw(false, ParityJumpIsJP(OP))); } } void OpDispatchBuilder::SETccOp(OpcodeArgs) { CalculateDeferredFlags(); Ref SrcCond; if (auto Cond = DecodeNZCVCondition(Op->OP & 0xf); Cond) { // Use raw select since DecodeNZCVCondition handles the carry invert SrcCond = _NZCVSelect01(*Cond); } else { SrcCond = LoadPFRaw(true, ParityJumpIsJP(Op->OP & 0xf)); } StoreResultGPR(Op, SrcCond); } void OpDispatchBuilder::CMOVOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); const auto OP = Op->OP & 0xF; const auto ResultSize = std::max(OpSize::i32Bit, OpSizeFromSrc(Op)); CalculateDeferredFlags(); // Destination is always a GPR. Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GPRSize, Op->Flags); Ref Src {}, SrcCond {}; if (Op->Src[0].IsGPR()) { Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], GPRSize, Op->Flags); } else { Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags); } if (auto Cond = DecodeNZCVCondition(OP); Cond) { // Use raw select since DecodeNZCVCondition handles the carry invert SrcCond = _NZCVSelect(ResultSize, *Cond, Src, Dest); } else { // Raw value contains inverted PF in bottom bit Ref Cmp = LoadPFRaw(false, ParityJumpIsJP(OP)); SaveNZCV(); // Because we're only clobbering NZCV internally, we ignore all carry flag // shenanigans and just use the raw test and raw select. 
_TestNZ(OpSize::i32Bit, Cmp, _InlineConstant(1));
    SrcCond = _NZCVSelect(ResultSize, CondClass::NEQ, Src, Dest);
  }

  StoreResultGPR(Op, SrcCond);
}

// Jcc with a literal displacement.
//
// Emits a conditional jump whose true edge goes to the branch target and whose
// false edge goes to the next instruction. For each edge, an existing entry in
// JumpTargets is reused; otherwise a fresh block is created that exits to the
// corresponding RIP.
void OpDispatchBuilder::CondJUMPOp(OpcodeArgs) {
  // Calculate flags early.
  CalculateDeferredFlags();

  BlockSetRIP = true;

  // Jump instruction only uses up to 32-bit signed displacement
  int64_t TargetOffset = Op->Src[0].Literal();
  uint64_t InstRIP = Op->PC + Op->InstSize;
  uint64_t Target = InstRIP + TargetOffset;

  if (GetGPROpSize() == OpSize::i32Bit) {
    // If the GPRSize is 4 then we need to be careful about PC wrapping
    if (TargetOffset < 0 && -TargetOffset > InstRIP) {
      // Invert the signed value if we are underflowing
      TargetOffset = 0x1'0000'0000ULL + TargetOffset;
    } else if (TargetOffset >= 0 && Target >= 0x1'0000'0000ULL) {
      // We are overflowing, wrap around
      TargetOffset = TargetOffset - 0x1'0000'0000ULL;
    }
    Target &= 0xFFFFFFFFU;
  }

  FlushRegisterCache();
  auto TrueBlock = JumpTargets.find(Target);
  auto FalseBlock = JumpTargets.find(Op->PC + Op->InstSize);

  // Captured before any new block is created so the fallthrough block can be
  // placed directly after the block that holds the conditional jump.
  auto CurrentBlock = GetCurrentBlock();

  {
    // NOTE(review): IRPair is written without template arguments here; confirm
    // against its declaration.
    IRPair CondJump_;
    auto OP = Op->OP & 0xF;
    auto Cond = DecodeNZCVCondition(OP);
    if (Cond) {
      CondJump_ = CondJumpNZCV(*Cond);
    } else {
      LOGMAN_THROW_A_FMT(OP == 0xA || OP == 0xB, "only PF left");
      // Parity conditions branch on bit 0 of the raw PF value; the last
      // argument is true for nibble 0xB.
      CondJump_ = CondJumpBit(LoadPFRaw(false, false), 0, OP == 0xB);
    }

    // Taking branch block
    if (TrueBlock != JumpTargets.end()) {
      SetTrueJumpTarget(CondJump_, TrueBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      auto JumpTarget = CreateNewCodeBlockAtEnd();
      SetTrueJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Store the new RIP
      ExitRelocatedPC(Op, TargetOffset);
    }

    // Failure to take branch
    if (FalseBlock != JumpTargets.end()) {
      SetFalseJumpTarget(CondJump_, FalseBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      // Place it after this block for fallthrough optimization
      auto JumpTarget = CreateNewCodeBlockAfter(CurrentBlock);
SetFalseJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Leave block & store the new RIP
      ExitRelocatedPC(Op);
    }
  }
}

// Conditional jump on RCX: the branch is taken when the loaded RCX value
// compares EQ in CondJump. FLAG_ADDRESS_SIZE halves the width of the RCX read
// relative to the GPR size.
void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) {
  // Calculate flags early.
  CalculateDeferredFlags();

  BlockSetRIP = true;
  auto JcxGPRSize = GetGPROpSize();
  JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize;

  uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal();

  Ref CondReg = LoadGPRRegister(X86State::REG_RCX, JcxGPRSize);

  auto TrueBlock = JumpTargets.find(Target);
  auto FalseBlock = JumpTargets.find(Op->PC + Op->InstSize);

  // Captured up front so the fallthrough block can be placed after the block
  // holding the conditional jump.
  auto CurrentBlock = GetCurrentBlock();

  {
    auto CondJump_ = CondJump(CondReg, CondClass::EQ);

    // Taking branch block
    if (TrueBlock != JumpTargets.end()) {
      SetTrueJumpTarget(CondJump_, TrueBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      auto JumpTarget = CreateNewCodeBlockAtEnd();
      SetTrueJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Store the new RIP
      ExitRelocatedPC(Op, Op->Src[0].Literal());
    }

    // Failure to take branch
    if (FalseBlock != JumpTargets.end()) {
      SetFalseJumpTarget(CondJump_, FalseBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      // Place it after the current block for fallthrough behavior
      auto JumpTarget = CreateNewCodeBlockAfter(CurrentBlock);
      SetFalseJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Leave block & store the new RIP
      ExitRelocatedPC(Op);
    }
  }
}

// LOOP family: decrements the counter operand (Src[0]), writes it back, and
// branches to the literal target in Src[1] while the (possibly ZF-qualified)
// counter is non-zero.
//
// Opcode 0xE2 performs no ZF check. Opcode 0xE1 additionally requires ZF == 1;
// any other opcode routed here requires ZF == 0.
void OpDispatchBuilder::LoopOp(OpcodeArgs) {
  // Calculate flags early.
  CalculateDeferredFlags();

  bool CheckZF = Op->OP != 0xE2;
  bool ZFTrue = Op->OP == 0xE1;

  BlockSetRIP = true;
  auto SrcSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? OpSize::i32Bit : OpSize::i64Bit;
  // Note: this local shadows the OpSize type name; `OpSize::` on the right-hand
  // side still names the type because qualified lookup ignores variables.
  auto OpSize = SrcSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit;

  if (!Is64BitMode) {
    // RCX size is 32-bit or 16-bit when executing in 32-bit mode.
    SrcSize = IR::SizeToOpSize(IR::OpSizeToSize(SrcSize) >> 1);
    OpSize = OpSize::i32Bit;
  }

  uint64_t Target = Op->PC + Op->InstSize + Op->Src[1].Literal();

  Ref CondReg = LoadSourceGPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags);
  CondReg = Sub(OpSize, CondReg, 1);
  StoreResultGPR(Op, Op->Src[0], CondReg);

  // If LOOPE then jumps to target if RCX != 0 && ZF == 1
  // If LOOPNE then jumps to target if RCX != 0 && ZF == 0
  //
  // To handle efficiently, smash RCX to zero if ZF is wrong (1 csel).
  if (CheckZF) {
    const auto cond = ZFTrue ? CondClass::EQ : CondClass::NEQ;
    CondReg = NZCVSelect(OpSize, cond, CondReg, _InlineConstant(0));
  }

  CalculateDeferredFlags();
  auto TrueBlock = JumpTargets.find(Target);
  auto FalseBlock = JumpTargets.find(Op->PC + Op->InstSize);

  {
    auto CondJump_ = CondJump(CondReg);

    // Taking branch block
    if (TrueBlock != JumpTargets.end()) {
      SetTrueJumpTarget(CondJump_, TrueBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      auto JumpTarget = CreateNewCodeBlockAtEnd();
      SetTrueJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Store the new RIP
      ExitRelocatedPC(Op, Op->Src[1].Literal());
    }

    // Failure to take branch
    if (FalseBlock != JumpTargets.end()) {
      SetFalseJumpTarget(CondJump_, FalseBlock->second.BlockEntry);
    } else {
      // Make sure to start a new block after ending this one
      // Place after this block for fallthrough behavior
      auto JumpTarget = CreateNewCodeBlockAfter(GetCurrentBlock());
      SetFalseJumpTarget(CondJump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();

      // Leave block & store the new RIP
      ExitRelocatedPC(Op);
    }
  }
}

// JMP with a literal (relative) displacement.
void OpDispatchBuilder::JUMPOp(OpcodeArgs) {
  // Calculate flags early.
CalculateDeferredFlags();

  BlockSetRIP = true;

  // Jump instruction only uses up to 32-bit signed displacement
  int64_t TargetOffset = Op->Src[0].Literal();
  uint64_t InstRIP = Op->PC + Op->InstSize;
  uint64_t TargetRIP = InstRIP + TargetOffset;

  if (GetGPROpSize() == OpSize::i32Bit) {
    // If the GPRSize is 4 then we need to be careful about PC wrapping
    if (TargetOffset < 0 && -TargetOffset > InstRIP) {
      // Invert the signed value if we are underflowing
      TargetOffset = 0x1'0000'0000ULL + TargetOffset;
    } else if (TargetOffset >= 0 && TargetRIP >= 0x1'0000'0000ULL) {
      // We are overflowing, wrap around
      TargetOffset = TargetOffset - 0x1'0000'0000ULL;
    }
    TargetRIP &= 0xFFFFFFFFU;
  }

  CalculateDeferredFlags();
  // This is just an unconditional relative literal jump
  if (Multiblock) {
    auto JumpBlock = JumpTargets.find(TargetRIP);
    if (JumpBlock != JumpTargets.end()) {
      // Target is a known jump target: branch straight to its block.
      Jump(GetNewJumpBlock(TargetRIP));
    } else {
      // If the block isn't a jump target then we need to create an exit block
      auto Jump_ = Jump();

      // Place after this block for fallthrough behavior
      auto JumpTarget = CreateNewCodeBlockAfter(GetCurrentBlock());
      SetJumpTarget(Jump_, JumpTarget);
      SetCurrentCodeBlock(JumpTarget);
      StartNewBlock();
      ExitRelocatedPC(Op, TargetOffset);
    }
  } else {
    ExitRelocatedPC(Op, TargetOffset);
  }
}

// JMP through a register or memory operand: the new RIP is loaded from Src[0]
// and the function is exited to it.
void OpDispatchBuilder::JUMPAbsoluteOp(OpcodeArgs) {
  // Calculate flags early.
  CalculateDeferredFlags();

  BlockSetRIP = true;
  // This is just an unconditional jump
  // This uses ModRM to determine its location
  // No way to use this effectively in multiblock
  auto RIPOffset = LoadSourceGPR(Op, Op->Src[0], Op->Flags);

  // Store the new RIP
  ExitFunction(RIPOffset);
}

// Far JMP through memory: loads a new RIP and a new CS selector from the
// memory operand, installs the selector, then exits to the new RIP.
void OpDispatchBuilder::JUMPFARIndirectOp(OpcodeArgs) {
  // Calculate flags early.
CalculateDeferredFlags();

  BlockSetRIP = true;
  // This is just an unconditional jump
  // This uses ModRM to determine its location
  // No way to use this effectively in multiblock
  Ref Src = MakeSegmentAddress(Op, Op->Dest);
  // The selector is read from 4 bytes past the offset.
  AddressMode SrcCS = {.Base = Src, .Offset = 4, .AddrSize = OpSize::i64Bit};

  auto RIPOffset = _LoadMemGPRAutoTSO(OpSize::i32Bit, Src, OpSize::i8Bit);
  auto NewSegmentCS = _LoadMemGPRAutoTSO(OpSize::i16Bit, SrcCS, OpSize::i8Bit);

  // Set up the new CSSegment.
  _StoreContextGPR(OpSize::i16Bit, NewSegmentCS, offsetof(FEXCore::Core::CPUState, cs_idx));
  UpdatePrefixFromSegment(NewSegmentCS, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX);

  // Store the new RIP
  ExitFunction(RIPOffset);
}

// Far CALL through memory: loads a new RIP (32-bit) and CS selector (16-bit,
// at +4) from the memory operand, pushes the current CS and the return
// address, installs the new selector, then exits to the new RIP. The pushes
// are 64-bit when FLAG_REX_WIDENING is set and 32-bit otherwise.
void OpDispatchBuilder::CALLFARIndirectOp(OpcodeArgs) {
  const auto SrcSize = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REX_WIDENING ? OpSize::i64Bit : OpSize::i32Bit;

  // Calculate flags early.
  CalculateDeferredFlags();

  BlockSetRIP = true;

  Ref Src = MakeSegmentAddress(Op, Op->Dest);
  AddressMode SrcCS = {.Base = Src, .Offset = 4, .AddrSize = OpSize::i64Bit};

  auto RIPOffset = _LoadMemGPRAutoTSO(OpSize::i32Bit, Src, OpSize::i8Bit);
  auto NewSegmentCS = _LoadMemGPRAutoTSO(OpSize::i16Bit, SrcCS, OpSize::i8Bit);

  auto CurrentCS = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, cs_idx));
  auto NewRIP = GetRelocatedPC(Op);

  // Push the current CS
  Push(SrcSize, CurrentCS);
  // Push the return address.
  Push(SrcSize, NewRIP);

  // Set up the new CSSegment.
  _StoreContextGPR(OpSize::i16Bit, NewSegmentCS, offsetof(FEXCore::Core::CPUState, cs_idx));
  UpdatePrefixFromSegment(NewSegmentCS, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX);

  // Store the new RIP
  ExitFunction(RIPOffset);
}

// Far RET: pops the new RIP and then the new CS selector, optionally adds the
// literal in Src[0] to the stack pointer, installs the selector, and exits to
// the popped RIP. Pops are 64-bit with FLAG_REX_WIDENING, 32-bit otherwise.
void OpDispatchBuilder::RETFARIndirectOp(OpcodeArgs) {
  const auto GPRSize = GetGPROpSize();
  const auto SrcSize = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REX_WIDENING ? OpSize::i64Bit : OpSize::i32Bit;

  Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP));
  Ref NewRIP = Pop(SrcSize, SP);
  Ref NewSegmentCS = Pop(SrcSize, SP);

  // Optional SP offset.
  if (Op->Src[0].IsLiteral()) {
    SP = Add(GPRSize, SP, Op->Src[0].Literal());
  }

  // Store the new stack pointer
  StoreGPRRegister(X86State::REG_RSP, SP);

  _StoreContextGPR(OpSize::i16Bit, NewSegmentCS, offsetof(FEXCore::Core::CPUState, cs_idx));
  UpdatePrefixFromSegment(NewSegmentCS, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX);

  // Store the new RIP
  ExitFunction(NewRIP);
  BlockSetRIP = true;
}

// TEST: flags-only AND of the destination operand with Src[SrcIndex].
void OpDispatchBuilder::TESTOp(OpcodeArgs, uint32_t SrcIndex) {
  // TEST is an instruction that does an AND between the sources
  // Result isn't stored in result, only writes to flags
  Ref Src = LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true});
  Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});

  const auto Size = OpSizeFromDst(Op);
  LOGMAN_THROW_A_FMT(Size >= IR::OpSize::i8Bit && Size <= IR::OpSize::i64Bit, "Invalid size");

  uint64_t Const;
  bool AlwaysNonnegative = false;
  if (IsValueConstant(WrapNode(Src), &Const)) {
    // Optimize out masking constants
    // (the 64-bit case is special-cased so the shift below never reaches 64)
    if (Const == (Size == OpSize::i64Bit ? ~0ULL : ((1ull << IR::OpSizeAsBits(Size)) - 1))) {
      Src = Dest;
    }

    // Optimize test with non-sign bits
    AlwaysNonnegative = (Const & (1ull << (IR::OpSizeAsBits(Size) - 1))) == 0;
  }

  if (Dest == Src) {
    // Optimize out the AND.
    SetNZP_ZeroCV(Size, Src);
  } else if (Size < OpSize::i32Bit && AlwaysNonnegative) {
    // If we know the result is always nonnegative, we can use a 32-bit test.
auto Res = _And(OpSize::i32Bit, Dest, Src);
    CalculatePF(Res);
    SetNZ_ZeroCV(OpSize::i32Bit, Res);
  } else {
    HandleNZ00Write();
    CalculatePF(_AndWithFlags(Size, Dest, Src));
  }

  InvalidateAF();
}

// ARPL: adjusts the RPL field (low two bits) of a 16-bit destination selector
// upwards to match the source selector's RPL, and records whether an
// adjustment happened.
void OpDispatchBuilder::ARPLOp(OpcodeArgs) {
  // ARPL r/m16, r16
  // If the RPL field in the destination selector is less privileged than the
  // RPL field in the source selector, then adjust destination RPL to match
  // source RPL and set ZF=1. Otherwise ZF=0 and destination is unchanged.
  //
  // Only ZF is modified by ARPL.
  constexpr auto Size = OpSize::i16Bit;

  Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true});
  Ref Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], Size, Op->Flags, {.AllowUpperGarbage = true});

  // RPL is the low two bits of the selector.
  Ref DestRPL = _Bfe(OpSize::i32Bit, 2, 0, Dest);
  Ref SrcRPL = _Bfe(OpSize::i32Bit, 2, 0, Src);

  // NeedUpdate is 1 when DestRPL < SrcRPL, else 0.
  Ref NeedUpdate = _Select(OpSize::i32Bit, OpSize::i32Bit, CondClass::ULT, DestRPL, SrcRPL, Constant(1), Constant(0));
  // NOTE(review): SetRFLAG is called without naming a flag. Per the comment
  // above, ZF is the only flag ARPL modifies - confirm the destination flag
  // against SetRFLAG's declaration.
  SetRFLAG(NeedUpdate);

  // Compute adjusted destination selector: (Dest & ~3) | SrcRPL.
  auto NewDest = _Bfxil(OpSize::i32Bit, 2, 0, Dest, SrcRPL);

  // Conditionally select updated selector based on NeedUpdate.
Ref FinalDest = _Select(OpSize::i32Bit, OpSize::i32Bit, CondClass::NEQ, NeedUpdate, Constant(0), NewDest, Dest);

  StoreResultGPR_WithOpSize(Op, Op->Dest, FinalDest, Size);
}

// MOVSXD: moves a 16- or 32-bit source into the destination, sign-extending
// only in the 32-bit + REX.W form (see the table below).
void OpDispatchBuilder::MOVSXDOp(OpcodeArgs) {
  // This instruction is a bit special
  // if SrcSize == 2
  //  Then lower 16 bits of destination is written without changing the upper 48 bits
  // else /* Size == 4 */
  //  if REX_WIDENING:
  //   Sext(32, Src)
  //  else
  //   Zext(32, Src)
  //
  auto Size = std::min(OpSize::i32Bit, OpSizeFromSrc(Op));
  bool Sext = (Size != OpSize::i16Bit) && Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REX_WIDENING;

  // Upper garbage is only tolerated when the value is sign-extended below.
  Ref Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], Size, Op->Flags, {.AllowUpperGarbage = Sext});
  if (Size == OpSize::i16Bit) {
    // This'll make sure to insert in to the lower 16bits without modifying upper bits
    StoreResultGPR_WithOpSize(Op, Op->Dest, Src, Size);
  } else if (Sext) {
    // With REX.W then Sext
    Src = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Src);
    StoreResultGPR(Op, Src);
  } else {
    // Without REX.W then Zext (store result implicitly zero extends)
    StoreResultGPR(Op, Src);
  }
}

// MOVSX: sign-extends the source operand to the destination operand size.
void OpDispatchBuilder::MOVSXOp(OpcodeArgs) {
  // Load garbage in upper bits, since we're sign extending anyway
  const auto Size = OpSizeFromSrc(Op);
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

  // Sign-extend to DstSize and zero-extend to the register size, using a fast
  // path for 32-bit dests where the native 32-bit Sbfe zero extends the top.
  const auto DstSize = OpSizeFromDst(Op);
  Src = _Sbfe(DstSize == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Src);
  StoreResultGPR(Op, Op->Dest, Src);
}

// MOVZX: loads the source without allowing upper garbage and stores it; the
// store performs the zero extension.
void OpDispatchBuilder::MOVZXOp(OpcodeArgs) {
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
  // Store result implicitly zero extends
  StoreResultGPR(Op, Src);
}

// CMP: flags-only subtraction of Src[SrcIndex] from the destination operand.
void OpDispatchBuilder::CMPOp(OpcodeArgs, uint32_t SrcIndex) {
  // CMP is an instruction that does a SUB between the sources
  // Result isn't stored in result, only writes to flags
  Ref Src = LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true});
  Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
  CalculateFlags_SUB(OpSizeFromSrc(Op), Dest, Src);
}

// Stores the source's top bit (bit SrcBitSize - 1), sign-extended as a 1-bit
// field, to the destination.
void OpDispatchBuilder::CQOOp(OpcodeArgs) {
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
  auto Size = OpSizeFromSrc(Op);

  Ref Upper = _Sbfe(std::max(OpSize::i32Bit, Size), 1, GetSrcBitSize(Op) - 1, Src);

  StoreResultGPR(Op, Upper);
}

// XCHG: swaps Src[0] with the destination. The memory-destination form uses an
// atomic swap (or a MonoBackpatcher write); opcode 0x90 with RAX on both sides
// is treated as NOP / PAUSE.
void OpDispatchBuilder::XCHGOp(OpcodeArgs) {
  // Load both the source and the destination
  if (Op->OP == 0x90 && Op->Src[0].IsGPR() && Op->Src[0].Data.GPR.GPR == FEXCore::X86State::REG_RAX && Op->Dest.IsGPR() &&
      Op->Dest.Data.GPR.GPR == FEXCore::X86State::REG_RAX) {
    // This is one heck of a sucky special case
    // If we are the 0x90 XCHG opcode (Meaning source is GPR RAX)
    // and destination register is ALSO RAX
    // and in this very specific case we are 32bit or above
    // Then this is a no-op
    // This is because 0x90 without a prefix is technically `xchg eax, eax`
    // But this would result in a zext on 64bit, which would ruin the no-op nature of the instruction
    // So x86-64 spec mandates this special case that even though it is a 32bit instruction and
    // is supposed to zext the result, it is a true no-op
    //
    // x86 spec text here:
    //
    //   XCHG (E)AX, (E)AX (encoded instruction byte is 90H) is an alias for
    //   NOP regardless of data size prefixes, including REX.W.
    //
    // Note that also includes 16-bit so we don't gate this on size. The
    // sequence (66 90) is a valid two-byte nop that we also ignore.
    if (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX) {
      // If this instruction has a REP prefix then this is architecturally
      // defined to be a `PAUSE` instruction. On older processors this ends up
      // being a true `REP NOP` which is why they stuck this here.
      _Yield();
    }
    return;
  }

  // AllowUpperGarbage: OK to allow as it will be overwritten by StoreResult.
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
  if (DestIsMem(Op)) {
    // Records whether a LOCK prefix was present on this memory-destination form.
    HandledLock = (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK) != 0;

    Ref Dest = MakeSegmentAddress(Op, Op->Dest);
    if (IsMonoBackpatcherBlock) {
      // In this mode only the write is emitted; the source register is not
      // updated with the old memory value.
      _MonoBackpatcherWrite(OpSizeFromSrc(Op), Src, Dest);
    } else {
      auto Result = _AtomicSwap(OpSizeFromSrc(Op), Src, Dest);
      StoreResultGPR(Op, Op->Src[0], Result);
    }
  } else {
    // AllowUpperGarbage: OK to allow as it will be overwritten by StoreResult.
    Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});

    // Swap the contents
    // Order matters here since we don't want to swap context contents for one that effects the other
    StoreResultGPR(Op, Op->Dest, Src);
    StoreResultGPR(Op, Op->Src[0], Dest);
  }
}

// Sign-extends the lower half of RAX (half the destination size) to the full
// destination size and stores it to the destination operand.
void OpDispatchBuilder::CDQOp(OpcodeArgs) {
  const auto DstSize = OpSizeFromDst(Op);
  const auto SrcSize = DstSize / 2;
  Ref Src = LoadGPRRegister(X86State::REG_RAX, SrcSize, 0, true);

  Src = _Sbfe(DstSize <= OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, IR::OpSizeAsBits(SrcSize), 0, Src);

  StoreResultGPR_WithOpSize(Op, Op->Dest, Src, DstSize);
}

// SAHF: writes AH into the low byte of RFLAGS, after clearing bits 3 and 5 and
// forcing bit 1 on.
void OpDispatchBuilder::SAHFOp(OpcodeArgs) {
  // Extract AH
  Ref Src = LoadGPRRegister(X86State::REG_RAX, OpSize::i8Bit, 8);

  // Clear bits that aren't supposed to be set
  Src = _Andn(OpSize::i64Bit, Src, Constant(0b101000));

  // Set the bit that is always set here
  Src = _Or(OpSize::i64Bit, Src, _InlineConstant(0b10));

  // Store the lower 8 bits in to RFLAGS
  SetPackedRFLAG(true, Src);
}

// LAHF: copies the low byte of RFLAGS into AH.
void OpDispatchBuilder::LAHFOp(OpcodeArgs) {
  // Load the lower 8 bits of the Rflags register
  auto RFLAG = GetPackedRFLAG(0xFF);

  // Store the lower 8 bits of the rflags register in to AH
  StoreGPRRegister(X86State::REG_RAX, RFLAG, OpSize::i8Bit, 8);
}

// CMC / CLC / STC / CLD / STD, dispatched on the opcode byte. Opcodes not
// listed in the switch emit nothing beyond the deferred-flag calculation.
void OpDispatchBuilder::FLAGControlOp(OpcodeArgs) {
  // Calculate flags early.
  CalculateDeferredFlags();

  switch (Op->OP) {
  case 0xF5: // CMC
    CarryInvert();
    break;
  case 0xF8: // CLC
    // The argument is the inverted carry, so 1 means CF clear.
    SetCFInverted(Constant(1));
    break;
  case 0xF9: // STC
    SetCFInverted(Constant(0));
    break;
  case 0xFC: // CLD
    // Transformed
    // (DF is stored as +1 for cleared and -1 for set, per the two cases here)
    StoreDF(Constant(1));
    break;
  case 0xFD: // STD
    StoreDF(Constant(-1));
    break;
  }
}

// MOV to/from a segment register. ToSeg selects the direction; the segment
// register is identified by the GPR index of the corresponding operand.
void OpDispatchBuilder::MOVSegOp(OpcodeArgs, bool ToSeg) {
  // In x86-64 mode the accesses to the segment registers end up being constant zero moves
  // Aside from FS/GS
  // In x86-64 mode the accesses to segment registers can actually still touch the segments
  // These write to the selector portion of the register
  //
  // FS and GS are specially handled here though
  // AMD documentation is /wrong/ in this regard
  // AMD documentation claims that the MOV to SReg and POP SReg registers will load a 32bit
  // value in to the HIDDEN portions of the FS and GS registers /OR/ ignored if a null selector is
  // selected for the registers
  // This statement is actually untrue, the instructions will /actually/ load 16bits in to the selector portion of the register!
  // Tested on a Zen+ CPU, the selector is the portion that is modified!
// We don't currently support FS/GS selector modifying, so this needs to be asserted out // The loads here also load the selector, NOT the base if (ToSeg) { Ref Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], OpSize::i16Bit, Op->Flags); switch (Op->Dest.Data.GPR.GPR) { case FEXCore::X86State::REG_RAX: // ES case FEXCore::X86State::REG_R8: // ES _StoreContextGPR(OpSize::i16Bit, Src, offsetof(FEXCore::Core::CPUState, es_idx)); UpdatePrefixFromSegment(Src, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX); break; case FEXCore::X86State::REG_RBX: // DS case FEXCore::X86State::REG_R11: // DS _StoreContextGPR(OpSize::i16Bit, Src, offsetof(FEXCore::Core::CPUState, ds_idx)); UpdatePrefixFromSegment(Src, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); break; case FEXCore::X86State::REG_RCX: // CS case FEXCore::X86State::REG_R9: // CS // CPL3 can't write to this Break(FEXCore::IR::BreakDefinition { .ErrorRegister = 0, .Signal = SIGILL, .TrapNumber = 0, .si_code = 0, }); break; case FEXCore::X86State::REG_RDX: // SS case FEXCore::X86State::REG_R10: // SS _StoreContextGPR(OpSize::i16Bit, Src, offsetof(FEXCore::Core::CPUState, ss_idx)); UpdatePrefixFromSegment(Src, FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX); break; case FEXCore::X86State::REG_RBP: // GS case FEXCore::X86State::REG_R13: // GS if (!Is64BitMode) { _StoreContextGPR(OpSize::i16Bit, Src, offsetof(FEXCore::Core::CPUState, gs_idx)); UpdatePrefixFromSegment(Src, FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX); } else { LogMan::Msg::EFmt("We don't support modifying GS selector in 64bit mode!"); DecodeFailure = true; } break; case FEXCore::X86State::REG_RSP: // FS case FEXCore::X86State::REG_R12: // FS if (!Is64BitMode) { _StoreContextGPR(OpSize::i16Bit, Src, offsetof(FEXCore::Core::CPUState, fs_idx)); UpdatePrefixFromSegment(Src, FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX); } else { LogMan::Msg::EFmt("We don't support modifying FS selector in 64bit mode!"); DecodeFailure = true; } break; default: 
UnimplementedOp(Op); return; } } else { Ref Segment {}; switch (Op->Src[0].Data.GPR.GPR) { case FEXCore::X86State::REG_RAX: // ES case FEXCore::X86State::REG_R8: // ES Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, es_idx)); break; case FEXCore::X86State::REG_RBX: // DS case FEXCore::X86State::REG_R11: // DS Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, ds_idx)); break; case FEXCore::X86State::REG_RCX: // CS case FEXCore::X86State::REG_R9: // CS Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, cs_idx)); break; case FEXCore::X86State::REG_RDX: // SS case FEXCore::X86State::REG_R10: // SS Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, ss_idx)); break; case FEXCore::X86State::REG_RBP: // GS case FEXCore::X86State::REG_R13: // GS if (Is64BitMode) { Segment = Constant(0); } else { Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, gs_idx)); } break; case FEXCore::X86State::REG_RSP: // FS case FEXCore::X86State::REG_R12: // FS if (Is64BitMode) { Segment = Constant(0); } else { Segment = _LoadContextGPR(OpSize::i16Bit, offsetof(FEXCore::Core::CPUState, fs_idx)); } break; default: UnimplementedOp(Op); return; } if (DestIsMem(Op)) { // If the destination is memory then we always store 16-bits only StoreResultGPR_WithOpSize(Op, Op->Dest, Segment, OpSize::i16Bit); } else { // If the destination is a GPR then we follow register storing rules StoreResultGPR(Op, Segment); } } } void OpDispatchBuilder::MOVOffsetOp(OpcodeArgs) { switch (Op->OP) { case 0xA0: case 0xA1: { // Source is memory(literal) // Dest is GPR auto Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.ForceLoad = true}); StoreResultGPR(Op, Op->Dest, Src); break; } case 0xA2: case 0xA3: { // Source is GPR // Dest is memory(literal) Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); // This one is a bit special since the destination is a literal // So 
the destination gets stored in Src[1] StoreResultGPR(Op, Op->Src[1], Src); break; } } } void OpDispatchBuilder::CPUIDOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); Ref Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], GPRSize, Op->Flags); Ref Leaf = LoadGPRRegister(X86State::REG_RCX); Ref RAX = _AllocateGPR(false); Ref RBX = _AllocateGPR(false); Ref RCX = _AllocateGPR(false); Ref RDX = _AllocateGPR(false); _CPUID(Src, Leaf, RAX, RBX, RCX, RDX); StoreGPRRegister(X86State::REG_RAX, RAX); StoreGPRRegister(X86State::REG_RBX, RBX); StoreGPRRegister(X86State::REG_RCX, RCX); StoreGPRRegister(X86State::REG_RDX, RDX); } uint32_t OpDispatchBuilder::GetConstantShift(X86Tables::DecodedOp Op, bool Is1Bit) { if (Is1Bit) { return 1; } else { // x86 masks the shift by 0x3F or 0x1F depending on size of op const auto Size = OpSizeFromSrc(Op); uint64_t Mask = Size == OpSize::i64Bit ? 0x3F : 0x1F; return Op->Src[1].Literal() & Mask; } } void OpDispatchBuilder::XGetBVOp(OpcodeArgs) { Ref Function = LoadGPRRegister(X86State::REG_RCX); auto RAX = _AllocateGPR(false); auto RDX = _AllocateGPR(false); _XGetBV(Function, RAX, RDX); StoreGPRRegister(X86State::REG_RAX, RAX); StoreGPRRegister(X86State::REG_RDX, RDX); } void OpDispatchBuilder::SHLOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); auto Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); Ref Result = _Lshl(Size == OpSize::i64Bit ? OpSize::i64Bit : OpSize::i32Bit, Dest, Src); HandleShift(Op, Result, Dest, ShiftType::LSL, Src); } void OpDispatchBuilder::SHLImmediateOp(OpcodeArgs, bool SHL1Bit) { Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); uint64_t Shift = GetConstantShift(Op, SHL1Bit); const auto Size = GetSrcBitSize(Op); Ref Result = _Lshl(Size == 64 ? 
OpSize::i64Bit : OpSize::i32Bit, Dest, Constant(Shift)); CalculateFlags_ShiftLeftImmediate(OpSizeFromSrc(Op), Result, Dest, Shift); CalculateDeferredFlags(); StoreResultGPR(Op, Result); } void OpDispatchBuilder::SHROp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= OpSize::i32Bit}); auto Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto ALUOp = _Lshr(std::max(OpSize::i32Bit, Size), Dest, Src); HandleShift(Op, ALUOp, Dest, ShiftType::LSR, Src); } void OpDispatchBuilder::SHRImmediateOp(OpcodeArgs, bool SHR1Bit) { const auto Size = GetSrcBitSize(Op); auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 32}); uint64_t Shift = GetConstantShift(Op, SHR1Bit); auto ALUOp = _Lshr(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Constant(Shift)); CalculateFlags_ShiftRightImmediate(OpSizeFromSrc(Op), ALUOp, Dest, Shift); CalculateDeferredFlags(); StoreResultGPR(Op, ALUOp); } void OpDispatchBuilder::SHLDOp(OpcodeArgs) { // Calculate flags early. CalculateDeferredFlags(); const auto Size = GetSrcBitSize(Op); // Allow garbage on the Src if it will be ignored by the Lshr below Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = Size >= 32}); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); // Allow garbage on the shift, we're masking it anyway. Ref Shift = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); // x86 masks the shift by 0x3F or 0x1F depending on size of op. if (Size == 64) { Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F)); } else { Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F)); } // a64 masks the bottom bits, so if we're using a native 32/64-bit shift, we // can negate to do the subtract (it's congruent), which saves a constant. auto ShiftRight = Size >= 32 ? 
_Neg(OpSize::i64Bit, Shift) : Sub(OpSize::i64Bit, Constant(Size), Shift); auto Tmp1 = _Lshl(OpSize::i64Bit, Dest, Shift); auto Tmp2 = _Lshr(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Src, ShiftRight); Ref Res = _Or(OpSize::i64Bit, Tmp1, Tmp2); // If shift count was zero then output doesn't change // Needs to be checked for the 32bit operand case // where shift = 0 and the source register still gets Zext // // TODO: With a backwards pass ahead-of-time, we could stick this in the // if(shift) used for flags. // // TODO: This whole function wants to be wrapped in the if. Maybe b/w pass is // a good idea after all. Res = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, Shift, Constant(0), Dest, Res); HandleShift(Op, Res, Dest, ShiftType::LSL, Shift); } void OpDispatchBuilder::SHLDImmediateOp(OpcodeArgs) { uint64_t Shift = GetConstantShift(Op, false); const auto Size = GetSrcBitSize(Op); Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = Size >= 32}); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 32}); if (Shift != 0) { Ref Res {}; if (Size < 32) { Ref ShiftLeft = Constant(Shift); auto ShiftRight = Size - Shift; auto Tmp1 = _Lshl(OpSize::i64Bit, Dest, ShiftLeft); Ref Tmp2 = ShiftRight ? _Lshr(OpSize::i32Bit, Src, Constant(ShiftRight)) : Src; Res = _Or(OpSize::i64Bit, Tmp1, Tmp2); } else { // 32-bit and 64-bit SHLD behaves like an EXTR where the lower bits are filled from the source. Res = _Extr(OpSizeFromSrc(Op), Dest, Src, Size - Shift); } CalculateFlags_ShiftLeftImmediate(OpSizeFromSrc(Op), Res, Dest, Shift); CalculateDeferredFlags(); StoreResultGPR(Op, Res); } else if (Shift == 0 && Size == 32) { // Ensure Zext still occurs StoreResultGPR(Op, Dest); } } void OpDispatchBuilder::SHRDOp(OpcodeArgs) { // Calculate flags early. // This instruction conditionally generates flags so we need to insure sane state going in. 
CalculateDeferredFlags(); Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); Ref Shift = LoadGPRRegister(X86State::REG_RCX); const auto Size = GetDstBitSize(Op); // x86 masks the shift by 0x3F or 0x1F depending on size of op if (Size == 64) { Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F)); } else { Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F)); } auto ShiftLeft = Sub(OpSize::i64Bit, Constant(Size), Shift); auto Tmp1 = _Lshr(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Shift); auto Tmp2 = _Lshl(OpSize::i64Bit, Src, ShiftLeft); Ref Res = _Or(OpSize::i64Bit, Tmp1, Tmp2); // If shift count was zero then output doesn't change // Needs to be checked for the 32bit operand case // where shift = 0 and the source register still gets Zext Res = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, Shift, Constant(0), Dest, Res); HandleShift(Op, Res, Dest, ShiftType::LSR, Shift); } void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) { Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); uint64_t Shift = GetConstantShift(Op, false); const auto Size = GetSrcBitSize(Op); if (Shift != 0) { Ref Res {}; if (Size < 32) { Ref ShiftRight = Constant(Shift); auto ShiftLeft = Constant(Size - Shift); auto Tmp1 = _Lshr(OpSize::i32Bit, Dest, ShiftRight); auto Tmp2 = _Lshl(OpSize::i64Bit, Src, ShiftLeft); Res = _Or(OpSize::i64Bit, Tmp1, Tmp2); } else { // 32-bit and 64-bit SHRD behaves like an EXTR where the upper bits are filled from the source. 
Res = _Extr(OpSizeFromSrc(Op), Src, Dest, Shift); } StoreResultGPR(Op, Res); CalculateFlags_ShiftRightDoubleImmediate(OpSizeFromSrc(Op), Res, Dest, Shift); } else if (Shift == 0 && Size == 32) { // Ensure Zext still occurs StoreResultGPR(Op, Dest); } } void OpDispatchBuilder::ASHROp(OpcodeArgs, bool Immediate, bool SHR1Bit) { const auto Size = OpSizeFromSrc(Op); const auto OpSize = std::max(OpSize::i32Bit, OpSizeFromDst(Op)); // If Size < 4, then we Sbfe the Dest so we can have garbage. // Otherwise, if Size = Opsize, then both are 4 or 8 and match the a64 // semantics directly, so again we can have garbage. The only case where we // need zero-extension here is when the sizes mismatch. auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = (OpSize == Size) || (Size < OpSize::i32Bit)}); if (Size < OpSize::i32Bit) { Dest = _Sbfe(OpSize::i64Bit, IR::OpSizeAsBits(Size), 0, Dest); } if (Immediate) { uint64_t Shift = GetConstantShift(Op, SHR1Bit); Ref Result = _Ashr(OpSize, Dest, Constant(Shift)); CalculateFlags_SignShiftRightImmediate(OpSizeFromSrc(Op), Result, Dest, Shift); CalculateDeferredFlags(); StoreResultGPR(Op, Result); } else { auto Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); Ref Result = _Ashr(OpSize, Dest, Src); HandleShift(Op, Result, Dest, ShiftType::ASR, Src); } } void OpDispatchBuilder::RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool Is1Bit) { CalculateDeferredFlags(); const uint32_t Size = GetSrcBitSize(Op); const auto OpSize = Size == 64 ? OpSize::i64Bit : OpSize::i32Bit; uint64_t UnmaskedConst {}; // x86 masks the shift by 0x3F or 0x1F depending on size of op. But it's // equivalent to mask to the actual size of the op, that way we can bound // things tighter for 8-bit later in the function. uint64_t Mask = Size == 8 ? 7 : (Size == 64 ? 
0x3F : 0x1F); ArithRef UnmaskedSrc; if (Is1Bit || IsImmediate) { UnmaskedConst = GetConstantShift(Op, Is1Bit); UnmaskedSrc = ARef(UnmaskedConst); } else { UnmaskedSrc = ARef(LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true})); } auto Src = UnmaskedSrc.And(Mask); // We fill the upper bits so we allow garbage on load. auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); if (Size < 32) { // ARM doesn't support 8/16bit rotates. Emulate with an insert // StoreResult truncates back to a 8/16 bit value Dest = _Bfi(OpSize, Size, Left ? (32 - Size) : Size, Dest, Dest); } // To rotate 64-bits left, right-rotate by (64 - Shift) = -Shift mod 64. auto Res = _Ror(OpSize, Dest, (Left ? Src.Neg() : Src).Ref()); StoreResultGPR(Op, Res); if (Is1Bit || IsImmediate) { if (UnmaskedSrc.C) { // Extract the last bit shifted in to CF SetCFDirect(Res, Left ? 0 : Size - 1, true); // For ROR, OF is the XOR of the new CF bit and the most significant bit of the result. // For ROL, OF is the LSB and MSB XOR'd together. // OF is architecturally only defined for 1-bit rotate. if (UnmaskedSrc.C == 1) { auto NewOF = _XorShift(OpSize, Res, Res, ShiftType::LSR, Left ? Size - 1 : 1); SetRFLAG(NewOF, Left ? 0 : Size - 2, true); } } } else { HandleNZCVWrite(); RectifyCarryInvert(true); // We deferred the masking for 8-bit to the flag section, do it here. if (Size == 8) { Src = UnmaskedSrc.And(0x1F); } _RotateFlags(OpSizeFromSrc(Op), Res, Src.Ref(), Left); } } void OpDispatchBuilder::ANDNBMIOp(OpcodeArgs) { auto* Src1 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Src2 = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto Dest = _Andn(OpSizeFromSrc(Op), Src2, Src1); StoreResultGPR(Op, Dest); CalculateFlags_Logical(OpSizeFromSrc(Op), Dest); } void OpDispatchBuilder::BEXTRBMIOp(OpcodeArgs) { // Essentially (Src1 >> Start) & ((1 << Length) - 1) // along with some edge-case handling and flag setting. 
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Src1 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Src2 = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); const auto Size = OpSizeFromSrc(Op); const auto SrcSize = IR::OpSizeAsBits(Size); const auto MaxSrcBit = SrcSize - 1; auto MaxSrcBitOp = Constant(MaxSrcBit); // Shift the operand down to the starting bit auto Start = _Bfe(OpSizeFromSrc(Op), 8, 0, Src2); auto Shifted = _Lshr(Size, Src1, Start); // Shifts larger than operand size need to be set to zero. auto SanitizedShifted = _Select(Size, Size, CondClass::ULE, Start, MaxSrcBitOp, Shifted, Constant(0)); // Now handle the length specifier. auto Length = _Bfe(Size, 8, 8, Src2); // Now build up the mask // (1 << Length) - 1 = ~(~0 << Length) auto AllOnes = Constant(~0ull); auto InvertedMask = _Lshl(Size, AllOnes, Length); // Now put it all together and make the result. auto Masked = _Andn(Size, SanitizedShifted, InvertedMask); // Sanitize the length. If it is above the max, we don't do the masking. auto Dest = _Select(Size, Size, CondClass::ULE, Length, MaxSrcBitOp, Masked, SanitizedShifted); // Finally store the result. StoreResultGPR(Op, Dest); // ZF is set properly. CF and OF are defined as being set to zero. SF, PF, and // AF are undefined. SetNZ_ZeroCV(GetOpSize(Dest), Dest); InvalidatePF_AF(); } void OpDispatchBuilder::BLSIBMIOp(OpcodeArgs) { // Equivalent to performing: SRC & -SRC LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); const auto Size = OpSizeFromSrc(Op); auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto NegatedSrc = _Neg(Size, Src); auto Result = _And(Size, Src, NegatedSrc); StoreResultGPR(Op, Result); // CF is cleared if Src is zero, otherwise it's set. However, Src is zero iff // Result is zero, so we can test the result instead. So, CF is just the // inverted ZF. // // ZF/SF/OF set as usual. 
SetNZ_ZeroCV(Size, Result); InvalidatePF_AF(); SetCFInverted(GetRFLAG(X86State::RFLAG_ZF_RAW_LOC)); } void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) { // Equivalent to: (Src - 1) ^ Src LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); const auto Size = OpSizeFromSrc(Op); auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _Xor(Size, Sub(Size, Src, 1), Src); StoreResultGPR(Op, Result); InvalidatePF_AF(); // CF set according to the Src auto CFInv = To01(OpSize::i64Bit, Src); // The output of BLSMSK is always nonzero, so TST will clear Z (along with C // and O) while setting S. SetNZ_ZeroCV(Size, Result); SetCFInverted(CFInv); } void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) { // Equivalent to: (Src - 1) & Src LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); const auto Size = OpSizeFromSrc(Op); auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _And(Size, Sub(Size, Src, 1), Src); StoreResultGPR(Op, Result); auto CFInv = To01(OpSize::i64Bit, Src); SetNZ_ZeroCV(Size, Result); SetCFInverted(CFInv); InvalidatePF_AF(); } // Handles SARX, SHLX, and SHRX void OpDispatchBuilder::BMI2Shift(OpcodeArgs) { // In the event the source is a memory operand, use the // exact width instead of the GPR size. const auto GPRSize = GetGPROpSize(); const auto Size = OpSizeFromSrc(Op); const auto SrcSize = Op->Src[0].IsGPR() ? 
GPRSize : Size; auto* Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], SrcSize, Op->Flags); auto* Shift = LoadSourceGPR_WithOpSize(Op, Op->Src[1], GPRSize, Op->Flags, {.AllowUpperGarbage = true}); Ref Result; if (Op->OP == 0x6F7) { // SARX Result = _Ashr(Size, Src, Shift); } else if (Op->OP == 0x5F7) { // SHLX Result = _Lshl(Size, Src, Shift); } else { // SHRX Result = _Lshr(Size, Src, Shift); } StoreResultGPR(Op, Result); } void OpDispatchBuilder::BZHI(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); const auto OperandSize = IR::OpSizeAsBits(Size); // In 32-bit mode we only look at bottom 32-bit, no 8 or 16-bit BZHI so no // need to zero-extend sources auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Index = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); // Clear the high bits specified by the index. A64 only considers bottom bits // of the shift, so we don't need to mask bottom 8-bits ourselves. // Out-of-bounds results ignored after. auto Mask = _Lshl(Size, Constant(-1), Index); auto MaskResult = _Andn(Size, Src, Mask); // If the index is above OperandSize, we don't clear anything. BZHI only // considers the bottom 8-bits, so we really want to know if the bottom 8-bits // have their top bits set. Test exactly that. // // Because we're clobbering flags internally we ignore all carry invert // shenanigans and use the raw versions here. 
_TestNZ(OpSize::i64Bit, Index, Constant(0xFF & ~(OperandSize - 1))); auto Result = _NZCVSelect(Size, CondClass::NEQ, Src, MaskResult); StoreResultGPR(Op, Result); auto CFInv = _NZCVSelect01(CondClass::EQ); InvalidatePF_AF(); SetNZ_ZeroCV(Size, Result); SetCFInverted(CFInv); } void OpDispatchBuilder::RORX(OpcodeArgs) { const auto SrcSize = OpSizeFromSrc(Op); const auto SrcSizeBits = IR::OpSizeAsBits(SrcSize); const auto Amount = Op->Src[1].Literal() & (SrcSizeBits - 1); const auto GPRSize = GetGPROpSize(); const auto DoRotation = Amount != 0 && Amount < SrcSizeBits; const auto IsSameGPR = Op->Src[0].IsGPR() && Op->Dest.IsGPR() && Op->Src[0].Data.GPR.GPR == Op->Dest.Data.GPR.GPR; const auto SrcSizeIsGPRSize = SrcSize == GPRSize; // If we don't need to rotate and our source is the same as the destination // then we don't need to do anything at all. We still need to be careful, // since 32-bit operations on 64-bit mode still need to zero-extend the // destination register. So also compare source size and GPR size. // // Very unlikely, but hey, we can do nothing faster. if (!DoRotation && IsSameGPR && SrcSizeIsGPRSize) [[unlikely]] { return; } auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Result = Src; if (DoRotation) [[likely]] { Result = _Ror(OpSizeFromSrc(Op), Src, _InlineConstant(Amount)); } StoreResultGPR(Op, Result); } void OpDispatchBuilder::MULX(OpcodeArgs) { // RDX is the implied source operand in the instruction const auto OpSize = OpSizeFromSrc(Op); // Src1 can be a memory operand, so ensure we constrain to the // absolute width of the access in that scenario. const auto GPRSize = GetGPROpSize(); const auto Src1Size = Op->Src[1].IsGPR() ? 
GPRSize : OpSize; Ref Src1 = LoadSourceGPR_WithOpSize(Op, Op->Src[1], Src1Size, Op->Flags); Ref Src2 = LoadGPRRegister(X86State::REG_RDX, GPRSize); // As per the Intel Software Development Manual, if the destination and // first operand correspond to the same register, then the result // will be the high half of the multiplication result. if (Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) { Ref ResultHi = _UMulH(OpSize, Src1, Src2); StoreResultGPR(Op, Op->Dest, ResultHi); } else { Ref ResultLo = _UMul(OpSize, Src1, Src2); Ref ResultHi = _UMulH(OpSize, Src1, Src2); StoreResultGPR(Op, Op->Src[0], ResultLo); StoreResultGPR(Op, Op->Dest, ResultHi); } } void OpDispatchBuilder::PDEP(OpcodeArgs) { LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Input = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Mask = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask); StoreResultGPR(Op, Op->Dest, Result); } void OpDispatchBuilder::PEXT(OpcodeArgs) { LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed"); auto* Input = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Mask = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); auto Result = _PExt(OpSizeFromSrc(Op), Input, Mask); StoreResultGPR(Op, Op->Dest, Result); } void OpDispatchBuilder::ADXOp(OpcodeArgs) { const auto OpSize = OpSizeFromSrc(Op); // Only 32/64-bit anyway so allow garbage, we use 32-bit ops. auto* Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Before = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); // Handles ADCX and ADOX const bool IsADCX = Op->OP == 0x1F6; auto Zero = Constant(0); // Before we go trashing NZCV, save the current NZCV state. Ref OldNZCV = GetNZCV(); // We want to use arm64 adc. For ADOX, copy the overflow flag into CF. For // ADCX, we just rectify the carry. 
if (IsADCX) { RectifyCarryInvert(false); } else { // If overflow, 0 - 0 sets carry. Else, forces carry to 0. _CondSubNZCV(OpSize::i32Bit, Zero, Zero, CondClass::FU, 0x0 /* nzcv */); } // Do the actual add. HandleNZCV_RMW(); auto Result = _AdcWithFlags(OpSize, Src, Before); StoreResultGPR(Op, Result); // Now restore all flags except the one we're updating. if (CTX->HostFeatures.SupportsFlagM) { // For ADOX, we need to copy the new carry into the overflow flag. If carry is clear (ULT with uninverted // carry), 0 - 0 clears overflow. Else, force overflow on. if (!IsADCX) { _CondSubNZCV(OpSize::i32Bit, Zero, Zero, CondClass::ULT, 0x1 /* nzcV */); } _RmifNZCV(OldNZCV, 28, IsADCX ? 0xd /* NzcV */ : 0xe /* NZCv */); } else { // For either operation, insert the new flag into the old NZCV. bool SavedCFInvert = CFInverted; CFInverted = false; Ref OutputCF = GetRFLAG(X86State::RFLAG_CF_RAW_LOC, IsADCX); CFInverted = IsADCX ? true : SavedCFInvert; Ref NewNZCV = _Bfi(OpSize::i32Bit, 1, IsADCX ? 29 : 28, OldNZCV, OutputCF); SetNZCV(NewNZCV); } } void OpDispatchBuilder::RCROp1Bit(OpcodeArgs) { // Calculate flags early. CalculateDeferredFlags(); // We expliclty mask for <32-bit so allow garbage Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); const auto Size = GetSrcBitSize(Op); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); Ref Res; // Our new CF will be bit 0 of the source. Set upfront to avoid a move. 
SetCFDirect(Dest, 0, true); uint32_t Shift = 1; if (Size == 32 || Size == 64) { // Rotate and insert CF in the upper bit Res = _Extr(OpSizeFromSrc(Op), CF, Dest, Shift); } else { // Res = Src >> Shift Res = _Bfe(OpSize::i32Bit, Size - Shift, Shift, Dest); // inject the CF Res = _Orlshl(OpSize::i32Bit, Res, CF, Size - Shift); } StoreResultGPR(Op, Res); // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise SetRFLAG(_XorShift(OpSize::i64Bit, Res, Res, ShiftType::LSR, 1), Size - 2, true); } void OpDispatchBuilder::RCROp8x1Bit(OpcodeArgs) { // Calculate flags early. CalculateDeferredFlags(); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); const auto SizeBit = GetSrcBitSize(Op); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Our new CF will be bit (Shift - 1) of the source SetCFDirect(Dest, 0, true); // Rotate and insert CF in the upper bit Ref Res = _Bfe(OpSize::i32Bit, 7, 1, Dest); Res = _Bfi(OpSize::i32Bit, 1, 7, Res, CF); StoreResultGPR(Op, Res); // OF is the top two MSBs XOR'd together SetRFLAG(_XorShift(OpSize::i32Bit, Res, Res, ShiftType::LSR, 1), SizeBit - 2, true); } void OpDispatchBuilder::RCROp(OpcodeArgs) { const auto Size = GetSrcBitSize(Op); if (Size == 8 || Size == 16) { RCRSmallerOp(Op); return; } const auto Mask = (Size == 64) ? 0x3F : 0x1F; // Calculate flags early. CalculateDeferredFlags(); const auto OpSize = OpSizeFromSrc(Op); Ref Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); uint64_t Const; if (IsValueConstant(WrapNode(Src), &Const)) { Const &= Mask; if (!Const) { ZeroShiftResult(Op); return; } Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); // Res = Src >> Shift Ref Res = _Lshr(OpSize, Dest, Src); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Constant folded version of the above, with fused shifts. if (Const > 1) { Res = _Orlshl(OpSize, Res, Dest, Size + 1 - Const); } // Our new CF will be bit (Shift - 1) of the source. 
SetCFDirect(Dest, Const - 1, true); // Since shift != 0 we can inject the CF Res = _Orlshl(OpSize, Res, CF, Size - Const); // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise if (Const == 1) { auto Xor = _XorShift(OpSize, Res, Res, ShiftType::LSR, 1); SetRFLAG(Xor, Size - 2, true); } StoreResultGPR(Op, Res); return; } Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask)); Calculate_ShiftVariable( Op, SrcMasked, [this, Op, Size, OpSize]() { // Rematerialize loads to avoid crossblock liveness Ref Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); // Res = Src >> Shift Ref Res = _Lshr(OpSize, Dest, Src); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Res |= (Dest << (Size - Shift + 1)); // Expressed as Res | ((Src << (Size - Shift)) << 1) to get correct // behaviour for Shift without clobbering NZCV. Then observe that modulo // Size, Size - Shift = -Shift so we can use a simple Neg. // // The masking of Lshl means we don't need mask the source, since: // // -(x & Mask) & Mask = (-x) & Mask Ref NegSrc = _Neg(OpSize, Src); Res = _Orlshl(OpSize, Res, _Lshl(OpSize, Dest, NegSrc), 1); // Our new CF will be bit (Shift - 1) of the source. this is hoisted up to // avoid the need to copy the source. Again, the Lshr absorbs the masking. auto NewCF = _Lshr(OpSize, Dest, Sub(OpSize, Src, 1)); SetCFDirect(NewCF, 0, true); // Since shift != 0 we can inject the CF Res = _Or(OpSize, Res, _Lshl(OpSize, CF, NegSrc)); // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise auto Xor = _XorShift(OpSize, Res, Res, ShiftType::LSR, 1); SetRFLAG(Xor, Size - 2, true); StoreResultGPR(Op, Res); }, OpSizeFromSrc(Op) == OpSize::i32Bit ? 
std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); } void OpDispatchBuilder::RCRSmallerOp(OpcodeArgs) { CalculateDeferredFlags(); const auto Size = GetSrcBitSize(Op); // x86 masks the shift by 0x3F or 0x1F depending on size of op auto Src = ARef(LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true})); Src = Src.And(0x1F); // CF only changes if we actually shifted. OF undefined if we didn't shift. // The result is unchanged if we didn't shift. So branch over the whole thing. Calculate_ShiftVariable(Op, Src.Ref(), [this, Op, Size]() { // Rematerialized to avoid crossblock liveness auto Src = ARef(LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true})); Src = Src.And(0x1F); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); Ref Tmp {}; // Insert the incoming value across the temporary 64bit source // Make sure to insert at + 1 offsets // We need to cover 32bits plus the amount that could rotate in if (Size == 8) { // 8-bit optimal cascade // Cascade: 0 // Data: -> [7:0] // CF: -> [8:8] // Cascade: 1 // Data: -> [16:9] // CF: -> [17:17] // Cascade: 2 // Data: -> [25:18] // CF: -> [26:26] // Cascade: 3 // Data: -> [34:27] // CF: -> [35:35] // Cascade: 4 // Data: -> [43:36] // CF: -> [44:44] // Insert CF, Destination already at [7:0] Tmp = _Bfi(OpSize::i64Bit, 1, 8, Dest, CF); // First Cascade, copies 9 bits from itself. Tmp = _Bfi(OpSize::i64Bit, 9, 9, Tmp, Tmp); // Second cascade, copies 18 bits from itself. Tmp = _Bfi(OpSize::i64Bit, 18, 18, Tmp, Tmp); // Final cascade, copies 9 bits again from itself. 
Tmp = _Bfi(OpSize::i64Bit, 9, 36, Tmp, Tmp); } else { // 16-bit optimal cascade // Cascade: 0 // Data: -> [15:0] // CF: -> [16:16] // Cascade: 1 // Data: -> [32:17] // CF: -> [33:33] // Cascade: 2 // Data: -> [49:34] // CF: -> [50:50] // Insert CF, Destination already at [15:0] Tmp = _Bfi(OpSize::i64Bit, 1, 16, Dest, CF); // First Cascade, copies 17 bits from itself. Tmp = _Bfi(OpSize::i64Bit, 17, 17, Tmp, Tmp); // Final Cascade, copies 17 bits from itself again. Tmp = _Bfi(OpSize::i64Bit, 17, 34, Tmp, Tmp); } // Entire bitfield has been setup. Just extract the 8 or 16bits we need. // 64-bit shift used because we want to rotate in our cascaded upper bits // rather than zeroes. Ref Res = _Lshr(OpSize::i64Bit, Tmp, Src.Ref()); StoreResultGPR(Op, Res); // Our new CF will be bit (Shift - 1) of the source. 32-bit Lshr masks the // same as x86, but if we constant fold we must mask ourselves. if (Src.IsConstant) { SetCFDirect(Tmp, (Src.C & 0x1f) - 1, true); } else { auto NewCF = _Lshr(OpSize::i32Bit, Tmp, Sub(OpSize::i32Bit, Src.Ref(), 1)); SetCFDirect(NewCF, 0, true); } // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise if (!Src.IsConstant || Src.C == 1) { auto NewOF = _XorShift(OpSize::i32Bit, Res, Res, ShiftType::LSR, 1); SetRFLAG(NewOF, Size - 2, true); } }); } void OpDispatchBuilder::RCLOp1Bit(OpcodeArgs) { // Calculate flags early. CalculateDeferredFlags(); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); const auto Size = GetSrcBitSize(Op); const auto OpSize = Size == 64 ? 
OpSize::i64Bit : OpSize::i32Bit; auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Rotate left and insert CF in to lowest bit // TODO: Use `adc Res, xzr, Dest, lsl 1` to save an instruction Ref Res = _Orlshl(OpSize, CF, Dest, 1); // Our new CF will be the top bit of the source SetCFDirect(Dest, Size - 1, true); // OF is the top two MSBs XOR'd together // Top two MSBs is CF and top bit of result SetRFLAG(_Xor(OpSize, Res, Dest), Size - 1, true); StoreResultGPR(Op, Res); } void OpDispatchBuilder::RCLOp(OpcodeArgs) { const auto Size = GetSrcBitSize(Op); if (Size == 8 || Size == 16) { RCLSmallerOp(Op); return; } const auto Mask = (Size == 64) ? 0x3F : 0x1F; // Calculate flags early. CalculateDeferredFlags(); Ref Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); const auto OpSize = OpSizeFromSrc(Op); uint64_t Const; if (IsValueConstant(WrapNode(Src), &Const)) { Const &= Mask; if (!Const) { ZeroShiftResult(Op); return; } // Res = Src << Shift Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); Ref Res = _Lshl(OpSize, Dest, Src); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Res |= (Src << (Size - Shift + 1)); if (Const > 1) { Res = _Orlshr(OpSize, Res, Dest, Size + 1 - Const); } // Our new CF will be bit (Shift - 1) of the source SetCFDirect(Dest, Size - Const, true); // Since Shift != 0 we can inject the CF Res = _Orlshl(OpSize, Res, CF, Const - 1); // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise if (Const == 1) { auto NewOF = _Xor(OpSize, Res, Dest); SetRFLAG(NewOF, Size - 1, true); } StoreResultGPR(Op, Res); return; } Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask)); Calculate_ShiftVariable( Op, SrcMasked, [this, Op, Size, OpSize]() { // Rematerialized to avoid crossblock liveness Ref Src = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); // Res = Src << Shift Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, 
{.AllowUpperGarbage = true}); Ref Res = _Lshl(OpSize, Dest, Src); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); // Res |= (Dest >> (Size - Shift + 1)), expressed as // Res | ((Dest >> (-Shift)) >> 1), since Size - Shift = -Shift mod // Size. The shift aborbs the masking. auto NegSrc = _Neg(OpSize, Src); Res = _Orlshr(OpSize, Res, _Lshr(OpSize, Dest, NegSrc), 1); // Our new CF will be bit (Shift - 1) of the source auto NewCF = _Lshr(OpSize, Dest, NegSrc); SetCFDirect(NewCF, 0, true); // Since Shift != 0 we can inject the CF. Shift absorbs the masking. Ref CFShl = Sub(OpSize, Src, 1); auto TmpCF = _Lshl(OpSize, CF, CFShl); Res = _Or(OpSize, Res, TmpCF); // OF is the top two MSBs XOR'd together // Only when Shift == 1, it is undefined otherwise // // Note that NewCF has garbage in the upper bits, but we ignore them here // and mask as part of the set after. auto NewOF = _XorShift(OpSize, Res, NewCF, ShiftType::LSL, Size - 1); SetRFLAG(NewOF, Size - 1, true); StoreResultGPR(Op, Res); }, OpSizeFromSrc(Op) == OpSize::i32Bit ? std::make_optional(&OpDispatchBuilder::ZeroShiftResult) : std::nullopt); } void OpDispatchBuilder::RCLSmallerOp(OpcodeArgs) { CalculateDeferredFlags(); const auto Size = GetSrcBitSize(Op); // x86 masks the shift by 0x3F or 0x1F depending on size of op auto Src = ARef(LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true})); Src = Src.And(0x1F); // CF only changes if we actually shifted. OF undefined if we didn't shift. // The result is unchanged if we didn't shift. So branch over the whole thing. 
Calculate_ShiftVariable(Op, Src.Ref(), [this, Op, Size]() { // Rematerialized to avoid crossblock liveness auto Src = ARef(LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true})); Src = Src.And(0x1F); Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); Ref Tmp = Constant(0); for (size_t i = 0; i < (32 + Size + 1); i += (Size + 1)) { // Insert incoming value Tmp = _Bfi(OpSize::i64Bit, Size, 63 - i - Size, Tmp, Dest); // Insert CF Tmp = _Bfi(OpSize::i64Bit, 1, 63 - i, Tmp, CF); } // Insert incoming value Tmp = _Bfi(OpSize::i64Bit, Size, 0, Tmp, Dest); // The data is now set up like this // [Data][CF]:[Data][CF]:[Data][CF]:[Data][CF] // Shift 1 more bit that expected to get our result // Shifting to the right will now behave like a rotate to the left // Which we emulate with a _Ror Ref Res = _Ror(OpSize::i64Bit, Tmp, Src.Neg().Ref()); StoreResultGPR(Op, Res); // Our new CF is now at the bit position that we are shifting // Either 0 if CF hasn't changed (CF is living in bit 0) // or higher auto NewCF = _Ror(OpSize::i64Bit, Tmp, Src.Presub(63).Ref()); SetCFDirect(NewCF, 0, true); // OF is the XOR of the NewCF and the MSB of the result // Only defined for 1-bit rotates. if (!Src.IsConstant || Src.C == 1) { auto NewOF = _XorShift(OpSize::i64Bit, NewCF, Res, ShiftType::LSR, Size - 1); SetRFLAG(NewOF, 0, true); } }); } void OpDispatchBuilder::BTOp(OpcodeArgs, uint32_t SrcIndex, BTAction Action) { Ref Value; ArithRef Src; bool IsNonconstant = Op->Src[SrcIndex].IsGPR(); const uint32_t Size = GetDstBitSize(Op); const uint32_t Mask = Size - 1; if (IsNonconstant) { // Because we mask explicitly with And/Bfe/Sbfe after, we can allow garbage here. 
Src = ARef(LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.AllowUpperGarbage = true})); } else { // Can only be an immediate // Masked by operand size Src = ARef(Op->Src[SrcIndex].Literal() & Mask); } if (Op->Dest.IsGPR()) { // When the destination is a GPR, we don't care about garbage in the upper bits. // Load the full register. auto Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GetGPROpSize(), Op->Flags); Value = Dest; // Get the bit selection from the src. We need to mask for 8/16-bit, but // rely on the implicit masking of Lshr for native sizes. unsigned LshrSize = std::max(IR::OpSizeToSize(OpSize::i32Bit), Size / 8); auto BitSelect = (Size == (LshrSize * 8)) ? Src : Src.And(Mask); auto LshrOpSize = IR::SizeToOpSize(LshrSize); // OF/SF/AF/PF undefined. ZF must be preserved. We choose to preserve OF/SF // too since we just use an rmif to insert into CF directly. We could // optimize perhaps. // // Set CF before the action to save a move, except for complements where we // can reuse the invert. if (Action != BTAction::BTComplement) { if (IsNonconstant) { Value = _Lshr(IR::SizeToOpSize(LshrSize), Value, BitSelect.Ref()); } SetRFLAG(Value, X86State::RFLAG_CF_RAW_LOC, Src.IsConstant ? Src.C : 0, true); CFInverted = false; } switch (Action) { case BTAction::BTNone: { /* Nothing to do */ break; } case BTAction::BTClear: { Dest = _Andn(LshrOpSize, Dest, BitSelect.MaskBit(LshrOpSize).Ref()); StoreResultGPR(Op, Dest); break; } case BTAction::BTSet: { Dest = _Or(LshrOpSize, Dest, BitSelect.MaskBit(LshrOpSize).Ref()); StoreResultGPR(Op, Dest); break; } case BTAction::BTComplement: { Dest = _Xor(LshrOpSize, Dest, BitSelect.MaskBit(LshrOpSize).Ref()); if (IsNonconstant) { Value = _Lshr(LshrOpSize, Dest, BitSelect.Ref()); } else { Value = Dest; } SetRFLAG(Value, X86State::RFLAG_CF_RAW_LOC, Src.IsConstant ? 
Src.C : 0, true); CFInverted = true; StoreResultGPR(Op, Dest); break; } } } else { // Load the address to the memory location Ref Dest = MakeSegmentAddress(Op, Op->Dest); // Get the bit selection from the src auto BitSelect = Src.Bfe(0, 3); // Address is provided as bits we want BYTE offsets // Extract Signed offset Src = Src.Sbfe(3, Size - 3); // Get the address offset by shifting out the size of the op (To shift out the bit selection) // Then use that to index in to the memory location by size of op AddressMode Address = {.Base = Dest, .Index = Src.Ref(), .AddrSize = OpSize::i64Bit}; switch (Action) { case BTAction::BTNone: { Value = _LoadMemGPRAutoTSO(OpSize::i8Bit, Address, OpSize::i8Bit); break; } case BTAction::BTClear: { Ref BitMask = BitSelect.MaskBit(OpSize::i64Bit).Ref(); if (DestIsLockedMem(Op)) { HandledLock = true; Value = _AtomicFetchCLR(OpSize::i8Bit, BitMask, LoadEffectiveAddress(this, Address, GetGPROpSize(), true)); } else { Value = _LoadMemGPRAutoTSO(OpSize::i8Bit, Address, OpSize::i8Bit); auto Modified = _Andn(OpSize::i64Bit, Value, BitMask); _StoreMemGPRAutoTSO(OpSize::i8Bit, Address, Modified, OpSize::i8Bit); } break; } case BTAction::BTSet: { Ref BitMask = BitSelect.MaskBit(OpSize::i64Bit).Ref(); if (DestIsLockedMem(Op)) { HandledLock = true; Value = _AtomicFetchOr(OpSize::i8Bit, BitMask, LoadEffectiveAddress(this, Address, GetGPROpSize(), true)); } else { Value = _LoadMemGPRAutoTSO(OpSize::i8Bit, Address, OpSize::i8Bit); auto Modified = _Or(OpSize::i64Bit, Value, BitMask); _StoreMemGPRAutoTSO(OpSize::i8Bit, Address, Modified, OpSize::i8Bit); } break; } case BTAction::BTComplement: { Ref BitMask = BitSelect.MaskBit(OpSize::i64Bit).Ref(); if (DestIsLockedMem(Op)) { HandledLock = true; Value = _AtomicFetchXor(OpSize::i8Bit, BitMask, LoadEffectiveAddress(this, Address, GetGPROpSize(), true)); } else { Value = _LoadMemGPRAutoTSO(OpSize::i8Bit, Address, OpSize::i8Bit); auto Modified = _Xor(OpSize::i64Bit, Value, BitMask); 
_StoreMemGPRAutoTSO(OpSize::i8Bit, Address, Modified, OpSize::i8Bit);
      }
      break;
    }
    }

    // Now shift in to the correct bit location
    if (!BitSelect.IsDefinitelyZero()) {
      Value = _Lshr(std::max(OpSize::i32Bit, GetOpSize(Value)), Value, BitSelect.Ref());
    }

    // OF/SF/ZF/AF/PF undefined.
    SetCFDirect(Value, 0, true);
  }
}

// Signed multiply of the destination operand by Src[0].
// The low product is written back with StoreResultGPR; the low product and the
// sign-extended high half are handed to CalculateFlags_MUL.
void OpDispatchBuilder::IMUL1SrcOp(OpcodeArgs) {
  /* We're just going to sign-extend the non-garbage anyway.. */
  Ref Src1 = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
  Ref Src2 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

  const auto Size = OpSizeFromSrc(Op);
  const auto SizeBits = IR::OpSizeAsBits(Size);

  Ref Dest {};
  Ref ResultHigh {};
  switch (Size) {
  case OpSize::i8Bit:
  case OpSize::i16Bit: {
    // Sign-extend both inputs to 64-bit and multiply once; the bits
    // [SizeBits, 2 * SizeBits) of that product are the high half.
    Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
    Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
    Dest = _Mul(OpSize::i64Bit, Src1, Src2);
    ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
    break;
  }
  case OpSize::i32Bit: {
    // The high half is extracted from bits [32, 64) of the _SMull result.
    ResultHigh = _SMull(Src1, Src2);
    ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
    // Flipped order to save a move
    Dest = _Mul(OpSize::i32Bit, Src1, Src2);
    break;
  }
  case OpSize::i64Bit: {
    ResultHigh = _MulH(OpSize::i64Bit, Src1, Src2);
    // Flipped order to save a move
    Dest = _Mul(OpSize::i64Bit, Src1, Src2);
    break;
  }
  default: FEX_UNREACHABLE;
  }

  StoreResultGPR(Op, Dest);
  CalculateFlags_MUL(Size, Dest, ResultHigh);
}

// Signed multiply of Src[0] by Src[1], written to the destination operand.
// Same structure as IMUL1SrcOp, except that both factors come from the source
// operands and the 8/16-bit path sign-extends Src2 through the ARef helper.
void OpDispatchBuilder::IMUL2SrcOp(OpcodeArgs) {
  Ref Src1 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
  Ref Src2 = LoadSourceGPR(Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});

  const auto Size = OpSizeFromSrc(Op);
  const auto SizeBits = IR::OpSizeAsBits(Size);

  Ref Dest {};
  Ref ResultHigh {};
  switch (Size) {
  case OpSize::i8Bit:
  case OpSize::i16Bit: {
    Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
    Src2 = ARef(Src2).Sbfe(0, SizeBits).Ref();
    Dest = _Mul(OpSize::i64Bit, Src1, Src2);
    ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, Dest);
    break;
  }
  case OpSize::i32Bit: {
    ResultHigh = _SMull(Src1, Src2);
    ResultHigh = _Sbfe(OpSize::i64Bit, SizeBits, SizeBits, ResultHigh);
    // Flipped order to save a move
    Dest = _Mul(OpSize::i32Bit, Src1, Src2);
    break;
  }
  case OpSize::i64Bit: {
    ResultHigh = _MulH(OpSize::i64Bit, Src1, Src2);
    // Flipped order to save a move
    Dest = _Mul(OpSize::i64Bit, Src1, Src2);
    break;
  }
  default: FEX_UNREACHABLE;
  }

  StoreResultGPR(Op, Dest);
  CalculateFlags_MUL(Size, Dest, ResultHigh);
}

// One-operand signed multiply: the operand in Op->Dest times the accumulator (RAX).
// Result placement by operand size:
//   8-bit  -> 16-bit product in AX
//   16-bit -> low half in AX, high half in DX
//   32-bit -> low half in EAX, high half in EDX (both zero-extended)
//   64-bit -> low half in RAX, high half in RDX (64-bit mode only)
void OpDispatchBuilder::IMULOp(OpcodeArgs) {
  const auto Size = OpSizeFromSrc(Op);
  const auto SizeBits = IR::OpSizeAsBits(Size);

  Ref Src1 = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
  Ref Src2 = LoadGPRRegister(X86State::REG_RAX);

  if (Size != OpSize::i64Bit) {
    Src1 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src1);
    Src2 = _Sbfe(OpSize::i64Bit, SizeBits, 0, Src2);
  }

  // 64-bit special cased to save a move
  Ref Result {};
  if (Size < OpSize::i64Bit) {
    Result = _Mul(OpSize::i64Bit, Src1, Src2);
  }

  Ref ResultHigh {};
  if (Size == OpSize::i8Bit) {
    // Result is stored in AX
    StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit);
    ResultHigh = _Sbfe(OpSize::i64Bit, 8, 8, Result);
  } else if (Size == OpSize::i16Bit) {
    // 16bits stored in AX
    // 16bits stored in DX
    StoreGPRRegister(X86State::REG_RAX, Result, Size);
    ResultHigh = _Sbfe(OpSize::i64Bit, 16, 16, Result);
    StoreGPRRegister(X86State::REG_RDX, ResultHigh, Size);
  } else if (Size == OpSize::i32Bit) {
    // 32bits stored in EAX
    // 32bits stored in EDX
    // Make sure they get Zext correctly
    auto LocalResult = _Bfe(OpSize::i64Bit, 32, 0, Result);
    auto LocalResultHigh = _Bfe(OpSize::i64Bit, 32, 32, Result);
    // The sign-extended copies are only used for the flag calculation below;
    // the registers receive the zero-extended halves.
    ResultHigh = _Sbfe(OpSize::i64Bit, 32, 32, Result);
    Result = _Sbfe(OpSize::i64Bit, 32, 0, Result);
    StoreGPRRegister(X86State::REG_RAX, LocalResult);
    StoreGPRRegister(X86State::REG_RDX, LocalResultHigh);
  } else if (Size == OpSize::i64Bit) {
    if (!Is64BitMode) {
      LogMan::Msg::EFmt("Doesn't exist in 32bit mode");
      DecodeFailure = true;
      return;
    }
    // 64bits stored in RAX
    // 64bits stored in RDX
    ResultHigh = _MulH(OpSize::i64Bit, Src1, Src2);
    Result = _Mul(OpSize::i64Bit, Src1, Src2);
    StoreGPRRegister(X86State::REG_RAX, Result);
    StoreGPRRegister(X86State::REG_RDX, ResultHigh);
  }

  CalculateFlags_MUL(Size, Result, ResultHigh);
}

// One-operand unsigned multiply: the operand in Op->Dest times the accumulator (RAX).
// Inputs narrower than 64-bit are zero-extended (_Bfe) and multiplied once as 64-bit;
// the result halves are placed in the same registers as in IMULOp.
void OpDispatchBuilder::MULOp(OpcodeArgs) {
  const auto Size = OpSizeFromSrc(Op);
  const auto SizeBits = IR::OpSizeAsBits(Size);

  Ref Src1 = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true});
  Ref Src2 = LoadGPRRegister(X86State::REG_RAX);
  Ref Result {};

  if (Size != OpSize::i64Bit) {
    Src1 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src1);
    Src2 = _Bfe(OpSize::i64Bit, SizeBits, 0, Src2);
    Result = _UMul(OpSize::i64Bit, Src1, Src2);
  }

  Ref ResultHigh {};
  if (Size == OpSize::i8Bit) {
    // Result is stored in AX
    StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit);
    ResultHigh = _Bfe(OpSize::i64Bit, 8, 8, Result);
  } else if (Size == OpSize::i16Bit) {
    // 16bits stored in AX
    // 16bits stored in DX
    StoreGPRRegister(X86State::REG_RAX, Result, Size);
    ResultHigh = _Bfe(OpSize::i64Bit, 16, 16, Result);
    StoreGPRRegister(X86State::REG_RDX, ResultHigh, Size);
  } else if (Size == OpSize::i32Bit) {
    // 32bits stored in EAX
    // 32bits stored in EDX
    Ref ResultLow = _Bfe(OpSize::i64Bit, 32, 0, Result);
    ResultHigh = _Bfe(OpSize::i64Bit, 32, 32, Result);
    StoreGPRRegister(X86State::REG_RAX, ResultLow);
    StoreGPRRegister(X86State::REG_RDX, ResultHigh);
  } else if (Size == OpSize::i64Bit) {
    if (!Is64BitMode) {
      LogMan::Msg::EFmt("Doesn't exist in 32bit mode");
      DecodeFailure = true;
      return;
    }
    // 64bits stored in RAX
    // 64bits stored in RDX
    //
    // Calculate high first to allow better RA.
ResultHigh = _UMulH(OpSize::i64Bit, Src1, Src2);
    Result = _UMul(OpSize::i64Bit, Src1, Src2);
    StoreGPRRegister(X86State::REG_RAX, Result);
    StoreGPRRegister(X86State::REG_RDX, ResultHigh);
  }

  CalculateFlags_UMUL(ResultHigh);
}

// Bitwise NOT of the destination operand. No flag helpers are called here.
// Three paths:
//   - locked memory destination: atomic XOR with an all-ones mask of the operand size
//   - other non-GPR destination: load, XOR with the mask, store
//   - GPR destination: invert in place at full GPR width
void OpDispatchBuilder::NOTOp(OpcodeArgs) {
  const auto Size = OpSizeFromSrc(Op);
  const auto SizeBits = IR::OpSizeAsBits(Size);
  LOGMAN_THROW_A_FMT(Size >= IR::OpSize::i8Bit && Size <= IR::OpSize::i64Bit, "Invalid size");

  // All-ones mask of the operand width. 64-bit is special-cased because
  // (1ULL << 64) would be an out-of-range shift.
  Ref MaskConst {};
  if (Size == OpSize::i64Bit) {
    MaskConst = Constant(~0ULL);
  } else {
    MaskConst = Constant((1ULL << SizeBits) - 1);
  }

  if (DestIsLockedMem(Op)) {
    HandledLock = true;
    Ref DestMem = MakeSegmentAddress(Op, Op->Dest);
    // Result unused
    _AtomicFetchXor(Size, MaskConst, DestMem);
  } else if (!Op->Dest.IsGPR()) {
    // GPR version plays fast and loose with sizes, be safe for memory tho.
    Ref Src = LoadSourceGPR(Op, Op->Dest, Op->Flags);
    Src = _Xor(OpSize::i64Bit, Src, MaskConst);
    StoreResultGPR(Op, Src);
  } else {
    // Specially handle high bits so we can invert in place with the correct
    // mask and a larger type.
    auto Dest = Op->Dest;
    if (Dest.Data.GPR.HighBits) {
      LOGMAN_THROW_A_FMT(Size == OpSize::i8Bit, "Only 8-bit GPRs get high bits");
      // Invert bits [8, 16) of the full register instead of addressing the high byte.
      MaskConst = Constant(0xFF00);
      Dest.Data.GPR.HighBits = false;
    }

    // Always load full size, we explicitly want the upper bits to get the
    // insert behaviour for free/implicitly.
    const auto GPRSize = GetGPROpSize();
    Ref Src = LoadSourceGPR_WithOpSize(Op, Dest, GPRSize, Op->Flags);

    // For 8/16-bit, use 64-bit invert so we invert in place, while getting
    // insert behaviour. For 32-bit, use 32-bit invert to zero the upper bits.
    const auto EffectiveSize = Size == OpSize::i32Bit ? OpSize::i32Bit : GPRSize;

    // If we're inverting the whole thing, use Not instead of Xor to save a constant.
    if (Size >= OpSize::i32Bit) {
      Src = _Not(EffectiveSize, Src);
    } else {
      Src = _Xor(EffectiveSize, Src, MaskConst);
    }

    // Always store 64-bit, the Not/Xor correctly handle the upper bits and this
    // way we can delete the store.
    StoreResultGPR_WithOpSize(Op, Dest, Src, GPRSize);
  }
}

// Exchange-and-add: the destination receives Dest + Src[0] and Src[0] receives the
// previous destination value. Flags come from CalculateFlags_ADD.
// Memory destinations always go through _AtomicFetchAdd; HandledLock is set only
// when the instruction carries FLAG_LOCK.
void OpDispatchBuilder::XADDOp(OpcodeArgs) {
  // LoadData = false: for a memory destination this yields the address, not the value.
  Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false});
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags);

  Ref Result;
  if (Op->Dest.IsGPR()) {
    // If this is a GPR then we can just do an Add
    Result = CalculateFlags_ADD(OpSizeFromSrc(Op), Dest, Src);

    // Previous value in dest gets stored in src
    StoreResultGPR(Op, Op->Src[0], Dest);

    // Calculated value gets stored in dst (order is important if dst is same as src)
    StoreResultGPR(Op, Result);
  } else {
    HandledLock = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK;
    Dest = AppendSegmentOffset(Dest, Op->Flags);
    auto Before = _AtomicFetchAdd(OpSizeFromSrc(Op), Src, Dest);
    CalculateFlags_ADD(OpSizeFromSrc(Op), Before, Src);
    StoreResultGPR(Op, Op->Src[0], Before);
  }
}

// Population count of Src[0], stored to the destination operand.
void OpDispatchBuilder::PopcountOp(OpcodeArgs) {
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = CTX->HostFeatures.SupportsCSSC || GetSrcSize(Op) >= 4});
  Src = _Popcount(OpSizeFromSrc(Op), Src);
  StoreResultGPR(Op, Src);

  // We need to set ZF while clearing the rest of NZCV. The result of a popcount
  // is in the range [0, 64]. In particular, it is never negative. So a
  // combined NZ test will correctly zero SF/CF/OF while setting ZF.
SetNZ_ZeroCV(OpSize::i32Bit, Src);
  ZeroPF_AF();
}

// Shared helper for the decimal/ASCII adjust instructions.
// Returns the current AF (LoadAF) ORed with a 0/1 value that is set when the
// low nibble of A is greater than 9.
Ref OpDispatchBuilder::CalculateAFForDecimal(Ref A) {
  auto Nibble = _And(OpSize::i64Bit, A, Constant(0xF));
  auto Greater = Select01(OpSize::i64Bit, CondClass::UGT, Nibble, Constant(9));

  return _Or(OpSize::i64Bit, LoadAF(), Greater);
}

// Decimal adjust AL after addition.
// The carry is tracked in inverted form (CFInv, written back with SetCFInverted),
// so "CF |= cond" is expressed as "CFInv &= !cond" and "CF set" is "CFInv == 0".
void OpDispatchBuilder::DAAOp(OpcodeArgs) {
  CalculateDeferredFlags();
  auto AL = LoadGPRRegister(X86State::REG_RAX, OpSize::i8Bit);
  auto CFInv = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC, true);
  auto AF = CalculateAFForDecimal(AL);

  // CF |= (AL > 0x99);
  CFInv = _And(OpSize::i64Bit, CFInv, Select01(OpSize::i64Bit, CondClass::ULE, AL, Constant(0x99)));

  // AL = AF ? (AL + 0x6) : AL;
  AL = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::NEQ, AF, Constant(0), Add(OpSize::i64Bit, AL, 0x6), AL);

  // AL = CF ? (AL + 0x60) : AL;
  AL = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, CFInv, Constant(0), Add(OpSize::i64Bit, AL, 0x60), AL);

  // SF, ZF, PF set according to result. CF set per above. OF undefined.
  StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
  SetNZ_ZeroCV(OpSize::i8Bit, AL);
  SetCFInverted(CFInv);
  CalculatePF(AL);
  SetAFAndFixup(AF);
}

// Decimal adjust AL after subtraction.
// Unlike DAAOp the carry is kept in direct (non-inverted) form here.
void OpDispatchBuilder::DASOp(OpcodeArgs) {
  CalculateDeferredFlags();
  auto AL = LoadGPRRegister(X86State::REG_RAX, OpSize::i8Bit);
  auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);
  auto AF = CalculateAFForDecimal(AL);

  // CF |= (AL > 0x99);
  CF = _Or(OpSize::i64Bit, CF, Select01(OpSize::i64Bit, CondClass::UGT, AL, Constant(0x99)));

  // NewCF = CF | (AF && (Borrow from AL - 6))
  // (AL < 6 is the borrow condition; when it does not hold the select yields CF, a no-op for the OR.)
  auto NewCF = _Or(OpSize::i32Bit, CF, _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::ULT, AL, Constant(6), AF, CF));

  // AL = AF ? (AL - 0x6) : AL;
  AL = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::NEQ, AF, Constant(0), Sub(OpSize::i64Bit, AL, 0x6), AL);

  // AL = CF ? (AL - 0x60) : AL;
  AL = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::NEQ, CF, Constant(0), Sub(OpSize::i64Bit, AL, 0x60), AL);

  // SF, ZF, PF set according to result. CF set per above. OF undefined.
  StoreGPRRegister(X86State::REG_RAX, AL, OpSize::i8Bit);
  SetNZ_ZeroCV(OpSize::i8Bit, AL);
  SetCFDirect(NewCF);
  CalculatePF(AL);
  SetAFAndFixup(AF);
}

// ASCII adjust AX after addition.
void OpDispatchBuilder::AAAOp(OpcodeArgs) {
  auto A = LoadGPRRegister(X86State::REG_RAX);
  auto AF = CalculateAFForDecimal(A);

  // CF = AF, OF/SF/ZF/PF undefined
  SetCFDirect_InvalidateNZV(AF);
  SetAFAndFixup(AF);
  CalculateDeferredFlags();

  // AX = CF ? (AX + 0x106) : AX
  A = NZCVSelect(OpSize::i32Bit, CondClass::UGE /* CF = 1 */, Add(OpSize::i32Bit, A, 0x106), A);

  // AL = AL & 0x0F (the 0xFF0F mask keeps AH intact)
  A = _And(OpSize::i32Bit, A, Constant(0xFF0F));
  StoreGPRRegister(X86State::REG_RAX, A, OpSize::i16Bit);
}

// ASCII adjust AX after subtraction. Mirrors AAAOp with a subtract of 0x106.
void OpDispatchBuilder::AASOp(OpcodeArgs) {
  auto A = LoadGPRRegister(X86State::REG_RAX);
  auto AF = CalculateAFForDecimal(A);

  // CF = AF, OF/SF/ZF/PF undefined
  SetCFDirect_InvalidateNZV(AF);
  SetAFAndFixup(AF);
  CalculateDeferredFlags();

  // AX = CF ? (AX - 0x106) : AX
  A = NZCVSelect(OpSize::i32Bit, CondClass::UGE /* CF = 1 */, Sub(OpSize::i32Bit, A, 0x106), A);

  // AL = AL & 0x0F (the 0xFF0F mask keeps AH intact)
  A = _And(OpSize::i32Bit, A, Constant(0xFF0F));
  StoreGPRRegister(X86State::REG_RAX, A, OpSize::i16Bit);
}

// ASCII adjust AX after multiply: divides AL by the immediate byte.
// The quotient goes to AH and the remainder to AL (written together as a 16-bit store).
void OpDispatchBuilder::AAMOp(OpcodeArgs) {
  auto AL = LoadGPRRegister(X86State::REG_RAX, OpSize::i8Bit);
  auto Imm8 = Constant(Op->Src[0].Literal() & 0xFF);
  Ref Quotient = _AllocateGPR(true);
  Ref Remainder = _AllocateGPR(true);
  _UDiv(OpSize::i64Bit, AL, Invalid(), Imm8, Quotient, Remainder);
  // Res = Remainder + (Quotient << 8)
  auto Res = _AddShift(OpSize::i64Bit, Remainder, Quotient, ShiftType::LSL, 8);
  StoreGPRRegister(X86State::REG_RAX, Res, OpSize::i16Bit);

  SetNZ_ZeroCV(OpSize::i8Bit, Res);
  CalculatePF(Res);
  InvalidateAF();
}

// ASCII adjust AX before division: AL = (AL + AH * imm8) & 0xFF, AH = 0.
void OpDispatchBuilder::AADOp(OpcodeArgs) {
  auto A = LoadGPRRegister(X86State::REG_RAX);
  // AH here is RAX >> 8 and still carries the bits above AH. Those only contribute
  // to product bits 8 and up, which the 0xFF mask below discards.
  auto AH = _Lshr(OpSize::i32Bit, A, Constant(8));
  auto Imm8 = Constant(Op->Src[0].Literal() & 0xFF);
  auto NewAL = Add(OpSize::i64Bit, A, _Mul(OpSize::i64Bit, AH, Imm8));
  auto Result = _And(OpSize::i64Bit, NewAL, Constant(0xFF));
  // 16-bit store of an 8-bit value clears AH.
  StoreGPRRegister(X86State::REG_RAX, Result, OpSize::i16Bit);

  SetNZ_ZeroCV(OpSize::i8Bit, Result);
  CalculatePF(Result);
  InvalidateAF();
}

// Table lookup: loads one byte from (segment address of RBX) + AL and stores it in AL.
void OpDispatchBuilder::XLATOp(OpcodeArgs) {
  Ref Src = MakeSegmentAddress(X86State::REG_RBX, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX);
  Ref Offset = LoadGPRRegister(X86State::REG_RAX, OpSize::i8Bit);

  AddressMode A = {.Base = Src, .Index = Offset, .AddrSize = OpSize::i64Bit};
  auto Res = _LoadMemGPRAutoTSO(OpSize::i8Bit, A, OpSize::i8Bit);

  StoreGPRRegister(X86State::REG_RAX, Res, OpSize::i8Bit);
}

// Reads the cached FS or GS value (fs_cached / gs_cached in CPUState) into the
// destination operand. Any Seg other than Segment::FS is treated as GS.
void OpDispatchBuilder::ReadSegmentReg(OpcodeArgs, OpDispatchBuilder::Segment Seg) {
  // 64-bit only
  // Doesn't hit the segment register optimization
  const auto Size = OpSizeFromSrc(Op);
  Ref Src {};
  if (Seg == Segment::FS) {
    Src = _LoadContextGPR(Size, offsetof(FEXCore::Core::CPUState, fs_cached));
  } else {
    Src = _LoadContextGPR(Size, offsetof(FEXCore::Core::CPUState, gs_cached));
  }

  StoreResultGPR(Op, Src);
}

// Writes Src[0] to the cached FS or GS value (fs_cached / gs_cached in CPUState).
// Any Seg other than Segment::FS is treated as GS.
void OpDispatchBuilder::WriteSegmentReg(OpcodeArgs, OpDispatchBuilder::Segment Seg) {
  // Documentation claims that the 32-bit version of this instruction inserts in to the lower 32-bits of the segment
  // This is incorrect and it instead zero extends the 32-bit value to 64-bit
  const auto Size = OpSizeFromDst(Op);
  Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags);
  if (Seg == Segment::FS) {
    _StoreContextGPR(Size, Src, offsetof(FEXCore::Core::CPUState, fs_cached));
  } else {
    _StoreContextGPR(Size, Src, offsetof(FEXCore::Core::CPUState, gs_cached));
  }
}

// Builds a stack frame: pushes RBP, optionally copies nested frame pointers,
// then reserves space below the new frame pointer.
void OpDispatchBuilder::EnterOp(OpcodeArgs) {
  const auto GPRSize = GetGPROpSize();
  const auto OperandSize = (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_OPERAND_SIZE) ?
OpSize::i16Bit : GPRSize;

  const uint64_t Value = Op->Src[0].Literal();

  // Low 16 bits of the literal: number of bytes to reserve below the frame pointer.
  const uint16_t AllocSpace = Value & 0xFFFF;
  // Bits [16, 21) of the literal: nesting level.
  const uint8_t Level = (Value >> 16) & 0x1F;

  // Pushes Src at the given size, writes the updated stack pointer back to RSP
  // and returns it.
  const auto PushValue = [&](IR::OpSize Size, Ref Src) -> Ref {
    auto OldSP = LoadGPRRegister(X86State::REG_RSP);
    auto NewSP = _Push(GPRSize, Size, Src, OldSP);

    // Store the new stack pointer
    StoreGPRRegister(X86State::REG_RSP, NewSP);
    return NewSP;
  };

  auto OldBP = LoadGPRRegister(X86State::REG_RBP);
  auto NewSP = PushValue(OperandSize, OldBP);
  // The stack pointer right after pushing the old RBP becomes the new frame pointer.
  auto temp_RBP = NewSP;

  if (Level > 0) {
    // Copy Level - 1 entries from below the old frame pointer, then push the
    // new frame pointer itself.
    for (uint8_t i = 1; i < Level; ++i) {
      auto MemLoc = Sub(GPRSize, OldBP, i * IR::OpSizeToSize(OperandSize));
      auto Mem = _LoadMemGPR(OperandSize, MemLoc, OperandSize);
      NewSP = PushValue(OperandSize, Mem);
    }
    NewSP = PushValue(OperandSize, temp_RBP);
  }
  NewSP = Sub(GPRSize, NewSP, AllocSpace);
  StoreGPRRegister(X86State::REG_RSP, NewSP);
  StoreGPRRegister(X86State::REG_RBP, temp_RBP);
}

// Stores an emulated descriptor-table register image to the destination address:
// a 16-bit limit followed at offset 2 by the base address.
void OpDispatchBuilder::SGDTOp(OpcodeArgs) {
  auto DestAddress = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false});

  // Store an emulated value in the format of:
  // uint16_t Limit;
  // {uint32_t,uint64_t} Base;
  //
  // Limit is always 0
  // Base is always in kernel space at: 0xFFFFFFFFFFFE0000ULL
  //
  // Operand size prefix is ignored on this instruction, size purely depends on operating mode.
  uint64_t GDTAddress = 0xFFFFFFFFFFFE0000ULL;
  auto GDTStoreSize = OpSize::i64Bit;
  if (!Is64BitMode) {
    // Mask off upper bits if 32-bit result.
    GDTAddress &= ~0U;
    GDTStoreSize = OpSize::i32Bit;
  }

  _StoreMemGPRAutoTSO(OpSize::i16Bit, DestAddress, Constant(0));
  _StoreMemGPRAutoTSO(GDTStoreSize, AddressMode {.Base = DestAddress, .Offset = 2, .AddrSize = OpSize::i64Bit}, Constant(GDTAddress));
}

// Same layout as SGDTOp, with a limit of 0xfff and a base of 0xFFFFFE0000000000
// (truncated to 32 bits outside 64-bit mode).
void OpDispatchBuilder::SIDTOp(OpcodeArgs) {
  auto DestAddress = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.LoadData = false});

  // See SGDTOp, matches Linux in reported values
  uint64_t IDTAddress = 0xFFFFFE0000000000ULL;
  auto IDTStoreSize = OpSize::i64Bit;
  if (!Is64BitMode) {
    // Mask off upper bits if 32-bit result.
    IDTAddress &= ~0U;
    IDTStoreSize = OpSize::i32Bit;
  }

  _StoreMemGPRAutoTSO(OpSize::i16Bit, DestAddress, Constant(0xfff));
  _StoreMemGPRAutoTSO(IDTStoreSize, AddressMode {.Base = DestAddress, .Offset = 2, .AddrSize = OpSize::i64Bit}, Constant(IDTAddress));
}

// Stores a fixed, emulated machine status word to the destination.
// The constant's bit layout is annotated field by field below.
void OpDispatchBuilder::SMSWOp(OpcodeArgs) {
  const bool IsMemDst = DestIsMem(Op);

  IR::OpSize DstSize {OpSize::iInvalid};
  Ref Const = Constant((1U << 31) | ///< PG - Paging
                       (0U << 30) | ///< CD - Cache Disable
                       (0U << 29) | ///< NW - Not Writethrough (Legacy, now ignored)
                       ///< [28:19] - Reserved
                       (1U << 18) | ///< AM - Alignment Mask
                       ///< 17 - Reserved
                       (1U << 16) | ///< WP - Write Protect
                       ///< [15:6] - Reserved
                       (1U << 5) | ///< NE - Numeric Error
                       (1U << 4) | ///< ET - Extension Type (Legacy, now reserved and 1)
                       (0U << 3) | ///< TS - Task Switched
                       (0U << 2) | ///< EM - Emulation
                       (1U << 1) | ///< MP - Monitor Coprocessor
                       (1U << 0)); ///< PE - Protection Enabled

  const auto OpAddr = X86Tables::DecodeFlags::GetOpAddr(Op->Flags, 0);
  if (Is64BitMode) {
    DstSize = OpAddr == X86Tables::DecodeFlags::FLAG_OPERAND_SIZE_LAST  ? OpSize::i16Bit :
              OpAddr == X86Tables::DecodeFlags::FLAG_WIDENING_SIZE_LAST ? OpSize::i64Bit :
                                                                          OpSize::i32Bit;

    if (!IsMemDst && DstSize == OpSize::i32Bit) {
      // Special-case version of `smsw ebx`. This instruction does an insert in to the lower 32-bits on 64-bit hosts.
      // Override and insert.
auto Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GetGPROpSize(), Op->Flags);
      Const = _Bfi(OpSize::i64Bit, 32, 0, Dest, Const);
      DstSize = OpSize::i64Bit;
    }
  } else {
    DstSize = OpAddr == X86Tables::DecodeFlags::FLAG_OPERAND_SIZE_LAST ? OpSize::i16Bit : OpSize::i32Bit;
  }

  if (IsMemDst) {
    // Memory destination always writes only 16-bits.
    DstSize = OpSize::i16Bit;
  }

  StoreResultGPR_WithOpSize(Op, Op->Dest, Const, DstSize);
}

// Reads the cycle counter (_CycleCounter) and splits it into low and high halves.
// When Config.TSCScale is non-zero the counter is scaled up by a left shift of
// that many bits before being split.
OpDispatchBuilder::CycleCounterPair OpDispatchBuilder::CycleCounter(bool SelfSynchronizingLoads) {
  Ref CounterLow {};
  Ref CounterHigh {};
  auto Counter = _CycleCounter(SelfSynchronizingLoads);
  if (CTX->Config.TSCScale) {
    // Low half: 32-bit shift left. High half: (Counter << Scale) >> 32 == Counter >> (32 - Scale).
    // NOTE(review): unlike the _Bfe on the unscaled path, CounterHigh is not masked to
    // 32 bits here - confirm the upper counter bits cannot be set in practice.
    CounterLow = _Lshl(OpSize::i32Bit, Counter, Constant(CTX->Config.TSCScale));
    CounterHigh = _Lshr(OpSize::i64Bit, Counter, Constant(32 - CTX->Config.TSCScale));
  } else {
    CounterLow = _Bfe(OpSize::i64Bit, 32, 0, Counter);
    CounterHigh = _Bfe(OpSize::i64Bit, 32, 32, Counter);
  }

  return {
    .CounterLow = CounterLow,
    .CounterHigh = CounterHigh,
  };
}

// Writes the low counter half to RAX and the high half to RDX.
void OpDispatchBuilder::RDTSCOp(OpcodeArgs) {
  auto Counter = CycleCounter(false);
  StoreGPRRegister(X86State::REG_RAX, Counter.CounterLow);
  StoreGPRRegister(X86State::REG_RDX, Counter.CounterHigh);
}

// Increment of the destination operand by one.
// Locked memory destinations use _AtomicFetchAdd, which performs the update itself,
// so the final StoreResultGPR is skipped for them. The small-operand FlagM path
// comments that C is preserved; the generic path passes `false` as the last
// argument of CalculateFlags_ADD.
void OpDispatchBuilder::INCOp(OpcodeArgs) {
  Ref Dest;
  Ref Result;
  // Operand size in bits.
  const auto Size = GetSrcBitSize(Op);
  const bool IsLocked = DestIsLockedMem(Op);

  if (IsLocked) {
    HandledLock = true;

    Ref DestAddress = MakeSegmentAddress(Op, Op->Dest);
    Dest = _AtomicFetchAdd(OpSizeFromSrc(Op), Constant(1), DestAddress);
  } else {
    Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 32});
  }

  CalculateDeferredFlags();

  if (Size < 32 && CTX->HostFeatures.SupportsFlagM) {
    // Addition producing upper garbage
    Result = Add(OpSize::i32Bit, Dest, 1);
    CalculatePF(Result);
    CalculateAF(Dest, Constant(1));

    // Correctly set NZ flags, preserving C
    HandleNZCV_RMW();
    _SetSmallNZV(OpSizeFromSrc(Op), Result);

    // Fix up V flag. INC overflows only when incrementing a positive and
    // getting a negative. So compare the sign bits to calculate V.
    _RmifNZCV(_Andn(OpSize::i32Bit, Result, Dest), Size - 1, 1);
  } else {
    Result = CalculateFlags_ADD(OpSizeFromSrc(Op), Dest, Constant(1), false);
  }

  if (!IsLocked) {
    StoreResultGPR(Op, Result);
  }
}

// Decrement of the destination operand by one. Structured like INCOp.
void OpDispatchBuilder::DECOp(OpcodeArgs) {
  Ref Dest;
  Ref Result;
  // Operand size in bits.
  const auto Size = GetSrcBitSize(Op);
  const bool IsLocked = DestIsLockedMem(Op);

  if (IsLocked) {
    HandledLock = true;

    Ref DestAddress = MakeSegmentAddress(Op, Op->Dest);

    // Use Add instead of Sub to avoid a NEG
    // The addend is the all-ones value of the operand width; 64 is special-cased
    // because (1ULL << 64) would be an out-of-range shift.
    Dest = _AtomicFetchAdd(OpSizeFromSrc(Op), Constant(Size == 64 ? -1 : ((1ULL << Size) - 1)), DestAddress);
  } else {
    Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= 32});
  }

  CalculateDeferredFlags();

  if (Size < 32 && CTX->HostFeatures.SupportsFlagM) {
    // Subtraction producing upper garbage
    Result = Sub(OpSize::i32Bit, Dest, 1);
    CalculatePF(Result);
    CalculateAF(Dest, Constant(1));

    // Correctly set NZ flags, preserving C
    HandleNZCV_RMW();
    _SetSmallNZV(OpSizeFromSrc(Op), Result);

    // Fix up V flag. DEC overflows only when decrementing a negative and
    // getting a positive. So compare the sign bits to calculate V.
    _RmifNZCV(_Andn(OpSize::i32Bit, Dest, Result), Size - 1, 1);
  } else {
    Result = CalculateFlags_SUB(OpSizeFromSrc(Op), Dest, Constant(1), false);
  }

  if (!IsLocked) {
    StoreResultGPR(Op, Result);
  }
}

// Store-string: writes Src[0] to the ES-relative address in RDI.
// Without a REP/REPNE prefix a single element is stored and RDI is advanced;
// with one, the whole run is emitted as a single MemSet IR op.
// An address-size override is not supported and is reported as a decode failure.
void OpDispatchBuilder::STOSOp(OpcodeArgs) {
  if (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) {
    LogMan::Msg::EFmt("STOSOp: Can't handle address size override (OP: 0x{:04X}, Flags: 0x{:08X})", Op->OP, Op->Flags);
    DecodeFailure = true;
    return;
  }

  const auto Size = OpSizeFromSrc(Op);
  const bool Repeat = (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) != 0;

  if (!Repeat) {
    // Src is used only for a store of the same size so allow garbage
    Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});

    // Only ES prefix
    Ref Dest = MakeSegmentAddress(X86State::REG_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

    // Store to memory where RDI points
    if (CTX->IsMemcpyAtomicTSOEnabled()) {
      _StoreMemGPRAutoTSO(Size, Dest, Src, Size);
    } else {
      _StoreMem(RegClass::GPR, Size, Src, Dest, Invalid(), OpSize::i8Bit, MemOffsetType::SXTX, 1);
    }

    // Offset the pointer
    // (the raw RDI is reloaded so the segment-adjusted address is not written back)
    Ref TailDest = LoadGPRRegister(X86State::REG_RDI);
    StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, IR::OpSizeToSize(Size)));
  } else {
    // FEX doesn't support partial faulting REP instructions.
    // Converting this to a `MemSet` IR op optimizes this quite significantly in our codegen.
    // If FEX is to gain support for faulting REP instructions, then this implementation needs to change significantly.
Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags); Ref Dest = LoadGPRRegister(X86State::REG_RDI); // Only ES prefix auto Segment = GetSegment(0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); Ref Counter = LoadGPRRegister(X86State::REG_RCX); auto Result = _MemSet(CTX->IsAtomicTSOEnabled(), Size, Segment ?: InvalidNode, Dest, Src, Counter, LoadDir(1)); StoreGPRRegister(X86State::REG_RCX, Constant(0)); StoreGPRRegister(X86State::REG_RDI, Result); } } void OpDispatchBuilder::MOVSOp(OpcodeArgs) { if (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) { LogMan::Msg::EFmt("MOVSOp: Can't handle address size override (OP: 0x{:04X}, Flags: 0x{:08X})", Op->OP, Op->Flags); DecodeFailure = true; return; } // RA now can handle these to be here, to avoid DF accesses const auto Size = OpSizeFromSrc(Op); if (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) { auto SrcAddr = LoadGPRRegister(X86State::REG_RSI); auto DstAddr = LoadGPRRegister(X86State::REG_RDI); auto Counter = LoadGPRRegister(X86State::REG_RCX); auto DstSegment = GetSegment(0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); auto SrcSegment = GetSegment(Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); if (DstSegment) { DstAddr = Add(OpSize::i64Bit, DstAddr, DstSegment); } if (SrcSegment) { SrcAddr = Add(OpSize::i64Bit, SrcAddr, SrcSegment); } Ref Result_Src = _AllocateGPR(false); Ref Result_Dst = _AllocateGPR(false); _MemCpy(CTX->IsAtomicTSOEnabled(), Size, DstAddr, SrcAddr, Counter, LoadDir(1), Result_Dst, Result_Src); if (DstSegment) { Result_Dst = Sub(OpSize::i64Bit, Result_Dst, DstSegment); } if (SrcSegment) { Result_Src = Sub(OpSize::i64Bit, Result_Src, SrcSegment); } StoreGPRRegister(X86State::REG_RCX, Constant(0)); StoreGPRRegister(X86State::REG_RDI, Result_Dst); StoreGPRRegister(X86State::REG_RSI, Result_Src); } else { Ref RSI = MakeSegmentAddress(X86State::REG_RSI, Op->Flags, 
X86Tables::DecodeFlags::FLAG_DS_PREFIX); Ref RDI = MakeSegmentAddress(X86State::REG_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); if (CTX->IsMemcpyAtomicTSOEnabled()) { auto Src = _LoadMemGPRAutoTSO(Size, RSI, Size); // Store to memory where RDI points _StoreMemGPRAutoTSO(Size, RDI, Src, Size); } else { auto Src = _LoadMem(RegClass::GPR, Size, RSI, Invalid(), OpSize::i8Bit, MemOffsetType::SXTX, 1); _StoreMem(RegClass::GPR, Size, Src, RDI, Invalid(), OpSize::i8Bit, MemOffsetType::SXTX, 1); } RSI = OffsetByDir(RSI, IR::OpSizeToSize(Size)); RDI = OffsetByDir(RDI, IR::OpSizeToSize(Size)); StoreGPRRegister(X86State::REG_RSI, RSI); StoreGPRRegister(X86State::REG_RDI, RDI); } } IR::OpSize OpDispatchBuilder::GetStringOpSize(X86Tables::DecodedOp Op) const { LOGMAN_THROW_A_FMT(Is64BitMode || !(Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE), "Invalid modifier on 32bit address"); return !Is64BitMode || (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? OpSize::i32Bit : OpSize::i64Bit; } void OpDispatchBuilder::CMPSOp(OpcodeArgs) { if (!Is64BitMode && (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE)) { LogMan::Msg::EFmt("CMPSOp: Address size override (0x67) not supported in 32-bit mode (OP: 0x{:04X}).", Op->OP); DecodeFailure = true; return; } const auto Size = OpSizeFromSrc(Op); OpSize AddrSize = GetStringOpSize(Op); bool Repeat = Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX); if (!Repeat) { Ref Src_RSI = LoadGPRRegister(X86State::REG_RSI, AddrSize); Ref Src_RDI = LoadGPRRegister(X86State::REG_RDI, AddrSize); Ref Dest_RSI = AppendSegmentOffset(Src_RSI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX); Ref Dest_RDI = AppendSegmentOffset(Src_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); auto Src1 = _LoadMemGPRAutoTSO(Size, Dest_RDI, Size); auto Src2 = _LoadMemGPRAutoTSO(Size, Dest_RSI, Size); CalculateFlags_SUB(OpSizeFromSrc(Op), Src2, Src1); 
Dest_RDI = OffsetByDir(Src_RDI, IR::OpSizeToSize(Size));
    // With a 32-bit address size in 64-bit mode the pointer is zero-extended and
    // written as a full register; otherwise it is stored at AddrSize.
    if (Is64BitMode && AddrSize == OpSize::i32Bit) {
      Dest_RDI = _Bfe(OpSize::i64Bit, 32, 0, Dest_RDI);
      StoreGPRRegister(X86State::REG_RDI, Dest_RDI);
    } else {
      StoreGPRRegister(X86State::REG_RDI, Dest_RDI, AddrSize);
    }

    Dest_RSI = OffsetByDir(Src_RSI, IR::OpSizeToSize(Size));
    if (Is64BitMode && AddrSize == OpSize::i32Bit) {
      Dest_RSI = _Bfe(OpSize::i64Bit, 32, 0, Dest_RSI);
      StoreGPRRegister(X86State::REG_RSI, Dest_RSI);
    } else {
      StoreGPRRegister(X86State::REG_RSI, Dest_RSI, AddrSize);
    }
  } else {
    // Calculate flags early.
    CalculateDeferredFlags();

    // REP prefix => repeat while equal; otherwise (REPNE) repeat while not equal.
    bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

    // If rcx = 0, skip the whole loop.
    Ref Counter = LoadGPRRegister(X86State::REG_RCX);
    auto OuterJump = CondJump(Counter, CondClass::EQ);

    auto BeforeLoop = CreateNewCodeBlockAfter(GetCurrentBlock());
    SetFalseJumpTarget(OuterJump, BeforeLoop);
    SetCurrentCodeBlock(BeforeLoop);
    StartNewBlock();

    ForeachDirection([this, Op, Size, AddrSize, REPE](int32_t PtrDir) {
      // NOTE(review): `IRPair` has no template argument here - it appears to have
      // been lost from this copy of the source; restore it before compiling.
      IRPair InnerJump;
      auto JumpIntoLoop = Jump();

      // Setup for the loop
      auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock());
      SetCurrentCodeBlock(LoopHeader);
      StartNewBlock();
      SetJumpTarget(JumpIntoLoop, LoopHeader);

      // Working loop
      {
        Ref Src_RSI = LoadGPRRegister(X86State::REG_RSI, AddrSize);
        Ref Src_RDI = LoadGPRRegister(X86State::REG_RDI, AddrSize);
        Ref Dest_RSI = AppendSegmentOffset(Src_RSI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX);
        Ref Dest_RDI = AppendSegmentOffset(Src_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

        // NOTE(review): Src2 uses _LoadMemGPR while Src1 (and both loads on the
        // non-repeat path) use _LoadMemGPRAutoTSO - confirm this asymmetry is intended.
        auto Src1 = _LoadMemGPRAutoTSO(Size, Dest_RDI, Size);
        auto Src2 = _LoadMemGPR(Size, Dest_RSI, Size);

        // We'll calculate PF/AF after the loop, so use them as temporaries here.
        StoreRegister(Core::CPUState::PF_AS_GREG, false, Src1);
        StoreRegister(Core::CPUState::AF_AS_GREG, false, Src2);

        Ref TailCounter = LoadGPRRegister(X86State::REG_RCX);

        // Decrement counter
        TailCounter = SubWithFlags(OpSize::i64Bit, TailCounter, 1);

        // Store the counter since we don't have phis
        StoreGPRRegister(X86State::REG_RCX, TailCounter);

        // NOTE(review): `static_cast` has no target type on the two pointer updates
        // below - the template argument appears to have been lost from this copy
        // of the source; restore it before compiling.
        Dest_RDI = Add(AddrSize, Src_RDI, PtrDir * static_cast(IR::OpSizeToSize(Size)));
        if (Is64BitMode && AddrSize == OpSize::i32Bit) {
          Dest_RDI = _Bfe(OpSize::i64Bit, 32, 0, Dest_RDI);
          StoreGPRRegister(X86State::REG_RDI, Dest_RDI);
        } else {
          StoreGPRRegister(X86State::REG_RDI, Dest_RDI, AddrSize);
        }

        Dest_RSI = Add(AddrSize, Src_RSI, PtrDir * static_cast(IR::OpSizeToSize(Size)));
        if (Is64BitMode && AddrSize == OpSize::i32Bit) {
          Dest_RSI = _Bfe(OpSize::i64Bit, 32, 0, Dest_RSI);
          StoreGPRRegister(X86State::REG_RSI, Dest_RSI);
        } else {
          StoreGPRRegister(X86State::REG_RSI, Dest_RSI, AddrSize);
        }

        // If TailCounter != 0, compare sources.
        // If TailCounter == 0, set ZF iff that would break.
        _CondSubNZCV(OpSize::i64Bit, Src2, Src1, CondClass::NEQ, REPE ? 0 : (1 << 2) /* Z */);
        // The NZCV state was just written directly, so the cached value is dropped
        // and the dirty marker cleared.
        CachedNZCV = nullptr;
        NZCVDirty = false;
        InnerJump = CondJumpNZCV(REPE ? CondClass::EQ : CondClass::NEQ);

        // Jump back to the start if we have more work to do
        SetTrueJumpTarget(InnerJump, LoopHeader);
      }

      // Make sure to start a new block after ending this one
      auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock());
      SetFalseJumpTarget(InnerJump, LoopEnd);
      SetCurrentCodeBlock(LoopEnd);
      StartNewBlock();
    });

    // Make sure to start a new block after ending this one
    {
      // Grab the sources from the last iteration so we can set flags.
      auto Src1 = LoadGPR(Core::CPUState::PF_AS_GREG);
      auto Src2 = LoadGPR(Core::CPUState::AF_AS_GREG);
      CalculateFlags_SUB(OpSizeFromSrc(Op), Src2, Src1);
    }
    auto Jump_ = Jump();

    // Both the loop fall-through and the "RCX was zero" early-out land on Exit.
    auto Exit = CreateNewCodeBlockAfter(GetCurrentBlock());
    SetJumpTarget(Jump_, Exit);
    SetTrueJumpTarget(OuterJump, Exit);
    SetCurrentCodeBlock(Exit);
    StartNewBlock();
  }
}

// Load-string: loads the element at the (DS-overridable) address in RSI into the
// destination operand and advances RSI.
// With a REP/REPNE prefix an explicit loop is emitted that runs until RCX is zero.
// An address-size override is not supported and is reported as a decode failure.
void OpDispatchBuilder::LODSOp(OpcodeArgs) {
  if (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) {
    LogMan::Msg::EFmt("LODSOp: Can't handle address size override (OP: 0x{:04X}, Flags: 0x{:08X})", Op->OP, Op->Flags);
    DecodeFailure = true;
    return;
  }

  const auto Size = OpSizeFromSrc(Op);
  const bool Repeat = (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) != 0;

  if (!Repeat) {
    Ref Dest_RSI = MakeSegmentAddress(X86State::REG_RSI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX);

    auto Src = _LoadMemGPRAutoTSO(Size, Dest_RSI, Size);

    StoreResultGPR(Op, Src);

    // Offset the pointer
    // (the raw RSI is reloaded so the segment-adjusted address is not written back)
    Ref TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);
    StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, IR::OpSizeToSize(Size)));
  } else {
    // Calculate flags early. because end of block
    CalculateDeferredFlags();

    ForeachDirection([this, Op, Size](int32_t PtrDir) {
      // XXX: Theoretically LODS could be optimized to
      // RSI += {-}(RCX * Size)
      // RAX = [RSI - Size]
      // But this might violate the case of an application scanning pages for read permission and catching the fault
      // May or may not matter

      auto JumpStart = Jump();
      // Make sure to start a new block after ending this one
      auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock());
      SetJumpTarget(JumpStart, LoopStart);
      SetCurrentCodeBlock(LoopStart);
      StartNewBlock();

      Ref Counter = LoadGPRRegister(X86State::REG_RCX);

      // Can we end the block?
      // We leave if RCX = 0
      auto CondJump_ = CondJump(Counter, CondClass::EQ);

      auto LoopTail = CreateNewCodeBlockAfter(LoopStart);
      SetFalseJumpTarget(CondJump_, LoopTail);
      SetCurrentCodeBlock(LoopTail);
      StartNewBlock();

      // Working loop
      {
        Ref Dest_RSI = MakeSegmentAddress(X86State::REG_RSI, Op->Flags, X86Tables::DecodeFlags::FLAG_DS_PREFIX);

        auto Src = _LoadMemGPRAutoTSO(Size, Dest_RSI, Size);

        StoreResultGPR(Op, Src);

        Ref TailCounter = LoadGPRRegister(X86State::REG_RCX);
        Ref TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);

        // Decrement counter
        TailCounter = Sub(OpSize::i64Bit, TailCounter, 1);

        // Store the counter since we don't have phis
        StoreGPRRegister(X86State::REG_RCX, TailCounter);

        // Offset the pointer
        // NOTE(review): `static_cast` has no target type here - the template argument
        // appears to have been lost from this copy of the source; restore it before compiling.
        TailDest_RSI = Add(OpSize::i64Bit, TailDest_RSI, PtrDir * static_cast(IR::OpSizeToSize(Size)));
        StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);

        // Jump back to the start, we have more work to do
        Jump(LoopStart);
      }
      // Make sure to start a new block after ending this one
      auto LoopEnd = CreateNewCodeBlockAfter(LoopTail);
      SetTrueJumpTarget(CondJump_, LoopEnd);
      SetCurrentCodeBlock(LoopEnd);
      StartNewBlock();
    });
  }
}

void OpDispatchBuilder::SCASOp(OpcodeArgs) {
  if (!Is64BitMode && (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE)) {
    LogMan::Msg::EFmt("SCASOp: Address size override (0x67) not supported in 32-bit mode (OP: 0x{:04X}).", Op->OP);
    DecodeFailure = true;
    return;
  }

  const auto Size = OpSizeFromSrc(Op);
  OpSize AddrSize = GetStringOpSize(Op);
  const bool Repeat = (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX)) != 0;

  if (!Repeat) {
    Ref Src_RDI = LoadGPRRegister(X86State::REG_RDI, AddrSize);
    Ref Dest_RDI = AppendSegmentOffset(Src_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

    auto Src1 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
    auto Src2 = _LoadMemGPRAutoTSO(Size, Dest_RDI, Size);

    CalculateFlags_SUB(OpSizeFromSrc(Op), Src1, Src2);

    Ref
TailDest_RDI = OffsetByDir(Src_RDI, IR::OpSizeToSize(Size)); if (Is64BitMode && AddrSize == OpSize::i32Bit) { TailDest_RDI = _Bfe(OpSize::i64Bit, 32, 0, TailDest_RDI); StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); } else { StoreGPRRegister(X86State::REG_RDI, TailDest_RDI, AddrSize); } } else { // Calculate flags early. because end of block CalculateDeferredFlags(); ForeachDirection([this, Op, Size, AddrSize](int32_t Dir) { bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; auto JumpStart = Jump(); // Make sure to start a new block after ending this one auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); SetJumpTarget(JumpStart, LoopStart); SetCurrentCodeBlock(LoopStart); StartNewBlock(); Ref Counter = LoadGPRRegister(X86State::REG_RCX); // Can we end the block? // We leave if RCX = 0 auto CondJump_ = CondJump(Counter, CondClass::EQ); IRPair InternalCondJump; auto LoopTail = CreateNewCodeBlockAfter(LoopStart); SetFalseJumpTarget(CondJump_, LoopTail); SetCurrentCodeBlock(LoopTail); StartNewBlock(); // Working loop { Ref Src_RDI = LoadGPRRegister(X86State::REG_RDI, AddrSize); Ref Dest_RDI = AppendSegmentOffset(Src_RDI, 0, X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); auto Src1 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Src2 = _LoadMemGPRAutoTSO(Size, Dest_RDI, Size); CalculateFlags_SUB(OpSizeFromSrc(Op), Src1, Src2); // Calculate flags early. 
CalculateDeferredFlags(); Ref TailCounter = LoadGPRRegister(X86State::REG_RCX); Ref Src_RDI_Tail = LoadGPRRegister(X86State::REG_RDI, AddrSize); // Decrement counter TailCounter = Sub(OpSize::i64Bit, TailCounter, 1); // Store the counter since we don't have phis StoreGPRRegister(X86State::REG_RCX, TailCounter); Ref TailDest_RDI = Add(AddrSize, Src_RDI_Tail, Dir * static_cast(IR::OpSizeToSize(Size))); if (Is64BitMode && AddrSize == OpSize::i32Bit) { TailDest_RDI = _Bfe(OpSize::i64Bit, 32, 0, TailDest_RDI); StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); } else { StoreGPRRegister(X86State::REG_RDI, TailDest_RDI, AddrSize); } CalculateDeferredFlags(); InternalCondJump = CondJumpNZCV(REPE ? CondClass::EQ : CondClass::NEQ); // Jump back to the start if we have more work to do SetTrueJumpTarget(InternalCondJump, LoopStart); } // Make sure to start a new block after ending this one auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); SetTrueJumpTarget(CondJump_, LoopEnd); SetFalseJumpTarget(InternalCondJump, LoopEnd); SetCurrentCodeBlock(LoopEnd); StartNewBlock(); }); } } void OpDispatchBuilder::BSWAPOp(OpcodeArgs) { Ref Dest; const auto Size = OpSizeFromSrc(Op); if (Size == OpSize::i16Bit) { // BSWAP of 16bit is undef. 
ZEN+ causes the lower 16bits to get zero'd Dest = Constant(0); } else { Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GetGPROpSize(), Op->Flags); Dest = _Rev(Size, Dest); } StoreResultGPR(Op, Dest); } void OpDispatchBuilder::PUSHFOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Push(Size, GetPackedRFLAG()); } void OpDispatchBuilder::POPFOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); Ref Src = Pop(Size); // Add back our flag constants // Bit 1 is always 1 // Bit 9 is always 1 because we always have interrupts enabled Src = _Or(OpSize::i64Bit, Src, Constant(0x202)); SetPackedRFLAG(false, Src); auto NewRIP = GetRelocatedPC(Op); ExitFunction(NewRIP, BranchHint::CheckTF); BlockSetRIP = true; } void OpDispatchBuilder::NEGOp(OpcodeArgs) { HandledLock = (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK) != 0; const auto Size = OpSizeFromSrc(Op); auto ZeroConst = Constant(0); if (DestIsLockedMem(Op)) { Ref DestMem = MakeSegmentAddress(Op, Op->Dest); Ref Dest = _AtomicFetchNeg(Size, DestMem); CalculateFlags_SUB(Size, ZeroConst, Dest); } else { Ref Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); Ref Result = CalculateFlags_SUB(Size, ZeroConst, Dest); StoreResultGPR(Op, Result); } } void OpDispatchBuilder::DIVOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); auto Size = OpSizeFromSrc(Op); // This loads the divisor. 32-bit/64-bit paths mask inside the JIT, 8/16 do not. 
Ref Divisor = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = Size >= OpSize::i32Bit}); if (Size == OpSize::i64Bit && !Is64BitMode) { LogMan::Msg::EFmt("Doesn't exist in 32bit mode"); DecodeFailure = true; return; } Ref Quotient = _AllocateGPR(true); Ref Remainder = _AllocateGPR(true); if (Size == OpSize::i8Bit) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX, OpSize::i16Bit); _UDiv(OpSize::i16Bit, Src1, Invalid(), Divisor, Quotient, Remainder); // AX[15:0] = concat auto ResultAX = _Bfi(GPRSize, 8, 8, Quotient, Remainder); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else { Ref Src1 = LoadGPRRegister(X86State::REG_RAX); Ref Src2 = LoadGPRRegister(X86State::REG_RDX); _UDiv(Size, Src1, Src2, Divisor, Quotient, Remainder); if (Size == OpSize::i32Bit) { Quotient = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Quotient); Remainder = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Remainder); Size = OpSize::iInvalid; } StoreGPRRegister(X86State::REG_RAX, Quotient, Size); StoreGPRRegister(X86State::REG_RDX, Remainder, Size); } } void OpDispatchBuilder::IDIVOp(OpcodeArgs) { // This loads the divisor Ref Divisor = LoadSourceGPR(Op, Op->Dest, Op->Flags); const auto GPRSize = GetGPROpSize(); auto Size = OpSizeFromSrc(Op); if (Size == OpSize::i64Bit && !Is64BitMode) { LogMan::Msg::EFmt("Doesn't exist in 32bit mode"); DecodeFailure = true; return; } Ref Quotient = _AllocateGPR(true); Ref Remainder = _AllocateGPR(true); if (Size == OpSize::i8Bit) { Ref Src1 = LoadGPRRegister(X86State::REG_RAX); Src1 = _Sbfe(OpSize::i64Bit, 16, 0, Src1); Divisor = _Sbfe(OpSize::i64Bit, 8, 0, Divisor); _Div(OpSize::i64Bit, Src1, Invalid(), Divisor, Quotient, Remainder); // AX[15:0] = concat auto ResultAX = _Bfi(GPRSize, 8, 8, Quotient, Remainder); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else { Ref Src1 = LoadGPRRegister(X86State::REG_RAX); Ref Src2 = LoadGPRRegister(X86State::REG_RDX); _Div(Size, Src1, Src2, Divisor, Quotient, Remainder); 
if (Size == OpSize::i32Bit) { Quotient = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Quotient); Remainder = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, Remainder); Size = OpSize::iInvalid; } StoreGPRRegister(X86State::REG_RAX, Quotient, Size); StoreGPRRegister(X86State::REG_RDX, Remainder, Size); } } void OpDispatchBuilder::BSFOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); const auto DstSize = OpSizeFromDst(Op) == OpSize::i16Bit ? OpSize::i16Bit : GPRSize; Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags, {.AllowUpperGarbage = true}); Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); // Find the LSB of this source auto Result = _FindLSB(OpSizeFromSrc(Op), Src); // OF, SF, AF, PF, CF all undefined // ZF is set to 1 if the source was zero SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src); // If Src was zero then the destination doesn't get modified. // // Although Intel does not guarantee that semantic, AMD does and Intel // hardware satisfies it. We provide the stronger AMD behaviour as // applications might rely on that in the wild. auto SelectOp = NZCVSelect(GPRSize, CondClass::EQ, Dest, Result); StoreResultGPR_WithOpSize(Op, Op->Dest, SelectOp, DstSize); } void OpDispatchBuilder::BSROp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); const auto DstSize = OpSizeFromDst(Op) == OpSize::i16Bit ? 
OpSize::i16Bit : GPRSize; Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, DstSize, Op->Flags, {.AllowUpperGarbage = true}); Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); // Find the MSB of this source auto Result = _FindMSB(OpSizeFromSrc(Op), Src); // OF, SF, AF, PF, CF all undefined // ZF is set to 1 if the source was zero SetZ_InvalidateNCV(OpSizeFromSrc(Op), Src); // If Src was zero then the destination doesn't get modified auto SelectOp = NZCVSelect(GPRSize, CondClass::EQ, Dest, Result); StoreResultGPR_WithOpSize(Op, Op->Dest, SelectOp, DstSize); } void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { // CMPXCHG ModRM, reg, {RAX} // MemData = *ModRM.dest // if (RAX == MemData) // modRM.dest = reg; // ZF = 1 // else // ZF = 0 // RAX = MemData // // CASL Xs, Xt, Xn // MemData = *Xn // if (MemData == Xs) // *Xn = Xt // Xs = MemData const auto GPRSize = GetGPROpSize(); auto Size = OpSizeFromSrc(Op); if (Op->Dest.IsGPR()) { // This is our source register Ref Src2 = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); Ref Src3 = LoadGPRRegister(X86State::REG_RAX); // If the destination is also the accumulator, we get some algebraic // simplifications. Not sure if this is actually hit but it's in // InstCountCI. bool Trivial = Op->Dest.Data.GPR.GPR == X86State::REG_RAX && !Op->Dest.IsGPRDirect() && !Op->Dest.Data.GPR.HighBits; Ref Src1 {}; Ref Src1Lower {}; if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { Src1 = LoadSourceGPR_WithOpSize(Op, Op->Dest, GPRSize, Op->Flags, {.AllowUpperGarbage = true}); Src1Lower = Trivial ? Src1 : _Bfe(GPRSize, IR::OpSizeAsBits(Size), 0, Src1); } else { Src1 = LoadSourceGPR_WithOpSize(Op, Op->Dest, Size, Op->Flags, {.AllowUpperGarbage = true}); Src1Lower = Src1; } // Compare RAX with the destination, setting flags accordingly. 
CalculateFlags_SUB(OpSizeFromSrc(Op), Src3, Src1Lower); CalculateDeferredFlags(); if (!Trivial) { if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { // This allows us to only hit the ZEXT case on failure Ref RAXResult = NZCVSelect(OpSize::i64Bit, CondClass::EQ, Src3, Src1Lower); // When the size is 4 we need to make sure not zext the GPR when the comparison fails StoreGPRRegister(X86State::REG_RAX, RAXResult); } else { StoreGPRRegister(X86State::REG_RAX, Src1Lower, Size); } } // Op1 = RAX == Op1 ? Op2 : Op1 // If they match then set the rm operand to the input // else don't set the rm operand Ref Src2Lower = Src2; if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { Src2Lower = _Bfe(GPRSize, IR::OpSizeAsBits(Size), 0, Src2); } Ref DestResult = Trivial ? Src2 : NZCVSelect(OpSize::i64Bit, CondClass::EQ, Src2Lower, Src1); // Store in to GPR Dest if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { StoreResultGPR_WithOpSize(Op, Op->Dest, DestResult, GPRSize); } else { StoreResultGPR(Op, DestResult); } } else { Ref Src2 = LoadSourceGPR(Op, Op->Src[0], Op->Flags); HandledLock = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK; auto Src3 = LoadGPRRegister(X86State::REG_RAX); auto Src3Lower = _Bfe(OpSize::i64Bit, OpSizeAsBits(Size), 0, Src3); // If this is a memory location then we want the pointer to it Ref Src1 = MakeSegmentAddress(Op, Op->Dest); // DataSrc = *Src1 // if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc // This will write to memory! Careful! 
// Third operand must be a calculated guest memory address Ref CASResult = _CAS(Size, Src3, Src2, Src1); Ref RAXResult = CASResult; CalculateFlags_SUB(OpSizeFromSrc(Op), Src3Lower, CASResult); CalculateDeferredFlags(); if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { // This allows us to only hit the ZEXT case on failure RAXResult = _NZCVSelect(OpSize::i64Bit, CondClass::EQ, Src3, CASResult); Size = OpSize::i64Bit; } // RAX gets the result of the CAS op StoreGPRRegister(X86State::REG_RAX, RAXResult, Size); } } void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) { // Calculate flags early. CalculateDeferredFlags(); // REX.W used to determine if it is 16byte or 8byte // Unlike CMPXCHG, the destination can only be a memory location const auto Size = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REX_WIDENING ? OpSize::i64Bit : OpSize::i32Bit; HandledLock = (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK) != 0; // If this is a memory location then we want the pointer to it Ref Src1 = MakeSegmentAddress(Op, Op->Dest); // Load the full 64-bit registers, all the users ignore the upper 32-bits for // 32-bit only cmpxchg. This saves some zero extension. Ref Expected_Lower = LoadGPRRegister(X86State::REG_RAX); Ref Expected_Upper = LoadGPRRegister(X86State::REG_RDX); Ref Desired_Lower = LoadGPRRegister(X86State::REG_RBX); Ref Desired_Upper = LoadGPRRegister(X86State::REG_RCX); // ssa0 = Expected // ssa1 = Desired // ssa2 = MemoryLocation // DataSrc = *MemSrc // if (DataSrc == Expected) { *MemSrc == Desired; } Expected = DataSrc // This will write to memory! Careful! 
// Third operand must be a calculated guest memory address Ref Result_Lower = _AllocateGPR(true); Ref Result_Upper = _AllocateGPRAfter(Result_Lower); _CASPair(Size, Expected_Lower, Expected_Upper, Desired_Lower, Desired_Upper, Src1, Result_Lower, Result_Upper); HandleNZCV_RMW(); _CmpPairZ(Size, Result_Lower, Result_Upper, Expected_Lower, Expected_Upper); CalculateDeferredFlags(); auto UpdateIfNotZF = [this](auto Reg, auto Value) { // Always use 64-bit csel to preserve existing upper bits. If we have a // 32-bit cmpxchg in a 64-bit context, Value will be zeroed in upper bits. StoreGPRRegister(Reg, NZCVSelect(OpSize::i64Bit, CondClass::NEQ, Value, LoadGPRRegister(Reg))); }; UpdateIfNotZF(X86State::REG_RAX, Result_Lower); UpdateIfNotZF(X86State::REG_RDX, Result_Upper); } void OpDispatchBuilder::CreateJumpBlocks(const fextl::vector* Blocks) { Ref PrevCodeBlock {}; for (auto& Target : *Blocks) { auto CodeNode = CreateCodeNode(Target.IsEntryPoint, Target.Entry - Entry); JumpTargets.try_emplace(Target.Entry, JumpTargetInfo {CodeNode, false, Target.IsEntryPoint}); if (PrevCodeBlock) { LinkCodeBlocks(PrevCodeBlock, CodeNode); } PrevCodeBlock = CodeNode; } } void OpDispatchBuilder::BeginFunction(uint64_t RIP, const fextl::vector* Blocks, uint32_t NumInstructions, bool _Is64BitMode, bool MonoBackpatcherBlock) { Entry = RIP; Is64BitMode = _Is64BitMode; LOGMAN_THROW_A_FMT(Is64BitMode == CTX->Config.Is64BitMode, "Expected operating mode to not change at runtime!"); IsMonoBackpatcherBlock = MonoBackpatcherBlock; auto IRHeader = _IRHeader(InvalidNode, RIP, 0, NumInstructions, 0, 0); CreateJumpBlocks(Blocks); auto Block = GetNewJumpBlock(RIP); SetCurrentCodeBlock(Block); IRHeader.first->Blocks = Block->Wrapped(DualListData.ListBegin()); CurrentHeader = IRHeader.first; } void OpDispatchBuilder::Finalize() { // This usually doesn't emit any IR but in the case of hitting the block instruction limit it will FlushRegisterCache(); const auto GPRSize = GetGPROpSize(); // Node 0 is invalid 
node Ref RealNode = reinterpret_cast(GetNode(1)); const FEXCore::IR::IROp_Header* IROp = RealNode->Op(DualListData.DataBegin()); LOGMAN_THROW_A_FMT(IROp->Op == OP_IRHEADER, "First op in function must be our header"); // Let's walk the jump blocks and see if we have handled every block target for (auto& Handler : JumpTargets) { if (Handler.second.HaveEmitted) { continue; } // We haven't emitted. Dump out to the dispatcher SetCurrentCodeBlock(Handler.second.BlockEntry); ExitFunction(_InlineEntrypointOffset(GPRSize, Handler.first - Entry)); } } uint8_t OpDispatchBuilder::GetDstSize(X86Tables::DecodedOp Op) const { const uint32_t DstSizeFlag = X86Tables::DecodeFlags::GetSizeDstFlags(Op->Flags); LOGMAN_THROW_A_FMT(DstSizeFlag != 0 && DstSizeFlag != X86Tables::DecodeFlags::SIZE_MASK, "Invalid destination size for op"); return 1u << (DstSizeFlag - 1); } uint8_t OpDispatchBuilder::GetSrcSize(X86Tables::DecodedOp Op) const { const uint32_t SrcSizeFlag = X86Tables::DecodeFlags::GetSizeSrcFlags(Op->Flags); LOGMAN_THROW_A_FMT(SrcSizeFlag != 0 && SrcSizeFlag != X86Tables::DecodeFlags::SIZE_MASK, "Invalid destination size for op"); return 1u << (SrcSizeFlag - 1); } uint32_t OpDispatchBuilder::GetSrcBitSize(X86Tables::DecodedOp Op) const { return GetSrcSize(Op) * 8; } uint32_t OpDispatchBuilder::GetDstBitSize(X86Tables::DecodedOp Op) const { return GetDstSize(Op) * 8; } Ref OpDispatchBuilder::GetSegment(uint32_t Flags, uint32_t DefaultPrefix, bool Override) { const auto GPRSize = GetGPROpSize(); uint32_t Prefix = Flags & FEXCore::X86Tables::DecodeFlags::FLAG_SEGMENTS; if (Is64BitMode) { if (Prefix == FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX) { return _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, fs_cached)); } else if (Prefix == FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX) { return _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, gs_cached)); } // If there was any other segment in 64bit then it is ignored } else { if (Prefix == 
FEXCore::X86Tables::DecodeFlags::FLAG_NO_PREFIX || Override) { // If there was no prefix then use the default one if available // Or the argument only uses a specific prefix (with override set) Prefix = DefaultPrefix; } // With the segment register optimization we store the GDT bases directly in the segment register to remove indexed loads Ref SegmentResult {}; switch (Prefix) { [[likely]] case FEXCore::X86Tables::DecodeFlags::FLAG_NO_PREFIX: return nullptr; case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, es_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, cs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, ss_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, ds_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, fs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX: SegmentResult = _LoadContextGPR(GPRSize, offsetof(FEXCore::Core::CPUState, gs_cached)); break; default: FEX_UNREACHABLE; } CheckLegacySegmentRead(SegmentResult, Prefix); return SegmentResult; } return nullptr; } Ref OpDispatchBuilder::AppendSegmentOffset(Ref Value, uint32_t Flags, uint32_t DefaultPrefix, bool Override) { auto Segment = GetSegment(Flags, DefaultPrefix, Override); if (Segment) { Value = Add(std::max(OpSize::i32Bit, std::max(GetOpSize(Value), GetOpSize(Segment))), Value, Segment); } return Value; } void OpDispatchBuilder::CheckLegacySegmentRead(Ref NewNode, uint32_t SegmentReg) { #ifndef FEX_DISABLE_TELEMETRY if (SegmentReg == FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX || SegmentReg == 
FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX) { // FS and GS segments aren't considered legacy. return; } if (!(SegmentsNeedReadCheck & SegmentReg)) { // If the block has done multiple reads of a segment register then skip redundant read checks. // Segment write will cause another read check. return; } if (CTX->Config.DisableTelemetry()) { // Telemetry disabled at runtime. return; } FEXCore::Telemetry::TelemetryType TelemIndex {}; switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_USES_32BIT_SEGMENT_ES; SegmentsNeedReadCheck &= ~FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_USES_32BIT_SEGMENT_CS; SegmentsNeedReadCheck &= ~FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_USES_32BIT_SEGMENT_SS; SegmentsNeedReadCheck &= ~FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_USES_32BIT_SEGMENT_DS; SegmentsNeedReadCheck &= ~FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX; break; default: FEX_UNREACHABLE; } // Will set the telemetry value if NewNode is != 0 _TelemetrySetValue(NewNode, TelemIndex); // Telemetry will dirty flags, and user code does not expect LoadSource to clobber flags, fix that up here as this is an edge case. CalculateDeferredFlags(); #endif } void OpDispatchBuilder::CheckLegacySegmentWrite(Ref NewNode, uint32_t SegmentReg) { #ifndef FEX_DISABLE_TELEMETRY if (SegmentReg == FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX || SegmentReg == FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX) { // FS and GS segments aren't considered legacy. return; } if (CTX->Config.DisableTelemetry()) { // Telemetry disabled at runtime. 
return; } FEXCore::Telemetry::TelemetryType TelemIndex {}; switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_WRITES_32BIT_SEGMENT_ES; SegmentsNeedReadCheck |= FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_WRITES_32BIT_SEGMENT_CS; SegmentsNeedReadCheck |= FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_WRITES_32BIT_SEGMENT_SS; SegmentsNeedReadCheck |= FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX; break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: TelemIndex = FEXCore::Telemetry::TelemetryType::TYPE_WRITES_32BIT_SEGMENT_DS; SegmentsNeedReadCheck |= FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX; break; default: FEX_UNREACHABLE; } // Will set the telemetry value if NewNode is != 0 _TelemetrySetValue(NewNode, TelemIndex); // Telemetry will dirty flags, and user code does not expect LoadSource to clobber flags, fix that up here as this is an edge case. CalculateDeferredFlags(); #endif } void OpDispatchBuilder::UpdatePrefixFromSegment(Ref Segment, uint32_t SegmentReg) { // Use BFE to extract the selector index in bits [15,3] of the segment register. // In some cases the upper 16-bits of the 32-bit GPR contain garbage to ignore. auto GDT = _Bfe(OpSize::i32Bit, 1, 2, Segment); // Fun quirk, if we mask the selector then it is premultiplied by 8 which we need to do for accessing anyway. 
auto SegmentOffset = _And(OpSize::i32Bit, Segment, _Constant(0xfff8)); Ref SegmentBase = _LoadContextGPRIndexed(GDT, OpSize::i64Bit, offsetof(FEXCore::Core::CPUState, segment_arrays[0]), 8); Ref NewSegment = _LoadMemGPR(OpSize::i64Bit, SegmentBase, SegmentOffset, OpSize::i8Bit, MemOffsetType::UXTW, 1); CheckLegacySegmentWrite(NewSegment, SegmentReg); // Extract the 32-bit base from the GDT segment. auto Upper32 = _Lshr(OpSize::i64Bit, NewSegment, _Constant(32)); auto Masked = _And(OpSize::i32Bit, Upper32, _Constant(0xFF00'0000)); Ref Merged = _Orlshr(OpSize::i32Bit, Masked, NewSegment, 16); NewSegment = _Bfi(OpSize::i32Bit, 8, 16, Merged, Upper32); switch (SegmentReg) { case FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, es_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, cs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, ss_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, ds_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, fs_cached)); break; case FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX: _StoreContextGPR(OpSize::i32Bit, NewSegment, offsetof(FEXCore::Core::CPUState, gs_cached)); break; default: break; // Do nothing } } AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, MemoryAccessType AccessType, bool IsLoad) { const auto GPRSize = GetGPROpSize(); AddressMode A {}; A.Segment = GetSegment(Op->Flags); A.AddrSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) != 0 ? 
(GPRSize >> 1) : GPRSize; A.NonTSO = AccessType == MemoryAccessType::NONTSO || AccessType == MemoryAccessType::STREAM; if (Operand.IsLiteral()) { A.Offset = Operand.Literal(); if (Operand.Data.Literal.Size != 8 && IsLoad) { // zero extend uint64_t width = Operand.Data.Literal.Size * 8; A.Offset &= ((1ULL << width) - 1); } } else if (Operand.IsGPR()) { // Not an address, let the caller deal with it A.AddrSize = GPRSize; } else if (Operand.IsGPRDirect()) { A.Base = LoadGPRRegister(Operand.Data.GPR.GPR, GPRSize); A.NonTSO |= IsNonTSOReg(AccessType, Operand.Data.GPR.GPR); } else if (Operand.IsGPRIndirect() || Operand.IsGPRIndirectRelocation()) { A.Base = LoadGPRRegister(Operand.Data.GPRIndirect.GPR, GPRSize); if (Operand.IsGPRIndirectRelocation()) { A.Base = Add(GPRSize, _EntrypointOffset(GPRSize, Operand.Data.GPRIndirect.Displacement), A.Base); } else { A.Offset = static_cast(Operand.Data.GPRIndirect.Displacement); } A.NonTSO |= IsNonTSOReg(AccessType, Operand.Data.GPRIndirect.GPR); } else if (Operand.IsRIPRelative() || Operand.IsRIPRelativeRelocation()) { if (Is64BitMode) { A.Base = GetRelocatedPC(Op, static_cast(Operand.Data.RIPLiteral.Value)); } else { // 32bit this isn't RIP relative but instead absolute if (Operand.IsRIPRelativeRelocation()) { A.Base = _EntrypointOffset(GPRSize, Operand.Data.RIPLiteral.Value); } else { A.Offset = Operand.Data.RIPLiteral.Value; } } } else if (Operand.IsSIB() || Operand.IsSIBRelocation()) { const bool IsVSIB = IsLoad && ((Op->Flags & X86Tables::DecodeFlags::FLAG_VSIB_BYTE) != 0); if (Operand.Data.SIB.Base != FEXCore::X86State::REG_INVALID) { A.Base = LoadGPRRegister(Operand.Data.SIB.Base, GPRSize); } // NOTE: VSIB cannot have the index * scale portion calculated ahead of time, // since the index in this case is a vector. So, we can't just apply the scale // to it, since this needs to be applied to each element in the index register // after said element has been sign extended. 
So, we pass this through for the // instruction implementation to handle. // // What we do handle though, is the applying the displacement value to // the base register (if a base register is provided), since this is a // part of the address calculation that can be done ahead of time. if (!IsVSIB && Operand.Data.SIB.Index != FEXCore::X86State::REG_INVALID) { A.Index = LoadGPRRegister(Operand.Data.SIB.Index, GPRSize); A.IndexScale = Operand.Data.SIB.Scale; } if (Operand.IsSIBRelocation()) { auto EPOffset = _EntrypointOffset(GPRSize, Operand.Data.SIB.Offset); if (A.Base) { A.Base = Add(GPRSize, EPOffset, A.Base); } else { A.Base = EPOffset; } } else { A.Offset = static_cast(Operand.Data.SIB.Offset); } A.NonTSO |= IsNonTSOReg(AccessType, Operand.Data.SIB.Base) || IsNonTSOReg(AccessType, Operand.Data.SIB.Index); } else if (Operand.IsLiteralRelocation()) { A.Base = _EntrypointOffset(GPRSize, Operand.Data.LiteralRelocation.EntrypointOffset); } else { LOGMAN_MSG_A_FMT("Unknown Src Type: {}\n", Operand.Type); } return A; } Ref OpDispatchBuilder::LoadSource_WithOpSize(RegClass Class, const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, IR::OpSize OpSize, uint32_t Flags, const LoadSourceOptions& Options) { auto [Align, LoadData, ForceLoad, AccessType, AllowUpperGarbage] = Options; AddressMode A = DecodeAddress(Op, Operand, AccessType, true /* IsLoad */); if (Operand.IsGPR()) { const auto gpr = Operand.Data.GPR.GPR; const auto highIndex = Operand.Data.GPR.HighBits ? 1 : 0; if (gpr >= FEXCore::X86State::REG_MM_0) { LOGMAN_THROW_A_FMT(OpSize == OpSize::i64Bit, "full"); if (MMXState != MMXState_MMX) { ChgStateX87_MMX(); } A.Base = LoadContext(OpSize::i64Bit, MM0Index + gpr - FEXCore::X86State::REG_MM_0); } else if (gpr >= FEXCore::X86State::REG_XMM_0) { const auto gprIndex = gpr - X86State::REG_XMM_0; // Load the full register size if it is a XMM register source. 
A.Base = LoadXMMRegister(gprIndex); // Now extract the subregister if it was a partial load /smaller/ than SSE size // TODO: Instead of doing the VMov implicitly on load, hunt down all use cases that require partial loads and do it after load. // We don't have information here to know if the operation needs zero upper bits or can contain data. if (!AllowUpperGarbage && OpSize < OpSize::i128Bit) { A.Base = _VMov(OpSize, A.Base); } } else { A.Base = LoadGPRRegister(gpr, OpSize, highIndex ? 8 : 0, AllowUpperGarbage); } } if ((IsOperandMem(Operand, true) && LoadData) || ForceLoad) { if (OpSize == OpSize::f80Bit) { Ref MemSrc = LoadEffectiveAddress(this, A, GetGPROpSize(), true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { return _LoadMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, MemSrc); } else { // For X87 extended doubles, Split the load. auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align); return _VLoadVectorElement(OpSize::i128Bit, OpSize::i16Bit, Res, 4, Add(OpSize::i64Bit, MemSrc, 8)); } } return _LoadMemAutoTSO(Class, OpSize, A, Align == OpSize::iInvalid ? OpSize : Align); } else { return LoadEffectiveAddress(this, A, GetGPROpSize(), false, AllowUpperGarbage); } } Ref OpDispatchBuilder::LoadGPRRegister(uint32_t GPR, IR::OpSize Size, uint8_t Offset, bool AllowUpperGarbage) { const auto GPRSize = GetGPROpSize(); if (Size == OpSize::iInvalid) { Size = GPRSize; } Ref Reg = LoadGPR(GPR); if ((!AllowUpperGarbage && (Size != GPRSize)) || Offset != 0) { // Extract the subregister if requested. 
const auto OpSize = std::max(OpSize::i32Bit, Size); if (AllowUpperGarbage) { Reg = _Lshr(OpSize, Reg, Constant(Offset)); } else { Reg = _Bfe(OpSize, IR::OpSizeAsBits(Size), Offset, Reg); } } return Reg; } void OpDispatchBuilder::StoreGPRRegister(uint32_t GPR, const Ref Src, IR::OpSize Size, uint8_t Offset) { const auto GPRSize = GetGPROpSize(); if (Size == OpSize::iInvalid) { Size = GPRSize; } Ref Reg = Src; if (Size != GPRSize || Offset != 0) { // Need to do an insert if not automatic size or zero offset. Reg = ARef(Reg).BfiInto(LoadGPRRegister(GPR), Offset, IR::OpSizeAsBits(Size)); } StoreRegister(GPR, false, Reg); } void OpDispatchBuilder::StoreXMMRegister(uint32_t XMM, const Ref Src) { StoreRegister(XMM, true, Src); } Ref OpDispatchBuilder::LoadSource(RegClass Class, const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, const LoadSourceOptions& Options) { const auto OpSize = OpSizeFromSrc(Op); return LoadSource_WithOpSize(Class, Op, Operand, OpSize, Flags, Options); } void OpDispatchBuilder::StoreResult_WithOpSize(RegClass Class, FEXCore::X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, IR::OpSize OpSize, IR::OpSize Align, MemoryAccessType AccessType) { if (Operand.IsGPR()) { // 8Bit and 16bit destination types store their result without effecting the upper bits // 32bit ops ZEXT the result to 64bit const auto GPRSize = GetGPROpSize(); const auto gpr = Operand.Data.GPR.GPR; if (gpr >= FEXCore::X86State::REG_MM_0) { LOGMAN_THROW_A_FMT(OpSize == OpSize::i64Bit, "full"); LOGMAN_THROW_A_FMT(Class == RegClass::FPR, "MMX is floaty"); if (MMXState != MMXState_MMX) { ChgStateX87_MMX(); } uint8_t Index = MM0Index + gpr - FEXCore::X86State::REG_MM_0; StoreContext(Index, Src); RegCache.Partial |= (1ull << (uint64_t)Index); } else if (gpr >= FEXCore::X86State::REG_XMM_0) { const auto gprIndex = gpr - X86State::REG_XMM_0; const auto VectorSize = GetGuestVectorLength(); auto Result = Src; if (OpSize != 
VectorSize) { // Partial writes can come from FPRs. // TODO: Fix the instructions doing partial writes rather than dealing with it here. LOGMAN_THROW_A_FMT(Class != RegClass::GPR, "Partial writes from GPR not allowed. Instruction: {}", Op->TableInfo->Name); // XMM-size is handled in implementations. if (VectorSize != OpSize::i256Bit || OpSize != OpSize::i128Bit) { auto SrcVector = LoadXMMRegister(gprIndex); Result = _VInsElement(VectorSize, OpSize, 0, 0, SrcVector, Src); } } StoreXMMRegister(gprIndex, Result); } else { if (GPRSize == OpSize::i64Bit && OpSize == OpSize::i32Bit) { // If the Source IR op is 64 bits, we need to zext the upper bits // For all other sizes, the upper bits are guaranteed to already be zero Ref Value = GetOpSize(Src) == OpSize::i64Bit ? ARef(Src).Bfe(0, 32).Ref() : Src; StoreGPRRegister(gpr, Value, GPRSize); LOGMAN_THROW_A_FMT(!Operand.Data.GPR.HighBits, "Can't handle 32bit store to high 8bit register"); } else { LOGMAN_THROW_A_FMT(!(GPRSize == OpSize::i32Bit && OpSize > OpSize::i32Bit), "Oops had a {} GPR load", OpSize); if (GPRSize != OpSize) { // if the GPR isn't the full size then we need to insert. // eg: // mov al, 2 ; Move in to lower 8-bits. // mov ah, 2 ; Move in to upper 8-bits of 16-bit reg. // mov ax, 2 ; Move in to lower 16-bits of reg. 
StoreGPRRegister(gpr, Src, OpSize, Operand.Data.GPR.HighBits * 8); } else { StoreGPRRegister(gpr, Src, std::min(GPRSize, OpSize)); } } } return; } AddressMode A = DecodeAddress(Op, Operand, AccessType, false /* IsLoad */); if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(this, A, GetGPROpSize(), true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { _StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, MemStoreDst); } else { // For X87 extended doubles, split before storing _StoreMemFPR(OpSize::i64Bit, MemStoreDst, Src, Align); auto Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 1); _StoreMemGPR(OpSize::i16Bit, Upper, MemStoreDst, Constant(8), std::min(Align, OpSize::i64Bit), MemOffsetType::SXTX, 1); } } else { _StoreMemAutoTSO(Class, OpSize, A, Src, Align == OpSize::iInvalid ? OpSize : Align); } } void OpDispatchBuilder::StoreResult(RegClass Class, X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, IR::OpSize Align, MemoryAccessType AccessType) { StoreResult_WithOpSize(Class, Op, Operand, Src, OpSizeFromDst(Op), Align, AccessType); } void OpDispatchBuilder::StoreResult(RegClass Class, X86Tables::DecodedOp Op, Ref Src, IR::OpSize Align, MemoryAccessType AccessType) { StoreResult(Class, Op, Op->Dest, Src, Align, AccessType); } OpDispatchBuilder::OpDispatchBuilder(FEXCore::Context::ContextImpl* ctx) : IREmitter {ctx->OpDispatcherAllocator, ctx->HostFeatures.SupportsTSOImm9} , CTX {ctx} { if (CTX->HostFeatures.SupportsAVX && CTX->HostFeatures.SupportsSVE256) { SaveAVXStateFunc = &OpDispatchBuilder::SaveAVXState; RestoreAVXStateFunc = &OpDispatchBuilder::RestoreAVXState; DefaultAVXStateFunc = &OpDispatchBuilder::DefaultAVXState; } else if (CTX->HostFeatures.SupportsAVX) { SaveAVXStateFunc = &OpDispatchBuilder::AVX128_SaveAVXState; RestoreAVXStateFunc = &OpDispatchBuilder::AVX128_RestoreAVXState; DefaultAVXStateFunc = &OpDispatchBuilder::AVX128_DefaultAVXState; } } void 
OpDispatchBuilder::ResetWorkingList() { IREmitter::ReownOrClaimBuffer(); JumpTargets.clear(); BlockSetRIP = false; DecodeFailure = false; ShouldDump = false; CurrentCodeBlock = nullptr; RegCache.Written = 0; RegCache.Cached = 0; } void OpDispatchBuilder::UnhandledOp(OpcodeArgs) { DecodeFailure = true; } void OpDispatchBuilder::MOVGPROp(OpcodeArgs, uint32_t SrcIndex) { // StoreResult will store with the same size as the input, so we allow upper // garbage on the input. The zero extension would be pointless. Ref Src = LoadSourceGPR(Op, Op->Src[SrcIndex], Op->Flags, {.Align = OpSize::i8Bit, .AllowUpperGarbage = true}); StoreResultGPR(Op, Src, OpSize::i8Bit); } void OpDispatchBuilder::MOVGPRNTOp(OpcodeArgs) { Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit}); StoreResultGPR(Op, Src, OpSize::i8Bit, MemoryAccessType::STREAM); } void OpDispatchBuilder::ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::IR::IROps AtomicFetchOp, unsigned SrcIdx) { // On x86, the canonical way to zero a register is XOR with itself. Detect and // emit optimal arm64 assembly. if (!DestIsLockedMem(Op) && ALUIROp == FEXCore::IR::IROps::OP_XOR && Op->Dest.IsGPR() && Op->Src[SrcIdx].IsGPR() && Op->Dest.Data.GPR == Op->Src[SrcIdx].Data.GPR) { // Set flags for zero result with inverted carry. We subtract an arbitrary // register from itself to get the zero, since `subs wzr, #0` is not // encodable. This is optimal and works regardless of the opsize. 
auto Zero = LoadGPR(Op->Dest.Data.GPR.GPR); HandleNZ00Write(); InvalidateAF(); CalculatePF(SubWithFlags(OpSize::i32Bit, Zero, Zero)); CFInverted = true; FlushRegisterCache(); // Move 0 into the register StoreResultGPR(Op, Constant(0)); return; } auto Size = OpSizeFromDst(Op); auto ResultSize = Size; auto RoundedSize = Size; if (ALUIROp != FEXCore::IR::IROps::OP_ANDWITHFLAGS) { RoundedSize = std::max(OpSize::i32Bit, RoundedSize); } // X86 basic ALU ops just do the operation between the destination and a single source Ref Src = LoadSourceGPR(Op, Op->Src[SrcIdx], Op->Flags, {.AllowUpperGarbage = true}); // Try to eliminate the masking after 8/16-bit operations with constants, by // promoting to a full size operation that preserves the upper bits. uint64_t Const; bool IsConst = IsValueConstant(WrapNode(Src), &Const); if (Size < OpSize::i32Bit && !DestIsLockedMem(Op) && Op->Dest.IsGPR() && !Op->Dest.Data.GPR.HighBits && IsConst && (ALUIROp == IR::IROps::OP_XOR || ALUIROp == IR::IROps::OP_OR || ALUIROp == IR::IROps::OP_ANDWITHFLAGS)) { RoundedSize = ResultSize = GetGPROpSize(); LOGMAN_THROW_A_FMT(Const < (1ull << IR::OpSizeAsBits(Size)), "does not clobber"); // For AND, we can play the same trick but we instead need the upper bits of // the constant to be all-1s instead of all-0s to preserve. We also can't // use andwithflags in this case, since we've promoted to 64-bit so the // negate flag would be wrong, but using the regular logical operation path // instead still ends up a net win for uops. // // In the common case where the constant is of the form (1 << x) - 1, the // adjusted constant here will inline into the arm64 and instruction, so if // flags are not needed, we save an instruction overall. 
if (ALUIROp == IR::IROps::OP_ANDWITHFLAGS) { Src = Constant(Const | ~((1ull << IR::OpSizeAsBits(Size)) - 1)); ALUIROp = IR::IROps::OP_AND; } } Ref Result {}; Ref Dest {}; if (DestIsLockedMem(Op)) { HandledLock = true; Ref DestMem = MakeSegmentAddress(Op, Op->Dest); DeriveOp(FetchOp, AtomicFetchOp, _AtomicFetchAdd(Size, Src, DestMem)); Dest = FetchOp; } else { Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags, {.AllowUpperGarbage = true}); } const auto OpSize = RoundedSize; uint64_t Mask = Size == OpSize::i64Bit ? ~0ull : ((1ull << IR::OpSizeAsBits(Size)) - 1); if (IsConst && Const == Mask && !DestIsLockedMem(Op) && ALUIROp == IR::IROps::OP_XOR && Size >= OpSize::i32Bit) { Result = _Not(OpSize, Dest); } else if (IsConst && Const == Mask && !DestIsLockedMem(Op) && ALUIROp == IR::IROps::OP_AND) { Result = Dest; } else { DeriveOp(ALUOp, ALUIROp, _AndWithFlags(OpSize, Dest, Src)); Result = ALUOp; } // Flags set switch (ALUIROp) { case FEXCore::IR::IROps::OP_ADD: Result = CalculateFlags_ADD(Size, Dest, Src); break; case FEXCore::IR::IROps::OP_SUB: Result = CalculateFlags_SUB(Size, Dest, Src); break; case FEXCore::IR::IROps::OP_XOR: case FEXCore::IR::IROps::OP_AND: case FEXCore::IR::IROps::OP_OR: { CalculateFlags_Logical(Size, Result); break; } case FEXCore::IR::IROps::OP_ANDWITHFLAGS: { HandleNZ00Write(); CalculatePF(Result); InvalidateAF(); break; } default: break; } if (!DestIsLockedMem(Op)) { StoreResultGPR_WithOpSize(Op, Op->Dest, Result, ResultSize, OpSize::iInvalid, MemoryAccessType::DEFAULT); } } void OpDispatchBuilder::LSLOp(OpcodeArgs) { // Emulate by always returning failure, this deviates from both Linux and Windows but // shouldn't be depended on by anything. 
SetRFLAG(Constant(0)); } void OpDispatchBuilder::INTOp(OpcodeArgs) { IR::BreakDefinition Reason; bool SetRIPToNext = false; switch (Op->OP) { case 0xCD: { // INT imm8 uint8_t Literal = Op->Src[0].Literal(); #ifndef _WIN32 constexpr uint8_t SYSCALL_LITERAL = 0x80; if (Literal == SYSCALL_LITERAL) { if (Is64BitMode) [[unlikely]] { LogMan::Msg::EFmt("[Unsupported] Trying to execute 32-bit syscall from a 64-bit process."); UnhandledOp(Op); return; } // Syscall on linux SyscallOp(Op, false); return; } #else constexpr uint8_t SYSCALL_LITERAL = 0x2E; if (Literal == SYSCALL_LITERAL) { // Can be used for both 64-bit and 32-bit syscalls on windows SyscallOp(Op, false); return; } #endif #ifdef ARCHITECTURE_arm64ec // This is used when QueryPerformanceCounter is called on recent Windows versions, it causes CNTVCT to be written into RAX. constexpr uint8_t GET_CNTVCT_LITERAL = 0x81; if (Literal == GET_CNTVCT_LITERAL) { StoreGPRRegister(X86State::REG_RAX, _CycleCounter(false)); return; } #endif Reason.ErrorRegister = Literal << 3 | (0b010); Reason.Signal = Core::FAULT_SIGSEGV; // GP is raised when task-gate isn't setup to be valid Reason.TrapNumber = X86State::X86_TRAPNO_GP; Reason.si_code = 0x80; break; } case 0xCE: // INTO Reason.ErrorRegister = 0; Reason.Signal = Core::FAULT_SIGSEGV; Reason.TrapNumber = X86State::X86_TRAPNO_OF; Reason.si_code = 0x80; break; case 0xF1: // INT1 Reason.ErrorRegister = 0; Reason.Signal = Core::FAULT_SIGTRAP; Reason.TrapNumber = X86State::X86_TRAPNO_DB; Reason.si_code = 1; SetRIPToNext = true; break; case 0xF4: { // HLT Reason.ErrorRegister = 0; Reason.Signal = Core::FAULT_SIGSEGV; Reason.TrapNumber = X86State::X86_TRAPNO_GP; Reason.si_code = 0x80; break; } case 0x0B: // UD2 Reason.ErrorRegister = 0; Reason.Signal = Core::FAULT_SIGILL; Reason.TrapNumber = X86State::X86_TRAPNO_UD; Reason.si_code = 2; break; case 0xCC: // INT3 Reason.ErrorRegister = 0; Reason.Signal = Core::FAULT_SIGTRAP; Reason.TrapNumber = X86State::X86_TRAPNO_BP; Reason.si_code = 
0x80; SetRIPToNext = true; break; default: FEX_UNREACHABLE; } // Calculate flags early. FlushRegisterCache(); const auto GPRSize = GetGPROpSize(); if (SetRIPToNext) { BlockSetRIP = SetRIPToNext; // We want to set RIP to the next instruction after INT3/INT1 auto NewRIP = GetRelocatedPC(Op); _StoreContextGPR(GPRSize, NewRIP, offsetof(FEXCore::Core::CPUState, rip)); } else if (Op->OP != 0xCE) { auto NewRIP = GetRelocatedPC(Op, -Op->InstSize); _StoreContextGPR(GPRSize, NewRIP, offsetof(FEXCore::Core::CPUState, rip)); } if (Op->OP == 0xCE) { // Conditional to only break if Overflow == 1 CalculateDeferredFlags(); // If condition doesn't hold then keep going // CondClass::FNU means OF == 0 auto CondJump_ = CondJumpNZCV(CondClass::FNU); auto FalseBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); SetFalseJumpTarget(CondJump_, FalseBlock); SetCurrentCodeBlock(FalseBlock); StartNewBlock(); auto NewRIP = GetRelocatedPC(Op); _StoreContextGPR(GPRSize, NewRIP, offsetof(FEXCore::Core::CPUState, rip)); Break(Reason); // Make sure to start a new block after ending this one auto JumpTarget = CreateNewCodeBlockAfter(FalseBlock); SetTrueJumpTarget(CondJump_, JumpTarget); SetCurrentCodeBlock(JumpTarget); StartNewBlock(); } else { BlockSetRIP = true; Break(Reason); } } void OpDispatchBuilder::TZCNT(OpcodeArgs) { // _FindTrailingZeroes ignores upper garbage so we don't need to mask Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); Src = _FindTrailingZeroes(OpSizeFromSrc(Op), Src); StoreResultGPR(Op, Src); CalculateFlags_ZCNT(OpSizeFromSrc(Op), Src); } void OpDispatchBuilder::LZCNT(OpcodeArgs) { // _CountLeadingZeroes clears upper garbage so we don't need to mask Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Res = _CountLeadingZeroes(OpSizeFromSrc(Op), Src); StoreResultGPR(Op, Res); CalculateFlags_ZCNT(OpSizeFromSrc(Op), Res); } void OpDispatchBuilder::MOVBEOp(OpcodeArgs) { const auto GPRSize = GetGPROpSize(); const 
auto SrcSize = OpSizeFromSrc(Op); Ref Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit}); if (DestIsMem(Op) || SrcSize != OpSize::i16Bit) { Src = _Rev(SrcSize, Src); StoreResultGPR(Op, Op->Dest, Src); } else { Src = _Rev(std::max(OpSize::i32Bit, SrcSize), Src); // 16-bit does an insert. // Rev of 16-bit value as 32-bit replaces the result in the upper 16-bits of the result. // bfxil the 16-bit result in to the GPR. Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GPRSize, Op->Flags); auto Result = _Bfxil(GPRSize, 16, 16, Dest, Src); StoreResultGPR_WithOpSize(Op, Op->Dest, Result, GPRSize); } } void OpDispatchBuilder::CLWBOrTPause(OpcodeArgs) { if (DestIsMem(Op)) { Ref DestMem = MakeSegmentAddress(Op, Op->Dest); _CacheLineClean(DestMem); } else { if (!CTX->HostFeatures.SupportsWFXT) { UnimplementedOp(Op); } else { auto RAX = LoadGPRRegister(X86State::REG_RAX); auto RDX = LoadGPRRegister(X86State::REG_RDX); // Incoming source register is unused. _WFET(RDX, RAX); // OF, SF, ZF, AF, PF, CF all zero. // CF is used if the OS deadline is set, which we don't do anything with. 
ZeroPF_AF(); ZeroNZCV(); } } } void OpDispatchBuilder::CLFLUSHOPT(OpcodeArgs) { Ref DestMem = MakeSegmentAddress(Op, Op->Dest); _CacheLineClear(DestMem, false); } void OpDispatchBuilder::LoadFenceOrXRSTOR(OpcodeArgs) { // 0xE8 signifies LFENCE if (Op->ModRM == 0xE8) { _Fence(FenceType::Load); } else { XRstorOpImpl(Op); } } void OpDispatchBuilder::MemFenceOrXSAVEOPT(OpcodeArgs) { if (Op->ModRM == 0xF0) { // 0xF0 is MFENCE _Fence(FenceType::LoadStore); } else { XSaveOp(Op); } } void OpDispatchBuilder::StoreFenceOrCLFlush(OpcodeArgs) { if (Op->ModRM == 0xF8) { // 0xF8 is SFENCE _Fence(FenceType::Store); } else { // This is a CLFlush Ref DestMem = MakeSegmentAddress(Op, Op->Dest); _CacheLineClear(DestMem, true); } } void OpDispatchBuilder::UMonitorOrCLRSSBSY(OpcodeArgs) { if (DestIsMem(Op) || !CTX->HostFeatures.SupportsWFXT) { // CLRSSBSY UnimplementedOp(Op); } else { // Explicit NOP implementation of umonitor. } } void OpDispatchBuilder::UMWaitOp(OpcodeArgs) { if (DestIsMem(Op) || !CTX->HostFeatures.SupportsWFXT) { UnimplementedOp(Op); } else { // Explicit NOP implementation of umwait. // Still zero flags. // // OF, SF, ZF, AF, PF, CF all zero. 
ZeroPF_AF(); ZeroNZCV(); } } void OpDispatchBuilder::CLZeroOp(OpcodeArgs) { if (!CTX->HostFeatures.SupportsCLZERO) { UnimplementedOp(Op); return; } Ref DestMem = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.LoadData = false}); _CacheLineZero(DestMem); } void OpDispatchBuilder::Prefetch(OpcodeArgs, bool ForStore, bool Stream, uint8_t Level) { Ref DestMem = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.LoadData = false}); _Prefetch(ForStore, Stream, Level, DestMem, Invalid(), MemOffsetType::SXTX, 1); } void OpDispatchBuilder::RDTSCPOp(OpcodeArgs) { // RDTSCP is slightly different than RDTSC // IA32_TSC_AUX is returned in RCX // All previous loads are globally visible // - Explicitly does not wait for stores to be globally visible // - Explicitly use an MFENCE before this instruction if you want this behaviour // This instruction is not an execution fence, so subsequent instructions can execute after this // - Explicitly use an LFENCE after RDTSCP if you want to block this behaviour auto Counter = CycleCounter(true); auto ID = _ProcessorID(); StoreGPRRegister(X86State::REG_RAX, Counter.CounterLow); StoreGPRRegister(X86State::REG_RCX, ID); StoreGPRRegister(X86State::REG_RDX, Counter.CounterHigh); } void OpDispatchBuilder::RDPIDOp(OpcodeArgs) { StoreResultGPR(Op, _ProcessorID()); } void OpDispatchBuilder::CRC32(OpcodeArgs) { if (!CTX->HostFeatures.SupportsCRC) { UnimplementedOp(Op); return; } const auto GPRSize = GetGPROpSize(); // Destination GPR size is always 4 or 8 bytes depending on widening const auto DstSize = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REX_WIDENING ? 
OpSize::i64Bit : OpSize::i32Bit; Ref Dest = LoadSourceGPR_WithOpSize(Op, Op->Dest, GPRSize, Op->Flags); // Incoming memory is 8, 16, 32, or 64 Ref Src {}; if (Op->Src[0].IsGPR()) { Src = LoadSourceGPR_WithOpSize(Op, Op->Src[0], GPRSize, Op->Flags); } else { Src = LoadSourceGPR(Op, Op->Src[0], Op->Flags, {.Align = OpSize::i8Bit}); } auto Result = _CRC32(Dest, Src, OpSizeFromSrc(Op)); StoreResultGPR_WithOpSize(Op, Op->Dest, Result, DstSize); } template void OpDispatchBuilder::RDRANDOp(OpcodeArgs) { if (!CTX->HostFeatures.SupportsRAND) { UnimplementedOp(Op); return; } StoreResultGPR(Op, _RDRAND(Reseed)); // If the rng number is valid then NZCV is 0b0000, otherwise NZCV is 0b0100 auto CF_inv = GetRFLAG(X86State::RFLAG_ZF_RAW_LOC); // OF, SF, ZF, AF, PF all zero. CF indicates if valid. ZeroPF_AF(); if (!CTX->HostFeatures.SupportsFlagM) { ZeroNZCV(); SetCFInverted(CF_inv); } else { // Accelerated path. Invalid is 0 or 1, so set NZCV with a single rmif. HandleNZCVWrite(); _RmifNZCV(CF_inv, (64 - 1) /* rotate bit 0 into bit 1 = C */, 0xf); CFInverted = true; } } template void OpDispatchBuilder::RDRANDOp(OpcodeArgs); template void OpDispatchBuilder::RDRANDOp(OpcodeArgs); void OpDispatchBuilder::BreakOp(OpcodeArgs, FEXCore::IR::BreakDefinition BreakDefinition) { const auto GPRSize = GetGPROpSize(); // We don't actually support this instruction // Multiblock may hit it though _StoreContextGPR(GPRSize, GetRelocatedPC(Op, -Op->InstSize), offsetof(FEXCore::Core::CPUState, rip)); Break(BreakDefinition); if (Multiblock) { auto NextBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); SetCurrentCodeBlock(NextBlock); StartNewBlock(); } else { BlockSetRIP = true; } } void OpDispatchBuilder::UnimplementedOp(OpcodeArgs) { BreakOp(Op, FEXCore::IR::BreakDefinition { .ErrorRegister = 0, .Signal = SIGILL, .TrapNumber = X86State::X86_TRAPNO_UD, .si_code = 2, ///< ILL_ILLOPN }); } void OpDispatchBuilder::PermissionRestrictedOp(OpcodeArgs) { BreakOp(Op, FEXCore::IR::BreakDefinition { 
.ErrorRegister = 0, .Signal = SIGSEGV, .TrapNumber = X86State::X86_TRAPNO_GP, .si_code = 0x80, }); } void OpDispatchBuilder::InvalidOp(OpcodeArgs) { BreakOp(Op, FEXCore::IR::BreakDefinition { .ErrorRegister = 0, .Signal = SIGILL, .TrapNumber = 0, .si_code = 0, }); } void OpDispatchBuilder::NoExecOp(OpcodeArgs) { BreakOp(Op, FEXCore::IR::BreakDefinition { .ErrorRegister = X86State::X86_PF_PROT | X86State::X86_PF_USER | X86State::X86_PF_INSTR, .Signal = Core::FAULT_SIGSEGV, .TrapNumber = X86State::X86_TRAPNO_PF, .si_code = 2, // SEGV_ACCERR }); } #undef OpcodeArgs } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/OpcodeDispatcher.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/Core/Frontend.h" #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/Addressing.h" #include "Interface/Context/Context.h" #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/RegisterAllocationData.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore::IR { enum class VectorCompareType { // SSE comparisons. EQ_OQ = 0, LT_OS = 1, LE_OS = 2, UNORD_Q = 3, NEQ_UQ = 4, NLT_US = 5, NLE_US = 6, ORD_Q = 7, // AVX-only comparisons. 
EQ_UQ = 8, NGE_US = 9, NGT_US = 10, FALSE_OQ = 11, NEQ_OQ = 12, GE_OS = 13, GT_OS = 14, TRUE_UQ = 15, EQ_OS = 16, LT_OQ = 17, LE_OQ = 18, UNORD_S = 19, NEQ_US = 20, NLT_UQ = 21, NLE_UQ = 22, ORD_S = 23, EQ_US = 24, NGE_UQ = 25, NGT_UQ = 26, FALSE_OS = 27, NEQ_OS = 28, GE_OQ = 29, GT_OQ = 30, TRUE_US = 31, }; enum class MemoryAccessType { // Choose TSO or Non-TSO depending on access type DEFAULT, // TSO access behaviour TSO, // Non-TSO access behaviour NONTSO, // Non-temporal streaming STREAM, }; enum class BTAction { BTNone, BTClear, BTSet, BTComplement, }; enum class ForceTSOMode { NoOverride, ForceDisabled, ForceEnabled, }; struct LoadSourceOptions { // Alignment of the load in bytes. iInvalid signifies opsize aligned. IR::OpSize Align = OpSize::iInvalid; // Whether or not to load the data if a memory access occurs. // If set to false, then the address that would have been loaded from // will be returned instead. // // Note: If returning the address, make sure to apply the segment offset // after with AppendSegmentOffset(). // bool LoadData = true; // Use to force a load even if the underlying type isn't loadable. bool ForceLoad = false; // Specifies the access type of the load. MemoryAccessType AccessType = MemoryAccessType::DEFAULT; // Whether or not a zero extend should clear the upper bits // in the register (e.g. an 8-bit load would clear the upper 24 bits // or 56 bits depending on the operating mode). // If true, no zero-extension occurs. 
bool AllowUpperGarbage = false; }; struct DispatchTableEntry { uint16_t Op; uint8_t Count; X86Tables::OpDispatchPtr Ptr; }; class OpDispatchBuilder final : public IREmitter { public: Ref GetNewJumpBlock(uint64_t RIP) { auto it = JumpTargets.find(RIP); LOGMAN_THROW_A_FMT(it != JumpTargets.end(), "Couldn't find block generated for 0x{:x}", RIP); return it->second.BlockEntry; } void SetNewBlockIfChanged(uint64_t RIP) { auto it = JumpTargets.find(RIP); if (it == JumpTargets.end()) { return; } it->second.HaveEmitted = true; if (CurrentCodeBlock->Wrapped(DualListData.ListBegin()).ID() == it->second.BlockEntry->Wrapped(DualListData.ListBegin()).ID()) { return; } // We have hit a RIP that is a jump target // Thus we need to end up in a new block SetCurrentCodeBlock(it->second.BlockEntry); } void StartNewBlock() { // If we loaded flags but didn't change them, invalidate the cached copy and move on. // Changes get stored out by CalculateDeferredFlags. CachedNZCV = nullptr; CFInverted = CFInvertedABI; FlushRegisterCache(); // Start block in X87 state. // This is important to ensure that blocks always start with the same state independently of predecessors // which allows independent compilation of blocks. // Starting in the X87 state is better than starting in MMX state because // MMX state is more work to initialize. MMXState = MMXState_X87; // New block needs to reset segment telemetry. SegmentsNeedReadCheck = ~0U; // Need to clear any named constants that were cached. 
ClearCachedNamedConstants(); } IRPair Jump() { FlushRegisterCache(); return _Jump(); } IRPair Jump(Ref _TargetBlock) { FlushRegisterCache(); return _Jump(_TargetBlock); } IRPair CondJump(Ref _Cmp1, Ref _Cmp2, Ref _TrueBlock, Ref _FalseBlock, CondClass _Cond = CondClass::NEQ, IR::OpSize _CompareSize = OpSize::iInvalid) { FlushRegisterCache(); return _CondJump(_Cmp1, _Cmp2, _TrueBlock, _FalseBlock, _Cond, _CompareSize); } IRPair CondJump(Ref ssa0, CondClass cond = CondClass::NEQ) { FlushRegisterCache(); return _CondJump(ssa0, cond); } IRPair CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClass cond = CondClass::NEQ) { FlushRegisterCache(); return _CondJump(ssa0, ssa1, ssa2, cond); } IRPair CondJumpNZCV(CondClass Cond) { FlushRegisterCache(); return _CondJump(InvalidNode, InvalidNode, InvalidNode, InvalidNode, Cond, OpSize::iInvalid, true); } IRPair CondJumpBit(Ref Src, unsigned Bit, bool Set) { FlushRegisterCache(); auto InlineConst = _InlineConstant(Bit); auto Cond = Set ? CondClass::TSTNZ : CondClass::TSTZ; return _CondJump(Src, InlineConst, InvalidNode, InvalidNode, Cond, OpSize::iInvalid, false); } IRPair ExitFunction(Ref NewRIP, BranchHint Hint = BranchHint::None) { FlushRegisterCache(); return _ExitFunction(GetOpSize(NewRIP), NewRIP, Hint, InvalidNode, InvalidNode); } IRPair ExitFunction(Ref NewRIP, BranchHint Hint, Ref CallReturnAddress, Ref CallReturnBlock) { FlushRegisterCache(); return _ExitFunction(GetOpSize(NewRIP), NewRIP, Hint, CallReturnAddress, CallReturnBlock); } IRPair Break(BreakDefinition Reason) { FlushRegisterCache(); return _Break(Reason); } IRPair Thunk(Ref ArgPtr, SHA256Sum ThunkNameHash) { FlushRegisterCache(); return _Thunk(ArgPtr, ThunkNameHash); } bool FinishOp(uint64_t NextRIP, bool LastOp) { // If we are switching to a new block and this current block has yet to set a RIP // Then we need to insert an unconditional jump from the current block to the one we are going to // This happens most frequently when an instruction jumps backwards to 
another location // eg: // // nop dword [rax], eax // .label: // rdi, 0x8 // cmp qword [rdi-8], 0 // jne .label if (LastOp && !BlockSetRIP) { auto it = JumpTargets.find(NextRIP); if (it == JumpTargets.end()) { const auto GPRSize = GetGPROpSize(); // If we don't have a jump target to a new block then we have to leave // Set the RIP to the next instruction and leave ExitFunction(_InlineEntrypointOffset(GPRSize, NextRIP - Entry)); } else if (it != JumpTargets.end()) { Jump(it->second.BlockEntry); return true; } } BlockSetRIP = false; return false; } static bool CanHaveSideEffects(const FEXCore::X86Tables::X86InstInfo* TableInfo, FEXCore::X86Tables::DecodedOp Op) { if (TableInfo) { if (TableInfo->Flags & X86Tables::InstFlags::FLAGS_DEBUG_MEM_ACCESS) { // If it is marked as having memory access then always say it has a side-effect. // Not always true but better to be safe. return true; } if (TableInfo->Flags & (X86Tables::InstFlags::FLAGS_SETS_RIP | X86Tables::InstFlags::FLAGS_BLOCK_END)) { // Cooperative suspend interrupts can be triggered at any back-edge, the RIP must be reconstructed correctly in such cases return true; } } auto CanHaveSideEffects = false; auto HasPotentialMemoryAccess = [](const X86Tables::DecodedOperand& Operand) -> bool { if (Operand.IsNone()) { return false; } // This isn't guaranteed that all of these types will access memory, but be safe. return Operand.IsGPRDirect() || Operand.IsGPRIndirect() || Operand.IsRIPRelative() || Operand.IsSIB(); }; CanHaveSideEffects |= HasPotentialMemoryAccess(Op->Dest); CanHaveSideEffects |= HasPotentialMemoryAccess(Op->Src[0]); CanHaveSideEffects |= HasPotentialMemoryAccess(Op->Src[1]); CanHaveSideEffects |= HasPotentialMemoryAccess(Op->Src[2]); return CanHaveSideEffects; } template void ForeachDirection(F&& Routine) { // Otherwise, prepare to branch. auto Zero = Constant(0); // If the shift is zero, do not touch the flags. 
auto ForwardBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); auto BackwardBlock = CreateNewCodeBlockAfter(ForwardBlock); auto ExitBlock = CreateNewCodeBlockAfter(BackwardBlock); auto DF = GetRFLAG(X86State::RFLAG_DF_RAW_LOC); CondJump(DF, Zero, ForwardBlock, BackwardBlock, CondClass::EQ); for (auto D = 0; D < 2; ++D) { SetCurrentCodeBlock(D ? BackwardBlock : ForwardBlock); StartNewBlock(); { Routine(D ? -1 : 1); Jump(ExitBlock); } } SetCurrentCodeBlock(ExitBlock); StartNewBlock(); } OpDispatchBuilder(FEXCore::Context::ContextImpl* ctx); // Should only be called at the start of IR Emission. void ResetWorkingList(); void ResetDecodeFailure() { NeedsBlockEnd = DecodeFailure = false; } bool HadDecodeFailure() const { return DecodeFailure; } bool NeedsBlockEnder() const { return NeedsBlockEnd; } void ResetHandledLock() { HandledLock = false; } bool HasHandledLock() const { return HandledLock; } void SetForceTSO(ForceTSOMode Mode) { ForceTSO = Mode; } ForceTSOMode GetForceTSO() const { return ForceTSO; } void SetDumpIR(bool DumpIR) { ShouldDump = DumpIR; } bool ShouldDumpIR() const { return ShouldDump; } void BeginFunction(uint64_t RIP, const fextl::vector* Blocks, uint32_t NumInstructions, bool Is64BitMode, bool MonoBackpatcherBlock); void Finalize(); // Dispatch builder functions #define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op /** * Binds a sequence of compile-time constants as arguments to another member function. * This allows to construct OpDispatchPtrs that are specialized for the given set of arguments. 
*/ template void Bind(OpcodeArgs) { [[clang::noinline]] (this->*Fn)(Op, Args...); }; void UnhandledOp(OpcodeArgs); void MOVGPROp(OpcodeArgs, uint32_t SrcIndex); void MOVGPRNTOp(OpcodeArgs); void MOVVectorAlignedOp(OpcodeArgs); void MOVVectorUnalignedOp(OpcodeArgs); void MOVVectorNTOp(OpcodeArgs); void ALUOp(OpcodeArgs, FEXCore::IR::IROps ALUIROp, FEXCore::IR::IROps AtomicFetchOp, unsigned SrcIdx); void LSLOp(OpcodeArgs); void INTOp(OpcodeArgs); void SyscallOp(OpcodeArgs, bool IsSyscallInst); void ThunkOp(OpcodeArgs); void LEAOp(OpcodeArgs); void NOPOp(OpcodeArgs); void RETOp(OpcodeArgs); void IRETOp(OpcodeArgs); void CallbackReturnOp(OpcodeArgs); void SecondaryALUOp(OpcodeArgs); void ADCOp(OpcodeArgs, uint32_t SrcIndex); void SBBOp(OpcodeArgs, uint32_t SrcIndex); void SALCOp(OpcodeArgs); void PUSHOp(OpcodeArgs); void PUSHREGOp(OpcodeArgs); void PUSHAOp(OpcodeArgs); void PUSHSegmentOp(OpcodeArgs, uint32_t SegmentReg); void POPOp(OpcodeArgs); void POPAOp(OpcodeArgs); void POPSegmentOp(OpcodeArgs, uint32_t SegmentReg); void LEAVEOp(OpcodeArgs); void CALLOp(OpcodeArgs); void CALLAbsoluteOp(OpcodeArgs); void CondJUMPOp(OpcodeArgs); void CondJUMPRCXOp(OpcodeArgs); void LoopOp(OpcodeArgs); void JUMPOp(OpcodeArgs); void JUMPAbsoluteOp(OpcodeArgs); void JUMPFARIndirectOp(OpcodeArgs); void CALLFARIndirectOp(OpcodeArgs); void RETFARIndirectOp(OpcodeArgs); void TESTOp(OpcodeArgs, uint32_t SrcIndex); void ARPLOp(OpcodeArgs); void MOVSXDOp(OpcodeArgs); void MOVSXOp(OpcodeArgs); void MOVZXOp(OpcodeArgs); void CMPOp(OpcodeArgs, uint32_t SrcIndex); void SETccOp(OpcodeArgs); void CQOOp(OpcodeArgs); void CDQOp(OpcodeArgs); void XCHGOp(OpcodeArgs); void SAHFOp(OpcodeArgs); void LAHFOp(OpcodeArgs); void MOVSegOp(OpcodeArgs, bool ToSeg); void FLAGControlOp(OpcodeArgs); void MOVOffsetOp(OpcodeArgs); void CMOVOp(OpcodeArgs); void CPUIDOp(OpcodeArgs); void XGetBVOp(OpcodeArgs); uint32_t GetConstantShift(X86Tables::DecodedOp Op, bool Is1Bit); void SHLOp(OpcodeArgs); void 
SHLImmediateOp(OpcodeArgs, bool SHL1Bit); void SHROp(OpcodeArgs); void SHRImmediateOp(OpcodeArgs, bool SHR1Bit); void SHLDOp(OpcodeArgs); void SHLDImmediateOp(OpcodeArgs); void SHRDOp(OpcodeArgs); void SHRDImmediateOp(OpcodeArgs); void ASHROp(OpcodeArgs, bool IsImmediate, bool Is1Bit); void RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool Is1Bit); void RCROp1Bit(OpcodeArgs); void RCROp8x1Bit(OpcodeArgs); void RCROp(OpcodeArgs); void RCRSmallerOp(OpcodeArgs); void RCLOp1Bit(OpcodeArgs); void RCLOp(OpcodeArgs); void RCLSmallerOp(OpcodeArgs); void BTOp(OpcodeArgs, uint32_t SrcIndex, enum BTAction Action); void IMUL1SrcOp(OpcodeArgs); void IMUL2SrcOp(OpcodeArgs); void IMULOp(OpcodeArgs); void STOSOp(OpcodeArgs); void MOVSOp(OpcodeArgs); void CMPSOp(OpcodeArgs); void LODSOp(OpcodeArgs); void SCASOp(OpcodeArgs); void BSWAPOp(OpcodeArgs); void PUSHFOp(OpcodeArgs); void POPFOp(OpcodeArgs); struct CycleCounterPair { Ref CounterLow; Ref CounterHigh; }; CycleCounterPair CycleCounter(bool SelfSynchronizingLoads); void RDTSCOp(OpcodeArgs); void INCOp(OpcodeArgs); void DECOp(OpcodeArgs); void NEGOp(OpcodeArgs); void DIVOp(OpcodeArgs); void IDIVOp(OpcodeArgs); void BSFOp(OpcodeArgs); void BSROp(OpcodeArgs); void CMPXCHGOp(OpcodeArgs); void CMPXCHGPairOp(OpcodeArgs); void MULOp(OpcodeArgs); void NOTOp(OpcodeArgs); void XADDOp(OpcodeArgs); void PopcountOp(OpcodeArgs); void DAAOp(OpcodeArgs); void DASOp(OpcodeArgs); void AAAOp(OpcodeArgs); void AASOp(OpcodeArgs); void AAMOp(OpcodeArgs); void AADOp(OpcodeArgs); void XLATOp(OpcodeArgs); template void RDRANDOp(OpcodeArgs); enum class Segment { FS, GS, }; void ReadSegmentReg(OpcodeArgs, Segment Seg); void WriteSegmentReg(OpcodeArgs, Segment Seg); void EnterOp(OpcodeArgs); void SGDTOp(OpcodeArgs); void SIDTOp(OpcodeArgs); void SMSWOp(OpcodeArgs); enum class VectorOpType { MMX, SSE, AVX, }; // SSE void MOVLPOp(OpcodeArgs); void MOVHPDOp(OpcodeArgs); void MOVSDOp(OpcodeArgs); void MOVSSOp(OpcodeArgs); void VectorALUOp(OpcodeArgs, 
IROps IROp, IR::OpSize ElementSize); void VectorXOROp(OpcodeArgs); void VectorALUROp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void VectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void RSqrt3DNowOp(OpcodeArgs, bool Duplicate); template void VectorUnaryDuplicateOp(OpcodeArgs); void MOVQOp(OpcodeArgs, VectorOpType VectorType); void MOVQMMXOp(OpcodeArgs); void MOVMSKOp(OpcodeArgs, IR::OpSize ElementSize); void MOVMSKOpOne(OpcodeArgs); void PUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize); void PUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize); void PSHUFBOp(OpcodeArgs); Ref PShufWLane(IR::OpSize Size, FEXCore::IR::IndexNamedVectorConstant IndexConstant, bool LowLane, Ref IncomingLane, uint8_t Shuffle); void PSHUFWOp(OpcodeArgs, bool Low); void PSHUFW8ByteOp(OpcodeArgs); void PSHUFDOp(OpcodeArgs); void PSRLDOp(OpcodeArgs, IR::OpSize ElementSize); void PSRLI(OpcodeArgs, IR::OpSize ElementSize); void PSLLI(OpcodeArgs, IR::OpSize ElementSize); void PSLL(OpcodeArgs, IR::OpSize ElementSize); void PSRAOp(OpcodeArgs, IR::OpSize ElementSize); void PSRLDQ(OpcodeArgs); void PSLLDQ(OpcodeArgs); void PSRAIOp(OpcodeArgs, IR::OpSize ElementSize); void MOVDDUPOp(OpcodeArgs); template void CVTGPR_To_FPR(OpcodeArgs); template void CVTFPR_To_GPR(OpcodeArgs); template void Vector_CVT_Int_To_Float(OpcodeArgs); template void Scalar_CVT_Float_To_Float(OpcodeArgs); void Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, bool IsAVX); template void Vector_CVT_Float_To_Int(OpcodeArgs); void MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs); template void XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); void MASKMOVOp(OpcodeArgs); void MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType); void TZCNT(OpcodeArgs); void LZCNT(OpcodeArgs); template void VFCMPOp(OpcodeArgs); void SHUFOp(OpcodeArgs, IR::OpSize ElementSize); template void PINSROp(OpcodeArgs); void InsertPSOp(OpcodeArgs); void PExtrOp(OpcodeArgs, IR::OpSize ElementSize); template 
void PSIGN(OpcodeArgs); template void VPSIGN(OpcodeArgs); // BMI1 Ops void ANDNBMIOp(OpcodeArgs); void BEXTRBMIOp(OpcodeArgs); void BLSIBMIOp(OpcodeArgs); void BLSMSKBMIOp(OpcodeArgs); void BLSRBMIOp(OpcodeArgs); // BMI2 Ops void BMI2Shift(OpcodeArgs); void BZHI(OpcodeArgs); void MULX(OpcodeArgs); void PDEP(OpcodeArgs); void PEXT(OpcodeArgs); void RORX(OpcodeArgs); // ADX Ops void ADXOp(OpcodeArgs); // AVX Ops void AVXVectorXOROp(OpcodeArgs); template void AVXVectorRound(OpcodeArgs); template void AVXScalar_CVT_Float_To_Float(OpcodeArgs); template void VectorScalarInsertALUOp(OpcodeArgs); template void AVXVectorScalarInsertALUOp(OpcodeArgs); template void VectorScalarUnaryInsertALUOp(OpcodeArgs); template void AVXVectorScalarUnaryInsertALUOp(OpcodeArgs); void InsertMMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs); template void InsertCVTGPR_To_FPR(OpcodeArgs); template void AVXInsertCVTGPR_To_FPR(OpcodeArgs); template void InsertScalar_CVT_Float_To_Float(OpcodeArgs); template void AVXInsertScalar_CVT_Float_To_Float(OpcodeArgs); RoundMode TranslateRoundType(uint8_t Mode); template void InsertScalarRound(OpcodeArgs); template void AVXInsertScalarRound(OpcodeArgs); template void InsertScalarFCMPOp(OpcodeArgs); template void AVXInsertScalarFCMPOp(OpcodeArgs); template void AVXCVTGPR_To_FPR(OpcodeArgs); template void AVXVFCMPOp(OpcodeArgs); template void VADDSUBPOp(OpcodeArgs); void VAESDecOp(OpcodeArgs); void VAESDecLastOp(OpcodeArgs); void VAESEncOp(OpcodeArgs); void VAESEncLastOp(OpcodeArgs); void VANDNOp(OpcodeArgs); Ref VBLENDOpImpl(IR::OpSize VecSize, IR::OpSize ElementSize, Ref Src1, Ref Src2, Ref ZeroRegister, uint64_t Selector); void VBLENDPDOp(OpcodeArgs); void VPBLENDDOp(OpcodeArgs); void VPBLENDWOp(OpcodeArgs); void VBROADCASTOp(OpcodeArgs, IR::OpSize ElementSize); template void VDPPOp(OpcodeArgs); void VEXTRACT128Op(OpcodeArgs); template void VHADDPOp(OpcodeArgs); void VHSUBPOp(OpcodeArgs, IR::OpSize ElementSize); void VINSERTOp(OpcodeArgs); void 
VINSERTPSOp(OpcodeArgs); template void VMASKMOVOp(OpcodeArgs); void VMOVHPOp(OpcodeArgs); void VMOVLPOp(OpcodeArgs); void VMOVDDUPOp(OpcodeArgs); void VMOVSHDUPOp(OpcodeArgs); void VMOVSLDUPOp(OpcodeArgs); void VMOVSDOp(OpcodeArgs); void VMOVSSOp(OpcodeArgs); void VMOVAPS_VMOVAPDOp(OpcodeArgs); void VMOVUPS_VMOVUPDOp(OpcodeArgs); void VMPSADBWOp(OpcodeArgs); void VPACKSSOp(OpcodeArgs, IR::OpSize ElementSize); void VPACKUSOp(OpcodeArgs, IR::OpSize ElementSize); void VPALIGNROp(OpcodeArgs); void VPCMPESTRIOp(OpcodeArgs); void VPCMPESTRMOp(OpcodeArgs); void VPCMPISTRIOp(OpcodeArgs); void VPCMPISTRMOp(OpcodeArgs); void VCVTPH2PSOp(OpcodeArgs); void VCVTPS2PHOp(OpcodeArgs); Ref VPERMDIndices(OpSize DstSize, Ref Indices, Ref IndexMask, Ref Repeating3210); void VPERM2Op(OpcodeArgs); void VPERMDOp(OpcodeArgs); void VPERMQOp(OpcodeArgs); void VPERMILImmOp(OpcodeArgs, IR::OpSize ElementSize); Ref VPERMILRegOpImpl(OpSize DstSize, IR::OpSize ElementSize, Ref Src, Ref Indices); template void VPERMILRegOp(OpcodeArgs); void VPHADDSWOp(OpcodeArgs); void VPHSUBOp(OpcodeArgs, IR::OpSize ElementSize); void VPHSUBSWOp(OpcodeArgs); void VPINSRBOp(OpcodeArgs); void VPINSRDQOp(OpcodeArgs); void VPINSRWOp(OpcodeArgs); void VPMADDUBSWOp(OpcodeArgs); void VPMADDWDOp(OpcodeArgs); template void VPMASKMOVOp(OpcodeArgs); void VPMULHRSWOp(OpcodeArgs); template void VPMULHWOp(OpcodeArgs); template void VPMULLOp(OpcodeArgs); void VPSADBWOp(OpcodeArgs); void VPSHUFBOp(OpcodeArgs); void VPSHUFWOp(OpcodeArgs, IR::OpSize ElementSize, bool Low); void VPSLLOp(OpcodeArgs, IR::OpSize ElementSize); void VPSLLDQOp(OpcodeArgs); void VPSLLIOp(OpcodeArgs, IR::OpSize ElementSize); void VPSLLVOp(OpcodeArgs); void VPSRAOp(OpcodeArgs, IR::OpSize ElementSize); void VPSRAIOp(OpcodeArgs, IR::OpSize ElementSize); void VPSRAVDOp(OpcodeArgs); void VPSRLVOp(OpcodeArgs); void VPSRLDOp(OpcodeArgs, IR::OpSize ElementSize); void VPSRLDQOp(OpcodeArgs); void VPUNPCKHOp(OpcodeArgs, IR::OpSize ElementSize); void 
VPUNPCKLOp(OpcodeArgs, IR::OpSize ElementSize);
  void VPSRLIOp(OpcodeArgs, IR::OpSize ElementSize);
  void VSHUFOp(OpcodeArgs, IR::OpSize ElementSize);
  template void VTESTPOp(OpcodeArgs);
  void VZEROOp(OpcodeArgs);

  // X87 Ops
  Ref ReconstructFSW_Helper(Ref T = nullptr);
  // Returns new x87 stack top from FSW.
  Ref ReconstructX87StateFromFSW_Helper(Ref FSW);
  void FLD(OpcodeArgs, IR::OpSize Width);
  void FLDFromStack(OpcodeArgs);
  void FLD_Const(OpcodeArgs, NamedVectorConstant K);
  void FBLD(OpcodeArgs);
  void FBSTP(OpcodeArgs);
  void FILD(OpcodeArgs);
  void FST(OpcodeArgs, IR::OpSize Width);
  void FSTToStack(OpcodeArgs);
  void FIST(OpcodeArgs, bool Truncate);

  // OpResult is used for Stack operations,
  // describes if the result of the operation is stored in ST(0) or ST(i),
  // where ST(i) is one of the arguments to the operation.
  // It is the `ResInST0` argument of the FADD/FDIV/FMUL/FSUB handlers.
  enum class OpResult {
    RES_ST0,
    RES_STI,
  };
  void FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0);
  void FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0);
  void FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0);
  void FNCLEX(OpcodeArgs);
  void FNINIT(OpcodeArgs);
  void FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0);
  void FTST(OpcodeArgs);
  void FXCH(OpcodeArgs);
  void X87EMMS(OpcodeArgs);
  void X87FCMOV(OpcodeArgs);
  void X87FFREE(OpcodeArgs);
  void X87FLDCW(OpcodeArgs);
  void X87FNSAVE(OpcodeArgs);
  void X87FNSTENV(OpcodeArgs);
  void X87FNSTSW(OpcodeArgs);
  void X87FRSTOR(OpcodeArgs);
  void X87FSTCW(OpcodeArgs);
  void X87FXAM(OpcodeArgs);
  void X87FXTRACT(OpcodeArgs);
  void X87FYL2X(OpcodeArgs, bool IsFYL2XP1);
  void X87LDENV(OpcodeArgs);
  void X87ModifySTP(OpcodeArgs, bool Inc);
  void X87OpHelper(OpcodeArgs, FEXCore::IR::IROps IROp, bool ZeroC2);

  // Passed to FCOMI() and FCOMIF64() as their flag-selection argument.
  // NOTE(review): presumably chooses between updating the x87 condition codes
  // and updating RFLAGS - confirm against the FCOMI implementation.
  enum class FCOMIFlags {
    FLAGS_X87,
    FLAGS_RFLAGS,
  };
  void FCOMI(OpcodeArgs, IR::OpSize Width, bool Integer, FCOMIFlags WhichFlags, bool PopTwice);

  // F64 X87 Ops
  void FADDF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult
ResInST0); void FBLDF64(OpcodeArgs); void FBSTPF64(OpcodeArgs); void FCOMIF64(OpcodeArgs, IR::OpSize width, bool Integer, FCOMIFlags whichflags, bool poptwice); void FDIVF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0); void FILDF64(OpcodeArgs); void FISTF64(OpcodeArgs, bool Truncate); void FLDF64_Const(OpcodeArgs, uint64_t Num); void FLDF64(OpcodeArgs, IR::OpSize Width); void FMULF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0); void FSUBF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0); void FTSTF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87FXTRACTF64(OpcodeArgs); void X87LDENVF64(OpcodeArgs); void FXSaveOp(OpcodeArgs); void FXRStoreOp(OpcodeArgs); Ref XSaveBase(X86Tables::DecodedOp Op); void XSaveOp(OpcodeArgs); void PAlignrOp(OpcodeArgs); template void UCOMISxOp(OpcodeArgs); void LDMXCSR(OpcodeArgs); void STMXCSR(OpcodeArgs); template void PACKUSOp(OpcodeArgs); template void PACKSSOp(OpcodeArgs); template void PMULLOp(OpcodeArgs); template void MOVQ2DQ(OpcodeArgs); template void ADDSUBPOp(OpcodeArgs); void PFNACCOp(OpcodeArgs); void PFPNACCOp(OpcodeArgs); void PSWAPDOp(OpcodeArgs); template void VPFCMPOp(OpcodeArgs); void PI2FWOp(OpcodeArgs); void PF2IWOp(OpcodeArgs); void PMULHRWOp(OpcodeArgs); void PMADDWD(OpcodeArgs); void PMADDUBSW(OpcodeArgs); template void PMULHW(OpcodeArgs); void PMULHRSW(OpcodeArgs); void MOVBEOp(OpcodeArgs); template void HSUBP(OpcodeArgs); template void PHSUB(OpcodeArgs); void PHADDS(OpcodeArgs); void PHSUBS(OpcodeArgs); void CLWBOrTPause(OpcodeArgs); void CLFLUSHOPT(OpcodeArgs); void LoadFenceOrXRSTOR(OpcodeArgs); void MemFenceOrXSAVEOPT(OpcodeArgs); void StoreFenceOrCLFlush(OpcodeArgs); void UMonitorOrCLRSSBSY(OpcodeArgs); void UMWaitOp(OpcodeArgs); void CLZeroOp(OpcodeArgs); void RDTSCPOp(OpcodeArgs); void RDPIDOp(OpcodeArgs); void Prefetch(OpcodeArgs, bool ForStore, bool Stream, uint8_t Level); void PSADBW(OpcodeArgs); void 
SHA1NEXTEOp(OpcodeArgs); void SHA1MSG1Op(OpcodeArgs); void SHA1MSG2Op(OpcodeArgs); void SHA1RNDS4Op(OpcodeArgs); void SHA256MSG1Op(OpcodeArgs); void SHA256MSG2Op(OpcodeArgs); void SHA256RNDS2Op(OpcodeArgs); void AESImcOp(OpcodeArgs); void AESEncOp(OpcodeArgs); void AESEncLastOp(OpcodeArgs); void AESDecOp(OpcodeArgs); void AESDecLastOp(OpcodeArgs); void AESKeyGenAssist(OpcodeArgs); void VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx); void VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx); struct RefVSIB { Ref Low, High; Ref BaseAddr; int32_t Displacement; uint8_t Scale; }; RefVSIB LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags); template void VPGATHER(OpcodeArgs); template void ExtendVectorElements(OpcodeArgs); template void VectorRound(OpcodeArgs); Ref VectorBlend(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t Selector); template void VectorBlend(OpcodeArgs); void VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize); void PTestOpImpl(OpSize Size, Ref Dest, Ref Src); void PTestOp(OpcodeArgs); void PHMINPOSUWOp(OpcodeArgs); template void DPPOp(OpcodeArgs); void MPSADBWOp(OpcodeArgs); void PCLMULQDQOp(OpcodeArgs); void VPCLMULQDQOp(OpcodeArgs); void CRC32(OpcodeArgs); void Extrq_imm(OpcodeArgs); void Insertq_imm(OpcodeArgs); void Extrq(OpcodeArgs); void Insertq(OpcodeArgs); void BreakOp(OpcodeArgs, FEXCore::IR::BreakDefinition BreakDefinition); void UnimplementedOp(OpcodeArgs); void PermissionRestrictedOp(OpcodeArgs); ///< Helper for PSHUD and VPERMILPS(imm) since they are the same instruction Ref Single128Bit4ByteVectorShuffle(Ref Src, uint8_t Shuffle); // AVX 128-bit operations Ref AVX128_LoadXMMRegister(uint32_t XMM, bool High); void AVX128_StoreXMMRegister(uint32_t XMM, const Ref Src, bool High); struct RefPair { Ref Low, High; }; RefPair AVX128_Zext(Ref R) { RefPair Pair; Pair.Low = R; Pair.High = 
LoadZeroVector(OpSize::i128Bit); return Pair; } Ref SHADataShuffle(Ref Src) { // SHA data shuffle matches PSHUFD shuffle where elements are inverted. // Because this shuffle mask gets reused multiple times per instruction, it's always a win to load the mask once and reuse it. const uint32_t Shuffle = 0b00'01'10'11; auto LookupIndexes = LoadAndCacheIndexedNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD, Shuffle * 16); return _VTBL1(OpSize::i128Bit, Src, LookupIndexes); } RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT); RefVSIB AVX128_LoadVSIB(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, bool NeedsHigh); void AVX128_StoreResult_WithOpSize(FEXCore::X86Tables::DecodedOp Op, const FEXCore::X86Tables::DecodedOperand& Operand, const RefPair Src, MemoryAccessType AccessType = MemoryAccessType::DEFAULT); void AVX128_VMOVScalarImpl(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VectorALU(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVX128_VectorUnary(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVX128_VectorUnaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper); void AVX128_VectorBinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, std::function Helper); void AVX128_VectorShiftWideImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp); void AVX128_VectorShiftImmImpl(OpcodeArgs, IR::OpSize ElementSize, IROps IROp); void AVX128_VectorTrinaryImpl(OpcodeArgs, IR::OpSize SrcSize, IR::OpSize ElementSize, Ref Src3, std::function Helper); enum class ShiftDirection { RIGHT, LEFT }; void AVX128_ShiftDoubleImm(OpcodeArgs, ShiftDirection Dir); void AVX128_VMOVAPS(OpcodeArgs); void AVX128_VMOVSD(OpcodeArgs); void AVX128_VMOVSS(OpcodeArgs); void AVX128_VectorXOR(OpcodeArgs); 
void AVX128_VZERO(OpcodeArgs); void AVX128_MOVVectorNT(OpcodeArgs); void AVX128_MOVQ(OpcodeArgs); void AVX128_VMOVLP(OpcodeArgs); void AVX128_VMOVHP(OpcodeArgs); void AVX128_VMOVDDUP(OpcodeArgs); void AVX128_VMOVSLDUP(OpcodeArgs); void AVX128_VMOVSHDUP(OpcodeArgs); void AVX128_VBROADCAST(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPUNPCKL(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPUNPCKH(OpcodeArgs, IR::OpSize ElementSize); void AVX128_MOVVectorUnaligned(OpcodeArgs); void AVX128_InsertCVTGPR_To_FPR(OpcodeArgs, IR::OpSize DstElementSize); void AVX128_CVTFPR_To_GPR(OpcodeArgs, IR::OpSize SrcElementSize, bool HostRoundingMode); void AVX128_VANDN(OpcodeArgs); void AVX128_VPACKSS(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPACKUS(OpcodeArgs, IR::OpSize ElementSize); Ref AVX128_PSIGNImpl(IR::OpSize ElementSize, Ref Src1, Ref Src2); void AVX128_VPSIGN(OpcodeArgs, IR::OpSize ElementSize); void AVX128_UCOMISx(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VectorScalarInsertALU(OpcodeArgs, FEXCore::IR::IROps IROp, IR::OpSize ElementSize); void AVX128_VFCMP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_InsertScalarFCMP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_MOVBetweenGPR_FPR(OpcodeArgs); void AVX128_PExtr(OpcodeArgs, IR::OpSize ElementSize); void AVX128_ExtendVectorElements(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed); void AVX128_MOVMSK(OpcodeArgs, IR::OpSize ElementSize); void AVX128_MOVMSKB(OpcodeArgs); void AVX128_PINSRImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm); void AVX128_VPINSRB(OpcodeArgs); void AVX128_VPINSRW(OpcodeArgs); void AVX128_VPINSRDQ(OpcodeArgs); void AVX128_VariableShiftImpl(OpcodeArgs, IROps IROp); void AVX128_VINSERT(OpcodeArgs); void AVX128_VINSERTPS(OpcodeArgs); void AVX128_VPHSUB(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPHSUBSW(OpcodeArgs); void 
AVX128_VADDSUBP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPMULL(OpcodeArgs, IR::OpSize ElementSize, bool Signed); void AVX128_VPMULHRSW(OpcodeArgs); void AVX128_VPMULHW(OpcodeArgs, bool Signed); void AVX128_InsertScalar_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize); void AVX128_Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize); void AVX128_Vector_CVT_Float_To_Int(OpcodeArgs, IR::OpSize SrcElementSize, bool HostRoundingMode); void AVX128_Vector_CVT_Int_To_Float(OpcodeArgs, IR::OpSize SrcElementSize, bool Widen); void AVX128_VEXTRACT128(OpcodeArgs); void AVX128_VAESImc(OpcodeArgs); void AVX128_VAESEnc(OpcodeArgs); void AVX128_VAESEncLast(OpcodeArgs); void AVX128_VAESDec(OpcodeArgs); void AVX128_VAESDecLast(OpcodeArgs); void AVX128_VAESKeyGenAssist(OpcodeArgs); void AVX128_VPCMPESTRI(OpcodeArgs); void AVX128_VPCMPESTRM(OpcodeArgs); void AVX128_VPCMPISTRI(OpcodeArgs); void AVX128_VPCMPISTRM(OpcodeArgs); void AVX128_PHMINPOSUW(OpcodeArgs); void AVX128_VectorRound(OpcodeArgs, IR::OpSize ElementSize); void AVX128_InsertScalarRound(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VDPP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPERMQ(OpcodeArgs); void AVX128_VPSHUFW(OpcodeArgs, bool Low); void AVX128_VSHUF(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPERMILImm(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VHADDP(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVX128_VPHADDSW(OpcodeArgs); void AVX128_VPMADDUBSW(OpcodeArgs); void AVX128_VPMADDWD(OpcodeArgs); void AVX128_VBLEND(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VHSUBP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPSHUFB(OpcodeArgs); void AVX128_VPSADBW(OpcodeArgs); void AVX128_VMPSADBW(OpcodeArgs); void AVX128_VPALIGNR(OpcodeArgs); void AVX128_VMASKMOVImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& 
DataOp); void AVX128_VPMASKMOV(OpcodeArgs, bool IsStore); void AVX128_VMASKMOV(OpcodeArgs, IR::OpSize ElementSize, bool IsStore); void AVX128_MASKMOV(OpcodeArgs); void AVX128_VectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize); void AVX128_SaveAVXState(Ref MemBase); void AVX128_RestoreAVXState(Ref MemBase); void AVX128_DefaultAVXState(); void AVX128_VPERM2(OpcodeArgs); void AVX128_VTESTP(OpcodeArgs, IR::OpSize ElementSize); void AVX128_PTest(OpcodeArgs); void AVX128_VPERMILReg(OpcodeArgs, IR::OpSize ElementSize); void AVX128_VPERMD(OpcodeArgs); void AVX128_VPCLMULQDQ(OpcodeArgs); void AVX128_VFMAImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx); void AVX128_VFMAScalarImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx); void AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx); RefPair AVX128_VPGatherQPSImpl(OpcodeArgs, Ref Dest, Ref Mask, RefVSIB VSIB); RefPair AVX128_VPGatherImpl(OpcodeArgs, OpSize Size, OpSize ElementLoadSize, OpSize AddrElementSize, RefPair Dest, RefPair Mask, RefVSIB VSIB); void AVX128_VPGATHER(OpcodeArgs, OpSize AddrElementSize); void AVX128_VCVTPH2PS(OpcodeArgs); void AVX128_VCVTPS2PH(OpcodeArgs); // End of AVX 128-bit implementation // AVX 256-bit operations void StoreResult_WithAVXInsert(VectorOpType Type, RegClass Class, FEXCore::X86Tables::DecodedOp Op, Ref Value, IR::OpSize Align = IR::OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { if (Op->Dest.IsGPR() && Op->Dest.Data.GPR.GPR >= X86State::REG_XMM_0 && Op->Dest.Data.GPR.GPR <= X86State::REG_XMM_15 && GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) { const auto gpr = Op->Dest.Data.GPR.GPR; const auto gprIndex = gpr - X86State::REG_XMM_0; auto DestVector = LoadXMMRegister(gprIndex); Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value); StoreXMMRegister(gprIndex, Value); return; } 
StoreResult(Class, Op, Value, Align, AccessType); } void StoreXMMRegister_WithAVXInsert(VectorOpType Type, uint32_t XMM, Ref Value) { if (GetGuestVectorLength() == OpSize::i256Bit && Type == VectorOpType::SSE) { ///< SSE vector stores need to insert in the low 128-bit lane of the 256-bit register. auto DestVector = LoadXMMRegister(XMM); Value = _VInsElement(GetGuestVectorLength(), OpSize::i128Bit, 0, 0, DestVector, Value); StoreXMMRegister(XMM, Value); return; } StoreXMMRegister(XMM, Value); } void AVXVectorALUOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVXVectorUnaryOp(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); void AVXVectorVariableBlend(OpcodeArgs, IR::OpSize ElementSize); // End of AVX 256-bit implementation void InvalidOp(OpcodeArgs); void NoExecOp(OpcodeArgs); void SetPackedRFLAG(bool Lower8, Ref Src); Ref GetPackedRFLAG(uint32_t FlagsMask = ~0U); void SetMultiblock(bool _Multiblock) { Multiblock = _Multiblock; } static inline constexpr unsigned IndexNZCV(unsigned BitOffset) { switch (BitOffset) { case FEXCore::X86State::RFLAG_OF_RAW_LOC: return 28; case FEXCore::X86State::RFLAG_CF_RAW_LOC: return 29; case FEXCore::X86State::RFLAG_ZF_RAW_LOC: return 30; case FEXCore::X86State::RFLAG_SF_RAW_LOC: return 31; default: FEX_UNREACHABLE; } } void StoreContextHelper(IR::OpSize Size, RegClass Class, Ref Value, uint32_t Offset) { // For i128Bit, we won't see a normal Constant to inline, but as a special // case we can replace with a 2x64-bit store which can use inline zeroes. 
if (Size == OpSize::i128Bit) { auto Header = GetOpHeader(WrapNode(Value)); const auto MAX_STP_OFFSET = (252 * 4); if (Offset <= MAX_STP_OFFSET && Header->Op == OP_LOADNAMEDVECTORCONSTANT) { auto Const = Header->C(); if (Const->Constant == IR::NamedVectorConstant::NAMED_VECTOR_ZERO) { Ref Zero = _Constant(0); Ref STP = _StoreContextPair(IR::OpSize::i64Bit, RegClass::GPR, Zero, Zero, Offset); // XXX: This works around InlineConstant not having an associated // register class, else we'd just do InlineConstant above. Ref InlineZero = _InlineConstant(0); ReplaceNodeArgument(STP, 0, InlineZero); ReplaceNodeArgument(STP, 1, InlineZero); return; } } } _StoreContext(Size, Class, Value, Offset); } void FlushRegisterCache(bool SRAOnly = false, bool MMXOnly = false) { // At block boundaries, fix up the carry flag. if (!SRAOnly) { RectifyCarryInvert(CFInvertedABI); } if (!MMXOnly) { CalculateDeferredFlags(); } const auto GPRSize = GetGPROpSize(); const auto VectorSize = GetGuestVectorLength(); // Write backwards. This is a heuristic to improve coalescing, since we // often copy from (low) fixed GPRs to (high) PF/AF for celebrity // instructions like "add rax, 1". This hack will go away with clauses. uint64_t Bits = RegCache.Written; // We have an SRA only mode that exists as a hack to make register caching // less aggressive. We should get rid of this once RA can take it. 
uint64_t Mask = ~0ULL; if (SRAOnly) { const uint64_t GPRMask = ((1ull << (AFIndex - GPR0Index + 1)) - 1) << GPR0Index; const uint64_t FPRMask = ((1ull << (FPR15Index - FPR0Index + 1)) - 1) << FPR0Index; Mask &= (GPRMask | FPRMask); Bits &= Mask; } if (MMXOnly) { Mask &= ((1ull << (MM7Index - MM0Index + 1)) - 1) << MM0Index; Bits &= Mask; } while (Bits != 0) { uint32_t Index = 63 - std::countl_zero(Bits); Ref Value = RegCache.Value[Index]; if (Index >= GPR0Index && Index <= GPR15Index) { Ref R = _StoreRegister(Value, GPRSize); R->Reg = PhysicalRegister(RegClass::GPRFixed, Index - GPR0Index).Raw; } else if (Index == PFIndex) { _StorePF(Value, GPRSize); } else if (Index == AFIndex) { _StoreAF(Value, GPRSize); } else if (Index >= FPR0Index && Index <= FPR15Index) { Ref R = _StoreRegister(Value, VectorSize); R->Reg = PhysicalRegister(RegClass::FPRFixed, Index - FPR0Index).Raw; } else if (Index == DFIndex) { _StoreContextGPR(OpSize::i8Bit, Value, offsetof(Core::CPUState, flags[X86State::RFLAG_DF_RAW_LOC])); } else { bool Partial = RegCache.Partial & (1ull << Index); auto Size = Partial ? OpSize::i64Bit : CacheIndexToOpSize(Index); uint64_t NextBit = (1ull << (Index - 1)); uint32_t Offset = CacheIndexToContextOffset(Index); auto Class = CacheIndexClass(Index); LOGMAN_THROW_A_FMT(Offset != ~0U, "Invalid offset"); // Use stp where possible to store multiple values at a time. This accelerates AVX. // TODO: this is all really confusing because of backwards iteration, // can we peel back that hack? 
const auto SizeInt = IR::OpSizeToSize(Size); if ((Bits & NextBit) && !Partial && Size >= OpSize::i32Bit && CacheIndexToContextOffset(Index - 1) == Offset - SizeInt && (Offset - SizeInt) / SizeInt < 64) { LOGMAN_THROW_A_FMT(CacheIndexClass(Index - 1) == Class, "construction"); LOGMAN_THROW_A_FMT((Offset % SizeInt) == 0, "construction"); Ref ValueNext = RegCache.Value[Index - 1]; _StoreContextPair(Size, Class, ValueNext, Value, Offset - SizeInt); Bits &= ~NextBit; } else { StoreContextHelper(Size, Class, Value, Offset); // If Partial and MMX register, then we need to store all 1s in bits 64-80 if (Partial && Index >= MM0Index && Index <= MM7Index) { _StoreContextGPR(OpSize::i16Bit, Constant(0xFFFF), Offset + 8); } } } Bits &= ~(1ull << Index); } RegCache.Written &= ~Mask; RegCache.Cached &= ~Mask; RegCache.Partial &= ~Mask; } IR::OpSize GetGPROpSize() const { return Is64BitMode ? IR::OpSize::i64Bit : IR::OpSize::i32Bit; } protected: void RecordX87Use() override { CurrentHeader->HasX87 = true; } void SaveNZCV(IROps Op = OP_DUMMY) override { /* Some opcodes are conservatively marked as clobbering flags, but in fact * do not clobber flags in certain conditions. Check for that here as an * optimization. */ switch (Op) { case OP_VFMINSCALARINSERT: case OP_VFMAXSCALARINSERT: /* On AFP platforms, becomes fmin/fmax and preserves NZCV. Otherwise * becomes fcmp and clobbers. */ if (CTX->HostFeatures.SupportsAFP) { return; } break; case OP_VLOADVECTORMASKED: case OP_VLOADVECTORGATHERMASKED: case OP_VLOADVECTORGATHERMASKEDQPS: case OP_VSTOREVECTORMASKED: /* On ASIMD platforms, the emulation happens to preserve NZCV, unlike the * more optimal SVE implementation that clobbers. */ if (!CTX->HostFeatures.SupportsSVE128 && !CTX->HostFeatures.SupportsSVE256) { return; } break; default: break; } // Invariant: When executing instructions that clobber NZCV, the flags must // be resident in a GPR, which is equivalent to CachedNZCV != nullptr. 
Get // the NZCV which fills the cache if necessary. if (CachedNZCV == nullptr) { GetNZCV(); } // Assume we'll need a reload. NZCVDirty = true; } private: FEX_CONFIG_OPT(ReducedPrecisionMode, X87REDUCEDPRECISION); struct JumpTargetInfo { Ref BlockEntry; bool HaveEmitted; bool IsEntryPoint; }; FEXCore::Context::ContextImpl* CTX {}; constexpr static unsigned FullNZCVMask = (1U << FEXCore::X86State::RFLAG_CF_RAW_LOC) | (1U << FEXCore::X86State::RFLAG_ZF_RAW_LOC) | (1U << FEXCore::X86State::RFLAG_SF_RAW_LOC) | (1U << FEXCore::X86State::RFLAG_OF_RAW_LOC); static bool ContainsNZCV(unsigned BitMask) { return (BitMask & FullNZCVMask) != 0; } static bool IsNZCV(unsigned BitOffset) { return BitOffset < 32 && ContainsNZCV(1U << BitOffset); } Ref CachedNZCV {}; bool NZCVDirty {}; // Set if the host carry is inverted from the guest carry. This is set after // subtraction, because arm64 and x86 have inverted borrow flags, but clear // after addition. // // All CF access needs to maintain this flag. cfinv may be inserted at the end // of a block to rectify to the FEX convention (current convention: NOT // INVERTED). bool CFInverted {}; // FEX convention for CF at the end of blocks: INVERTED. const bool CFInvertedABI {true}; fextl::map JumpTargets; bool HandledLock {false}; bool DecodeFailure {false}; bool NeedsBlockEnd {false}; ForceTSOMode ForceTSO {ForceTSOMode::NoOverride}; // Used during new op bringup bool ShouldDump {false}; using SaveStoreAVXStatePtr = void (OpDispatchBuilder::*)(Ref MemBase); using DefaultAVXStatePtr = void (OpDispatchBuilder::*)(); SaveStoreAVXStatePtr SaveAVXStateFunc {&OpDispatchBuilder::SaveAVXState}; SaveStoreAVXStatePtr RestoreAVXStateFunc {&OpDispatchBuilder::RestoreAVXState}; DefaultAVXStatePtr DefaultAVXStateFunc {&OpDispatchBuilder::DefaultAVXState}; // Opcode helpers for generalizing behavior across VEX and non-VEX variants. 
Ref ADDSUBPOpImpl(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2); void AVXVariableShiftImpl(OpcodeArgs, IROps IROp); Ref AESKeyGenAssistImpl(OpcodeArgs); Ref CVTGPR_To_FPRImpl(OpcodeArgs, IR::OpSize DstElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op); Ref DPPOpImpl(IR::OpSize DstSize, Ref Src1, Ref Src2, uint8_t Mask, IR::OpSize ElementSize); Ref VDPPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm); Ref ExtendVectorElementsImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DstElementSize, bool Signed); Ref HSUBPOpImpl(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2); Ref InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm); Ref MPSADBWOpImpl(IR::OpSize SrcSize, Ref Src1, Ref Src2, uint8_t Select); Ref PALIGNROpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1, const X86Tables::DecodedOperand& Src2, const X86Tables::DecodedOperand& Imm, bool IsAVX); void PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask); Ref PHADDSOpImpl(OpSize Size, Ref Src1, Ref Src2); Ref PHMINPOSUWOpImpl(OpcodeArgs); Ref PHSUBOpImpl(OpSize Size, Ref Src1, Ref Src2, IR::OpSize ElementSize); Ref PHSUBSOpImpl(OpSize Size, Ref Src1, Ref Src2); Ref PINSROpImpl(OpcodeArgs, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, const X86Tables::DecodedOperand& Imm); Ref PMADDWDOpImpl(IR::OpSize Size, Ref Src1, Ref Src2); Ref PMADDUBSWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2); Ref PMULHRSWOpImpl(OpSize Size, Ref Src1, Ref Src2); Ref PMULHWOpImpl(OpcodeArgs, bool Signed, Ref Src1, Ref Src2); Ref PMULLOpImpl(OpSize Size, IR::OpSize ElementSize, bool Signed, Ref Src1, Ref Src2); Ref PSADBWOpImpl(IR::OpSize Size, Ref Src1, Ref Src2); Ref GeneratePSHUFBMask(IR::OpSize SrcSize); Ref PSHUFBOpImpl(IR::OpSize 
SrcSize, Ref Src1, Ref Src2, Ref MaskVector); Ref PSIGNImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src1, Ref Src2); Ref PSLLIImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, uint64_t Shift); Ref PSLLImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec); Ref PSRAOpImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec); Ref PSRLDOpImpl(OpcodeArgs, IR::OpSize ElementSize, Ref Src, Ref ShiftVec); Ref SHUFOpImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t Shuffle); void VMASKMOVOpImpl(OpcodeArgs, IR::OpSize ElementSize, IR::OpSize DataSize, bool IsStore, const X86Tables::DecodedOperand& MaskOp, const X86Tables::DecodedOperand& DataOp); void MOVScalarOpImpl(OpcodeArgs, IR::OpSize ElementSize); void VMOVScalarOpImpl(OpcodeArgs, IR::OpSize ElementSize); Ref VFCMPOpImpl(OpSize Size, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t CompType); void VTESTOpImpl(OpSize SrcSize, IR::OpSize ElementSize, Ref Src1, Ref Src2); void VectorUnaryDuplicateOpImpl(OpcodeArgs, IROps IROp, IR::OpSize ElementSize); // x86 ALU scalar operations operate in three different ways // - AVX512: Writemask shenanigans that we don't care about. // - AVX/VEX: Two source // - Example 32bit VADDSS Dest, Src1, Src2 // - Dest[31:0] = Src1[31:0] + Src2[31:0] // - Dest[127:32] = Src1[127:32] // - SSE: Scalar operation inserts in to the low bits, upper bits completely unaffected. 
// - Example 32bit ADDSS Dest, Src // - Dest[31:0] = Dest[31:0] + Src[31:0] // - Dest[{256,128}:32] = (Unmodified) Ref VectorScalarInsertALUOpImpl(OpcodeArgs, IROps IROp, IR::OpSize DstSize, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits); Ref VectorScalarUnaryInsertALUOpImpl(OpcodeArgs, IROps IROp, IR::OpSize DstSize, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits); Ref InsertCVTGPR_To_FPRImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize DstElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits); Ref InsertScalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, bool ZeroUpperBits); Ref InsertScalarRoundImpl(OpcodeArgs, IR::OpSize DstSize, IR::OpSize ElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op, uint64_t Mode, bool ZeroUpperBits); Ref InsertScalarFCMPOpImpl(OpSize Size, IR::OpSize OpDstSize, IR::OpSize ElementSize, Ref Src1, Ref Src2, uint8_t CompType, bool ZeroUpperBits); Ref VectorRoundImpl(OpSize Size, IR::OpSize ElementSize, Ref Src, uint64_t Mode); Ref Scalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op); Ref CVTFPR_To_GPRImpl(OpcodeArgs, Ref Src, IR::OpSize SrcElementSize, bool HostRoundingMode); Ref Vector_CVT_Float_To_Int32Impl(OpcodeArgs, IR::OpSize DstSize, Ref Src, IR::OpSize SrcSize, IR::OpSize SrcElementSize, bool HostRoundingMode, bool ZeroUpperHalf); Ref Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcElementSize, bool Widen); void XSaveOpImpl(OpcodeArgs); void SaveX87State(OpcodeArgs, Ref MemBase); void SaveSSEState(Ref 
MemBase); void SaveMXCSRState(Ref MemBase); void SaveAVXState(Ref MemBase); void XRstorOpImpl(OpcodeArgs); void RestoreX87State(Ref MemBase); void RestoreSSEState(Ref MemBase); void RestoreMXCSRState(Ref MXCSR); void RestoreAVXState(Ref MemBase); void DefaultX87State(OpcodeArgs); void DefaultSSEState(); void DefaultAVXState(); Ref GetMXCSR(); #undef OpcodeArgs Ref AppendSegmentOffset(Ref Value, uint32_t Flags, uint32_t DefaultPrefix = 0, bool Override = false); Ref GetSegment(uint32_t Flags, uint32_t DefaultPrefix = FEXCore::X86Tables::DecodeFlags::FLAG_NO_PREFIX, bool Override = false); void UpdatePrefixFromSegment(Ref Segment, uint32_t SegmentReg); Ref LoadGPRRegister(uint32_t GPR, IR::OpSize Size = OpSize::iInvalid, uint8_t Offset = 0, bool AllowUpperGarbage = false); void StoreGPRRegister(uint32_t GPR, const Ref Src, IR::OpSize Size = OpSize::iInvalid, uint8_t Offset = 0); void StoreXMMRegister(uint32_t XMM, const Ref Src); Ref _GetRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset, bool Inline) { const auto GPRSize = GetGPROpSize(); const auto Offs = Op->PC + Op->InstSize + Offset - Entry; return Inline ? _InlineEntrypointOffset(GPRSize, Offs) : _EntrypointOffset(GPRSize, Offs); } Ref GetRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset = 0) { return _GetRelocatedPC(Op, Offset, false); } void ExitRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset = 0) { ExitFunction(_GetRelocatedPC(Op, Offset, true /* Inline */)); } void ExitRelocatedPC(const FEXCore::X86Tables::DecodedOp& Op, int64_t Offset, BranchHint Hint, Ref CallReturnAddress, Ref CallReturnBlock) { ExitFunction(_GetRelocatedPC(Op, Offset, true /* Inline */), Hint, CallReturnAddress, CallReturnBlock); } [[nodiscard]] static bool IsOperandMem(const X86Tables::DecodedOperand& Operand, bool Load) { // Literals are immediates as sources but memory addresses as destinations. 
return !(Load && (Operand.IsLiteral() || Operand.IsLiteralRelocation())) && !Operand.IsGPR(); } [[nodiscard]] static bool IsNonTSOReg(MemoryAccessType Access, uint8_t Reg) { return Access == MemoryAccessType::DEFAULT && Reg == X86State::REG_RSP; } AddressMode DecodeAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, MemoryAccessType AccessType, bool IsLoad); Ref LoadSource(RegClass Class, const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, const LoadSourceOptions& Options = {}); Ref LoadSourceGPR(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, const LoadSourceOptions& Options = {}) { return LoadSource(RegClass::GPR, Op, Operand, Flags, Options); } Ref LoadSourceFPR(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags, const LoadSourceOptions& Options = {}) { return LoadSource(RegClass::FPR, Op, Operand, Flags, Options); } Ref LoadSource_WithOpSize(RegClass Class, const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, IR::OpSize OpSize, uint32_t Flags, const LoadSourceOptions& Options = {}); Ref LoadSourceGPR_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, IR::OpSize OpSize, uint32_t Flags, const LoadSourceOptions& Options = {}) { return LoadSource_WithOpSize(RegClass::GPR, Op, Operand, OpSize, Flags, Options); } Ref LoadSourceFPR_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, IR::OpSize OpSize, uint32_t Flags, const LoadSourceOptions& Options = {}) { return LoadSource_WithOpSize(RegClass::FPR, Op, Operand, OpSize, Flags, Options); } void StoreResult_WithOpSize(RegClass Class, X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, IR::OpSize OpSize, IR::OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT); void StoreResultGPR_WithOpSize(X86Tables::DecodedOp Op, const 
X86Tables::DecodedOperand& Operand, Ref Src, IR::OpSize OpSize, IR::OpSize Align = IR::OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult_WithOpSize(RegClass::GPR, Op, Operand, Src, OpSize, Align, AccessType); } void StoreResultFPR_WithOpSize(X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, IR::OpSize OpSize, IR::OpSize Align = IR::OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult_WithOpSize(RegClass::FPR, Op, Operand, Src, OpSize, Align, AccessType); } void StoreResult(RegClass Class, X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT); void StoreResultGPR(X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, OpSize Align = OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult(RegClass::GPR, Op, Operand, Src, Align, AccessType); } void StoreResultFPR(X86Tables::DecodedOp Op, const X86Tables::DecodedOperand& Operand, Ref Src, OpSize Align = OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult(RegClass::FPR, Op, Operand, Src, Align, AccessType); } void StoreResult(RegClass Class, X86Tables::DecodedOp Op, Ref Src, OpSize Align, MemoryAccessType AccessType = MemoryAccessType::DEFAULT); void StoreResultGPR(X86Tables::DecodedOp Op, Ref Src, OpSize Align = OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult(RegClass::GPR, Op, Src, Align, AccessType); } void StoreResultFPR(X86Tables::DecodedOp Op, Ref Src, OpSize Align = OpSize::iInvalid, MemoryAccessType AccessType = MemoryAccessType::DEFAULT) { StoreResult(RegClass::FPR, Op, Src, Align, AccessType); } // In several instances, it's desirable to get a base address with the segment offset // applied to it. This pulls all the common-case appending into a single set of functions. 
[[nodiscard]] Ref MakeSegmentAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, IR::OpSize OpSize) { Ref Mem = LoadSourceGPR_WithOpSize(Op, Operand, OpSize, Op->Flags, {.LoadData = false}); return AppendSegmentOffset(Mem, Op->Flags); } [[nodiscard]] Ref MakeSegmentAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand) { return MakeSegmentAddress(Op, Operand, OpSizeFromSrc(Op)); } [[nodiscard]] Ref MakeSegmentAddress(X86State::X86Reg Reg, uint32_t Flags, uint32_t DefaultPrefix = 0, bool Override = false) { Ref Address = LoadGPRRegister(Reg); return AppendSegmentOffset(Address, Flags, DefaultPrefix, Override); } constexpr OpSize GetGuestVectorLength() const { return (CTX->HostFeatures.SupportsSVE256 && CTX->HostFeatures.SupportsAVX) ? OpSize::i256Bit : OpSize::i128Bit; } [[nodiscard]] static uint32_t GPROffset(X86State::X86Reg reg) { LOGMAN_THROW_A_FMT(reg <= X86State::X86Reg::REG_R15, "Invalid reg used"); return static_cast(ARRAY_OFFSETOF(Core::CPUState, gregs, reg)); } [[nodiscard]] static uint32_t MMBaseOffset() { return static_cast(offsetof(Core::CPUState, mm[0][0])); } [[nodiscard]] uint8_t GetDstSize(X86Tables::DecodedOp Op) const; [[nodiscard]] uint8_t GetSrcSize(X86Tables::DecodedOp Op) const; [[nodiscard]] uint32_t GetDstBitSize(X86Tables::DecodedOp Op) const; [[nodiscard]] uint32_t GetSrcBitSize(X86Tables::DecodedOp Op) const; [[nodiscard]] IR::OpSize OpSizeFromDst(X86Tables::DecodedOp Op) const { return IR::SizeToOpSize(GetDstSize(Op)); } [[nodiscard]] IR::OpSize OpSizeFromSrc(X86Tables::DecodedOp Op) const { return IR::SizeToOpSize(GetSrcSize(Op)); } [[nodiscard]] IR::OpSize GetStringOpSize(X86Tables::DecodedOp Op) const; // Set flag tracking to prepare for an operation that directly writes NZCV. void HandleNZCVWrite() { CachedNZCV = nullptr; NZCVDirty = false; } // Set flag tracking to prepare for a read-modify-write operation on NZCV. 
void HandleNZCV_RMW() { CalculateDeferredFlags(); if (NZCVDirty && CachedNZCV) { _StoreNZCV(CachedNZCV); } HandleNZCVWrite(); } // Special case of the above where we are known to zero C/V void HandleNZ00Write() { HandleNZCVWrite(); // Host carry will be implicitly zeroed, and we want guest carry zeroed as // well. So do not invert. CFInverted = false; } Ref GetNZCV() { if (!CachedNZCV) { CachedNZCV = _LoadNZCV(); } return CachedNZCV; } void SetNZCV(Ref Value) { CachedNZCV = Value; NZCVDirty = true; } void ZeroNZCV() { CachedNZCV = Constant(0); NZCVDirty = true; } void SetNZ_ZeroCV(IR::OpSize SrcSize, Ref Res, bool SetPF = false) { HandleNZ00Write(); // x - 0 = x. NZ set according to Res. C always set. V always unset. This // matches what we want since we want carry inverted. // // This is currently worse for 8/16-bit, but that should be optimized. TODO if (SrcSize >= OpSize::i32Bit) { if (SetPF) { CalculatePF(SubWithFlags(SrcSize, Res, (uint64_t)0)); } else { _SubNZCV(SrcSize, Res, Constant(0)); } CFInverted = true; } else { _TestNZ(SrcSize, Res, Res); CFInverted = false; if (SetPF) { CalculatePF(Res); } } } void SetNZP_ZeroCV(IR::OpSize SrcSize, Ref Res) { SetNZ_ZeroCV(SrcSize, Res, true); } void InsertNZCV(unsigned BitOffset, Ref Value, signed FlagOffset, bool MustMask) { signed Bit = IndexNZCV(BitOffset); // Heuristic to choose rmif vs msr. bool PreferRmif = !NZCVDirty || FlagOffset || MustMask; if (CTX->HostFeatures.SupportsFlagM && PreferRmif) { // Update NZCV if (NZCVDirty && CachedNZCV) { _StoreNZCV(CachedNZCV); } CachedNZCV = nullptr; NZCVDirty = false; // Insert as NZCV. signed RmifBit = Bit - 28; _RmifNZCV(Value, (64 + FlagOffset - RmifBit) % 64, 1u << RmifBit); CachedNZCV = nullptr; } else { // Insert as GPR if (FlagOffset || MustMask) { Value = _Bfe(OpSize::i64Bit, 1, FlagOffset, Value); } SetNZCV(_Bfi(OpSize::i32Bit, 1, Bit, GetNZCV(), Value)); } } // If we don't care about N/C/V and just need Z, we can test with a simple // mask without any shifting. 
void SetZ_InvalidateNCV(IR::OpSize Size, Ref Src) { HandleNZCVWrite(); CFInverted = true; if (Size < OpSize::i32Bit) { _TestNZ(OpSize::i32Bit, Src, _InlineConstant((1u << (IR::OpSizeAsBits(Size))) - 1)); } else { _TestNZ(Size, Src, Src); } } // Ensure the carry invert flag matches the desired form. Used before an // operation reading carry or at the end of a block. void RectifyCarryInvert(bool RequiredInvert) { if (CFInverted != RequiredInvert) { if (CTX->HostFeatures.SupportsFlagM && !NZCVDirty) { // Invert as NZCV. _CarryInvert(); CachedNZCV = nullptr; } else { // Invert as a GPR unsigned Bit = IndexNZCV(FEXCore::X86State::RFLAG_CF_RAW_LOC); SetNZCV(_Xor(OpSize::i32Bit, GetNZCV(), Constant(1u << Bit))); CalculateDeferredFlags(); } CFInverted ^= true; } LOGMAN_THROW_A_FMT(CFInverted == RequiredInvert, "post condition"); } void CarryInvert() { CFInverted ^= true; } template void SetRFLAG(Ref Value, unsigned ValueOffset = 0, bool MustMask = false) { SetRFLAG(Value, BitOffset, ValueOffset, MustMask); } void SetCFDirect(Ref Value, unsigned ValueOffset = 0, bool MustMask = false) { Value = _Xor(OpSize::i64Bit, Value, _InlineConstant(1ull << ValueOffset)); SetRFLAG(Value, X86State::RFLAG_CF_RAW_LOC, ValueOffset, MustMask); CFInverted = true; } // Set CF directly to the given 0/1 value. This needs to respect the // invert. We use a subtraction: // // 0 - x = 0 + (~x) + 1. // // If x = 0, then 0 + (~0) + 1 = 0x100000000 so hardware C is set. // If x = 1, then 0 + (~1) + 1 = 0x0ffffffff so hardware C is not set. 
void SetCFDirect_InvalidateNZV(Ref Value, unsigned ValueOffset = 0, bool MustMask = false) { if (ValueOffset || MustMask) { Value = _Bfe(OpSize::i64Bit, 1, ValueOffset, Value); } HandleNZCVWrite(); _SubNZCV(OpSize::i32Bit, Constant(0), Value); CFInverted = true; } void SetCFInverted(Ref Value, unsigned ValueOffset = 0, bool MustMask = false) { SetRFLAG(Value, X86State::RFLAG_CF_RAW_LOC, ValueOffset, MustMask); CFInverted = true; } void SetRFLAG(Ref Value, unsigned BitOffset, unsigned ValueOffset = 0, bool MustMask = false) { if (IsNZCV(BitOffset)) { InsertNZCV(BitOffset, Value, ValueOffset, MustMask); return; } if (ValueOffset || MustMask) { Value = _Bfe(OpSize::i32Bit, 1, ValueOffset, Value); } if (BitOffset == FEXCore::X86State::RFLAG_PF_RAW_LOC) { StoreRegister(Core::CPUState::PF_AS_GREG, false, Value); } else if (BitOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) { StoreRegister(Core::CPUState::AF_AS_GREG, false, Value); } else if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) { // For DF, we need to transform 0/1 into 1/-1 StoreDF(_SubShift(OpSize::i64Bit, Constant(1), Value, ShiftType::LSL, 1)); } else if (BitOffset == FEXCore::X86State::RFLAG_TF_RAW_LOC) { auto PackedTF = _LoadContextGPR(OpSize::i8Bit, ARRAY_OFFSETOF(FEXCore::Core::CPUState, flags, BitOffset)); // An exception should still be raised after an instruction that unsets TF, leave the unblocked bit set but unset // the TF bit to cause such behaviour. The handling code at the start of the next block will then unset the // unblocked bit before raising the exception. 
auto NewPackedTF = _Select(OpSize::i64Bit, OpSize::i64Bit, CondClass::EQ, Value, Constant(0), _And(OpSize::i32Bit, PackedTF, Constant(~1)), Constant(1)); _StoreContextGPR(OpSize::i8Bit, NewPackedTF, ARRAY_OFFSETOF(FEXCore::Core::CPUState, flags, BitOffset)); } else { _StoreContextGPR(OpSize::i8Bit, Value, ARRAY_OFFSETOF(FEXCore::Core::CPUState, flags, BitOffset)); } } void SetAF(unsigned K) { // AF is stored in bit 4 of the AF flag byte, with garbage in the other // bits. This allows us to defer the extract in the usual case. When it is // read, bit 4 is extracted. In order to write a constant value of AF, that // means we need to left-shift here to compensate. SetRFLAG(Constant(K << 4)); } void ZeroPF_AF(); void InvalidateAF() { _InvalidateFlags((1u << X86State::RFLAG_AF_RAW_LOC)); InvalidateReg(Core::CPUState::AF_AS_GREG); } void InvalidatePF_AF() { _InvalidateFlags((1u << X86State::RFLAG_PF_RAW_LOC) | (1u << X86State::RFLAG_AF_RAW_LOC)); InvalidateReg(Core::CPUState::PF_AS_GREG); InvalidateReg(Core::CPUState::AF_AS_GREG); } [[nodiscard]] static CondClass CondForNZCVBit(unsigned BitOffset, bool Invert) { switch (BitOffset) { case X86State::RFLAG_SF_RAW_LOC: return Invert ? CondClass::PL : CondClass::MI; case X86State::RFLAG_ZF_RAW_LOC: return Invert ? CondClass::NEQ : CondClass::EQ; case X86State::RFLAG_CF_RAW_LOC: return Invert ? CondClass::ULT : CondClass::UGE; case X86State::RFLAG_OF_RAW_LOC: return Invert ? CondClass::FNU : CondClass::FU; default: FEX_UNREACHABLE; } } /* Layout of cache indices. 
We use a single 64-bit bitmask for the cache */ static const int GPR0Index = 0; static const int GPR15Index = 15; static const int PFIndex = 16; static const int AFIndex = 17; /* Gap 18..19 */ /* Note this range is only valid if MMXState = MMXState_MMX */ static const int MM0Index = 20; static const int MM7Index = 27; /* Gap 28..30 */ static const int DFIndex = 31; static const int FPR0Index = 32; static const int FPR15Index = 47; static const int AVXHigh0Index = 48; static const int AVXHigh15Index = 63; [[nodiscard]] static uint32_t CacheIndexToContextOffset(int Index) { switch (Index) { case MM0Index ... MM7Index: return ARRAY_OFFSETOF(FEXCore::Core::CPUState, mm, Index - MM0Index); case AVXHigh0Index ... AVXHigh15Index: return ARRAY_OFFSETOF(FEXCore::Core::CPUState, avx_high, Index - AVXHigh0Index); default: return ~0U; } } [[nodiscard]] static RegClass CacheIndexClass(int Index) { if ((Index >= MM0Index && Index <= MM7Index) || Index >= FPR0Index) { return RegClass::FPR; } else { return RegClass::GPR; } } [[nodiscard]] static IR::OpSize CacheIndexToOpSize(int Index) { // MMX registers are rounded up to 128-bit since they are shared with 80-bit // x87 registers, even though MMX is logically only 64-bit. if (Index >= AVXHigh0Index || ((Index >= MM0Index && Index <= MM7Index))) { return OpSize::i128Bit; } else { return OpSize::i8Bit; } } struct { uint64_t Cached; uint64_t Written; // Indicates that Value contains only the lower 64-bit of the full 80-bit // register. Used for MMX/x87 optimization. 
uint64_t Partial; Ref Value[64]; } RegCache {}; void InvalidateReg(uint8_t Index) { uint64_t Bit = (1ull << (uint64_t)Index); RegCache.Cached &= ~Bit; RegCache.Written &= ~Bit; } Ref LoadRegCache(uint64_t Offset, uint8_t Index, RegClass Class, IR::OpSize Size) { LOGMAN_THROW_A_FMT(Index < 64, "valid index"); uint64_t Bit = (1ull << (uint64_t)Index); if (Size == OpSize::i128Bit && (RegCache.Partial & Bit)) { // We need to load the full register extend if we previously did a partial access. Ref Value = RegCache.Value[Index]; Ref Full = _LoadContext(Size, Class, Offset); // If we did a partial store, we're inserting into the full register if (RegCache.Written & Bit) { Full = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 0, 0, Full, Value); } RegCache.Value[Index] = Full; } if (!(RegCache.Cached & Bit)) { if (Index == DFIndex) { RegCache.Value[Index] = _LoadDF(); } else if ((Index >= MM0Index && Index <= MM7Index) || Index >= AVXHigh0Index) { RegCache.Value[Index] = _LoadContext(Size, Class, Offset); // We may have done a partial load, this requires special handling. 
if (Size == OpSize::i64Bit) { RegCache.Partial |= Bit; } } else if (Index == PFIndex) { RegCache.Value[Index] = _LoadPF(Size); } else if (Index == AFIndex) { RegCache.Value[Index] = _LoadAF(Size); } else { RegCache.Value[Index] = _LoadRegister(Offset, Class, Size); } RegCache.Cached |= Bit; } return RegCache.Value[Index]; } RefPair AllocatePair(RegClass Class, IR::OpSize Size) { if (Class == RegClass::FPR) { return {_AllocateFPR(Size, Size), _AllocateFPR(Size, Size)}; } else { return {_AllocateGPR(false), _AllocateGPR(false)}; } } RefPair LoadContextPair_Uncached(RegClass Class, IR::OpSize Size, unsigned Offset) { RefPair Values = AllocatePair(Class, Size); _LoadContextPair(Size, Class, Offset, Values.Low, Values.High); return Values; } RefPair LoadRegCachePair(uint64_t Offset, uint8_t Index, RegClass Class, IR::OpSize Size) { LOGMAN_THROW_A_FMT(Index != DFIndex, "must be pairable"); LOGMAN_THROW_A_FMT(Size != IR::OpSize::iUnsized, "Invalid size!"); // Try to load a pair into the cache uint64_t Bits = (3ull << (uint64_t)Index); const auto SizeInt = IR::OpSizeToSize(Size); if (((RegCache.Partial | RegCache.Cached) & Bits) == 0 && ((Offset / SizeInt) < 64)) { auto Values = LoadContextPair_Uncached(Class, Size, Offset); RegCache.Value[Index] = Values.Low; RegCache.Value[Index + 1] = Values.High; RegCache.Cached |= Bits; if (Size == OpSize::i64Bit) { RegCache.Partial |= Bits; } return Values; } // Fallback on a pair of loads return { .Low = LoadRegCache(Offset, Index, Class, Size), .High = LoadRegCache(Offset + SizeInt, Index + 1, Class, Size), }; } Ref LoadGPR(uint8_t Reg) { return LoadRegCache(Reg, GPR0Index + Reg, RegClass::GPR, GetGPROpSize()); } Ref LoadContext(IR::OpSize Size, uint8_t Index) { return LoadRegCache(CacheIndexToContextOffset(Index), Index, CacheIndexClass(Index), Size); } RefPair LoadContextPair(IR::OpSize Size, uint8_t Index) { return LoadRegCachePair(CacheIndexToContextOffset(Index), Index, CacheIndexClass(Index), Size); } Ref LoadContext(uint8_t 
Index) { return LoadContext(CacheIndexToOpSize(Index), Index); } Ref LoadXMMRegister(uint8_t Reg) { return LoadRegCache(Reg, FPR0Index + Reg, RegClass::FPR, GetGuestVectorLength()); } Ref LoadDF() { return LoadGPR(DFIndex); } void StoreContext(uint8_t Index, Ref Value) { LOGMAN_THROW_A_FMT(Index < 64, "valid index"); LOGMAN_THROW_A_FMT(Value != InvalidNode, "storing valid"); uint64_t Bit = (1ull << (uint64_t)Index); RegCache.Value[Index] = Value; RegCache.Cached |= Bit; RegCache.Written |= Bit; } void InvalidateHighAVXRegisters() { for (size_t i = 0; i < 16; ++i) { InvalidateReg(AVXHigh0Index + i); } } void StoreRegister(uint8_t Reg, bool FPR, Ref Value) { StoreContext(Reg + (FPR ? FPR0Index : GPR0Index), Value); } void StoreDF(Ref Value) { StoreContext(DFIndex, Value); } Ref GetRFLAG(unsigned BitOffset, bool Invert = false) { if (IsNZCV(BitOffset)) { // Handle the CFInverted state internally so GetRFLAG is safe regardless // of the invert state. This simplifies the call sites. if (BitOffset == X86State::RFLAG_CF_RAW_LOC) { Invert ^= CFInverted; } if (NZCVDirty) { auto Value = _Bfe(OpSize::i32Bit, 1, IndexNZCV(BitOffset), GetNZCV()); if (Invert) { return _Xor(OpSize::i32Bit, Value, Constant(1)); } else { return Value; } } else { // Because we explicitly inverted for CF above, we use the unsafe // _NZCVSelect rather than the safe CF-aware version. return _NZCVSelect01(CondForNZCVBit(BitOffset, Invert)); } } else if (BitOffset == FEXCore::X86State::RFLAG_PF_RAW_LOC) { return LoadGPR(Core::CPUState::PF_AS_GREG); } else if (BitOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) { return LoadGPR(Core::CPUState::AF_AS_GREG); } else if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) { // Recover the sign bit, it is the logical DF value return _Lshr(OpSize::i64Bit, LoadDF(), Constant(63)); } else { return _LoadContextGPR(OpSize::i8Bit, ARRAY_OFFSETOF(Core::CPUState, flags, BitOffset)); } } // Returns (DF ? 
-Size : Size) Ref LoadDir(const unsigned Size) { return ARef(LoadDF()).Lshl(FEXCore::ilog2(Size)).Ref(); } // Returns DF ? (X - Size) : (X + Size) Ref OffsetByDir(Ref X, const unsigned Size) { auto Shift = FEXCore::ilog2(Size); return _AddShift(OpSize::i64Bit, X, LoadDF(), ShiftType::LSL, Shift); } // Safe version of NZCVSelect that handles inverted carries automatically. Ref NZCVSelect(OpSize OpSize, CondClass Cond, Ref TrueV, Ref FalseV, bool CarryIsInverted = false) { switch (Cond) { case CondClass::UGE: /* cs */ case CondClass::ULT: /* cc */ // Invert the condition to match our expectations. if (CarryIsInverted != CFInverted) { Cond = (Cond == CondClass::UGE) ? CondClass::ULT : CondClass::UGE; } break; case CondClass::UGT: /* hi */ case CondClass::ULE: /* ls */ // No clever optimization we can do here, rectify carry itself. RectifyCarryInvert(CarryIsInverted); break; default: // No other condition codes read carry so no need to rectify. break; } return _NZCVSelect(OpSize, Cond, TrueV, FalseV); } // Compares two floats and sets flags for a COMISS instruction void Comiss(IR::OpSize ElementSize, Ref Src1, Ref Src2, bool InvalidateAF = false) { // First, set flags according to Arm FCMP. HandleNZCVWrite(); _FCmp(ElementSize, Src1, Src2); CFInverted = false; ComissFlags(InvalidateAF); } // Sets flags for a COMISS instruction void ComissFlags(bool InvalidateAF = false) { LOGMAN_THROW_A_FMT(!NZCVDirty, "only expected after fcmp"); // We need to set PF according to the unordered flag. We'd rather do this // after axflag, since some impls fuse fcmp+axflag, so we want to do this // after. We can recover "unordered" after axflag as (Z && !C), but // there's no condition code for this so it would take 2 instructions // instead of one, which seems worse than doing 1 op before and breaking // the fusion. // // We set PF to unordered (V), but our PF representation is inverted so we // actually set to !V. This is one instruction with the VC cond code. 
Ref V_inv = GetRFLAG(FEXCore::X86State::RFLAG_OF_RAW_LOC, true); SetRFLAG(V_inv); if (!InvalidateAF) { // Zero AF. Note that the comparison sets the raw PF to 0/1 above, so // PF[4] is 0 so the XOR with PF will have no effect, so setting the AF // byte to zero will indeed zero AF as intended. SetRFLAG(Constant(0)); } // Convert NZCV from the Arm representation to an eXternal representation // that's totally not a euphemism for x86, nuh-uh. But maps to exactly we // need, what a coincidence! // // Our AXFlag emulation on FlagM2-less systems needs V_inv passed. _AXFlag(CTX->HostFeatures.SupportsFlagM2 ? Invalid() : V_inv); CFInverted = true; } // Set x87 comparison flags based on the result set by Arm FCMP. Clobbers // NZCV on flagm2 platforms. void ConvertNZCVToX87() { LOGMAN_THROW_A_FMT(NZCVDirty && CachedNZCV, "NZCV must be saved"); Ref V = _NZCVSelect01(CondForNZCVBit(FEXCore::X86State::RFLAG_OF_RAW_LOC, false)); if (CTX->HostFeatures.SupportsFlagM2) { // Convert to x86 flags, saves us from or'ing after. _AXFlag(Invalid()); } // CF is inverted after FCMP Ref C = _NZCVSelect01(CondForNZCVBit(FEXCore::X86State::RFLAG_CF_RAW_LOC, true)); Ref Z = _NZCVSelect01(CondForNZCVBit(FEXCore::X86State::RFLAG_ZF_RAW_LOC, false)); if (!CTX->HostFeatures.SupportsFlagM2) { C = _Or(OpSize::i32Bit, C, V); Z = _Or(OpSize::i32Bit, Z, V); } SetRFLAG(C); SetRFLAG(Constant(0)); SetRFLAG(V); SetRFLAG(Z); } // Helper to store a variable shift and calculate its flags for a variable // shift, with correct PF handling. void HandleShift(X86Tables::DecodedOp Op, Ref Result, Ref Dest, ShiftType Shift, Ref Src) { auto OldPF = GetRFLAG(X86State::RFLAG_PF_RAW_LOC); HandleNZCV_RMW(); CalculatePF(_ShiftFlags(OpSizeFromSrc(Op), Result, Dest, Shift, Src, OldPF, CFInverted)); StoreResultGPR(Op, Result); } // Helper to derive Dest by a given builder-using Expression with the opcode // replaced with NewOp. Useful for generic building code. Not safe in general. 
// but does the right handling of ImplicitFlagClobber at least and must be // used instead of raw Op mutation. #define DeriveOp(Dest, NewOp, Expr) \ if (ImplicitFlagClobber(NewOp)) SaveNZCV(NewOp); \ auto Dest = (Expr); \ Dest.first->Header.Op = (NewOp) // Named constant cache for the current block. // Different arrays for sizes 1,2,4,8,16,32. Ref CachedNamedVectorConstants[FEXCore::IR::NamedVectorConstant::NAMED_VECTOR_MAX][6] {}; struct IndexNamedVectorMapKey { uint32_t Index {}; FEXCore::IR::IndexNamedVectorConstant NamedIndexedConstant; uint8_t log2_size_in_bytes {}; uint16_t _pad {}; bool operator==(const IndexNamedVectorMapKey&) const = default; }; struct IndexNamedVectorMapKeyHasher { std::size_t operator()(const IndexNamedVectorMapKey& k) const noexcept { return XXH3_64bits(&k, sizeof(k)); } }; fextl::unordered_map CachedIndexedNamedVectorConstants; // Load and cache a named vector constant. Ref LoadAndCacheNamedVectorConstant(IR::OpSize Size, FEXCore::IR::NamedVectorConstant NamedConstant) { auto log2_size_bytes = FEXCore::ilog2(IR::OpSizeToSize(Size)); if (CachedNamedVectorConstants[NamedConstant][log2_size_bytes]) { return CachedNamedVectorConstants[NamedConstant][log2_size_bytes]; } auto K = _LoadNamedVectorConstant(Size, NamedConstant); CachedNamedVectorConstants[NamedConstant][log2_size_bytes] = K; return K; } Ref LoadAndCacheIndexedNamedVectorConstant(IR::OpSize Size, FEXCore::IR::IndexNamedVectorConstant NamedIndexedConstant, uint32_t Index) { IndexNamedVectorMapKey Key { .Index = Index, .NamedIndexedConstant = NamedIndexedConstant, .log2_size_in_bytes = FEXCore::ilog2(IR::OpSizeToSize(Size)), }; auto it = CachedIndexedNamedVectorConstants.find(Key); if (it != CachedIndexedNamedVectorConstants.end()) { return it->second; } auto K = _LoadNamedVectorIndexedConstant(Size, NamedIndexedConstant, Index); CachedIndexedNamedVectorConstants.insert_or_assign(Key, K); return K; } Ref LoadUncachedZeroVector(IR::OpSize Size) { return 
_LoadNamedVectorConstant(Size, IR::NamedVectorConstant::NAMED_VECTOR_ZERO); } Ref LoadZeroVector(IR::OpSize Size) { return LoadAndCacheNamedVectorConstant(Size, IR::NamedVectorConstant::NAMED_VECTOR_ZERO); } // Reset the named vector constants cache array. // These are only cached per block. void ClearCachedNamedConstants() { memset(CachedNamedVectorConstants, 0, sizeof(CachedNamedVectorConstants)); CachedIndexedNamedVectorConstants.clear(); } std::optional DecodeNZCVCondition(uint8_t OP); Ref SelectCC0All1(uint8_t OP); /** * @brief Flushes NZCV. Mostly vestigial. */ void CalculateDeferredFlags(); void ZeroShiftResult(FEXCore::X86Tables::DecodedOp Op) { // In the case of zero-rotate, we need to store the destination still to deal with 32-bit semantics. const auto Size = OpSizeFromSrc(Op); if (Size != OpSize::i32Bit) { return; } auto Dest = LoadSourceGPR(Op, Op->Dest, Op->Flags); StoreResultGPR(Op, Dest); } using ZeroShiftFunctionPtr = void (OpDispatchBuilder::*)(FEXCore::X86Tables::DecodedOp Op); template void Calculate_ShiftVariable(FEXCore::X86Tables::DecodedOp Op, Ref Shift, F&& Calculate, std::optional ZeroShiftResult = std::nullopt) { // RCR can call this with constants, so handle that without branching. uint64_t Const; if (IsValueConstant(WrapNode(Shift), &Const)) { if (Const) { Calculate(); } else if (ZeroShiftResult) { (this->*(*ZeroShiftResult))(Op); } return; } // Otherwise, prepare to branch. auto Zero = Constant(0); // If the shift is zero, do not touch the flags. auto SetBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); IRPair NextBlock = SetBlock; IRPair ZeroShiftBlock; if (ZeroShiftResult) { ZeroShiftBlock = CreateNewCodeBlockAfter(NextBlock); NextBlock = ZeroShiftBlock; } auto EndBlock = CreateNewCodeBlockAfter(NextBlock); ///< Jump to zeroshift block or end block depending on if it was provided. IRPair TailHandling = ZeroShiftResult ? 
ZeroShiftBlock : EndBlock; CondJump(Shift, Zero, TailHandling, SetBlock, CondClass::EQ); SetCurrentCodeBlock(SetBlock); StartNewBlock(); { Calculate(); Jump(EndBlock); } if (ZeroShiftResult) { SetCurrentCodeBlock(ZeroShiftBlock); StartNewBlock(); { (this->*(*ZeroShiftResult))(Op); Jump(EndBlock); } } SetCurrentCodeBlock(EndBlock); StartNewBlock(); } /** * @name These functions are used by the deferred flag handling while it is calculating and storing flags in to RFLAGs. * @{ */ Ref LoadPFRaw(bool Mask, bool Invert); Ref LoadAF(); void FixupAF(); void SetAFAndFixup(Ref AF); Ref CalculateAFForDecimal(Ref A); void CalculatePF(Ref Res); void CalculateAF(Ref Src1, Ref Src2); Ref IncrementByCarry(OpSize OpSize, Ref Src); void CalculateOF(IR::OpSize SrcSize, Ref Res, Ref Src1, Ref Src2, bool Sub); Ref CalculateFlags_ADC(IR::OpSize SrcSize, Ref Src1, Ref Src2); Ref CalculateFlags_SBB(IR::OpSize SrcSize, Ref Src1, Ref Src2); Ref CalculateFlags_SUB(IR::OpSize SrcSize, Ref Src1, Ref Src2, bool UpdateCF = true); Ref CalculateFlags_ADD(IR::OpSize SrcSize, Ref Src1, Ref Src2, bool UpdateCF = true); void CalculateFlags_MUL(IR::OpSize SrcSize, Ref Res, Ref High); void CalculateFlags_UMUL(Ref High); void CalculateFlags_Logical(IR::OpSize SrcSize, Ref Res); void CalculateFlags_ShiftLeftImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift); void CalculateFlags_ShiftRightImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift); void CalculateFlags_ShiftRightDoubleImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift); void CalculateFlags_ShiftRightImmediateCommon(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift); void CalculateFlags_SignShiftRightImmediate(IR::OpSize SrcSize, Ref Res, Ref Src1, uint64_t Shift); void CalculateFlags_ZCNT(IR::OpSize SrcSize, Ref Result); /** @} */ Ref GetX87Top(); void SetX87FTW(Ref FTW); Ref GetX87FTW_Helper(); void SetX87Top(Ref Value); void ChgStateX87_MMX() override { LOGMAN_THROW_A_FMT(MMXState == MMXState_X87, 
"Expected state to be x87"); _StackForceSlow(); SetX87Top(Constant(0)); // top reset to zero _StoreContextGPR(OpSize::i8Bit, Constant(0xFFFFUL), offsetof(FEXCore::Core::CPUState, AbridgedFTW)); MMXState = MMXState_MMX; } void ChgStateMMX_X87() override { LOGMAN_THROW_A_FMT(MMXState == MMXState_MMX, "Expected state to be MMX"); // The opcode dispatcher register cache is used for MMX, but the x87 pass register cache is used for x87, spill to // context to ensure coherence. FlushRegisterCache(false, true); // We explicitly initialize to x87 state in StartNewBlock. // So if we ever change this to do something else, we need to // make sure that we consider if we need to explicitly set it there. MMXState = MMXState_X87; } bool DestIsLockedMem(FEXCore::X86Tables::DecodedOp Op) const { return DestIsMem(Op) && (Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK) != 0; } bool DestIsMem(FEXCore::X86Tables::DecodedOp Op) const { return !Op->Dest.IsGPR(); } void CreateJumpBlocks(const fextl::vector* Blocks); bool BlockSetRIP {false}; bool Multiblock {}; bool Is64BitMode {}; uint64_t Entry {}; // Set if mono hacks are enabled and the current block is the mono callsite backpatcher, in which case the // XCHG ops that would patch code are replaced with a hook that performs the write and manually invalidates // the target address. 
bool IsMonoBackpatcherBlock {false};
  IROp_IRHeader* CurrentHeader {};

  /**
   * Decide whether memory accesses of the given register class use the TSO variants of the memory ops.
   *
   * A forced setting (ForceTSO) wins over the context configuration. Otherwise FPR
   * accesses follow CTX->IsVectorAtomicTSOEnabled() and every other class follows
   * CTX->IsAtomicTSOEnabled().
   */
  [[nodiscard]]
  bool IsTSOEnabled(RegClass Class) const {
    if (ForceTSO == ForceTSOMode::ForceEnabled) {
      return true;
    } else if (ForceTSO == ForceTSOMode::ForceDisabled) {
      return false;
    } else if (Class == RegClass::FPR) {
      return CTX->IsVectorAtomicTSOEnabled();
    } else {
      return CTX->IsAtomicTSOEnabled();
    }
  }

  /**
   * Store Value to the plain address Addr, emitting _StoreMemTSO when TSO is enabled
   * for Class and _StoreMem otherwise.
   *
   * No index register is used (Invalid(), MemOffsetType::SXTX, scale 1).
   * Note the argument order: callers pass (Addr, Value), the IR ops take (Value, Addr).
   */
  Ref _StoreMemAutoTSO(RegClass Class, OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) {
    if (IsTSOEnabled(Class)) {
      return _StoreMemTSO(Class, Size, Value, Addr, Invalid(), Align, MemOffsetType::SXTX, 1);
    } else {
      return _StoreMem(Class, Size, Value, Addr, Invalid(), Align, MemOffsetType::SXTX, 1);
    }
  }

  /// GPR-class convenience wrapper around _StoreMemAutoTSO.
  Ref _StoreMemGPRAutoTSO(OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) {
    return _StoreMemAutoTSO(RegClass::GPR, Size, Addr, Value, Align);
  }

  /// FPR-class convenience wrapper around _StoreMemAutoTSO.
  Ref _StoreMemFPRAutoTSO(OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) {
    return _StoreMemAutoTSO(RegClass::FPR, Size, Addr, Value, Align);
  }

  /**
   * Load from the plain address ssa0, emitting _LoadMemTSO when TSO is enabled
   * for Class and _LoadMem otherwise.
   *
   * No index register is used (Invalid(), MemOffsetType::SXTX, scale 1).
   */
  Ref _LoadMemAutoTSO(RegClass Class, OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) {
    if (IsTSOEnabled(Class)) {
      return _LoadMemTSO(Class, Size, ssa0, Invalid(), Align, MemOffsetType::SXTX, 1);
    } else {
      return _LoadMem(Class, Size, ssa0, Invalid(), Align, MemOffsetType::SXTX, 1);
    }
  }

  /// GPR-class convenience wrapper around _LoadMemAutoTSO.
  Ref _LoadMemGPRAutoTSO(OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) {
    return _LoadMemAutoTSO(RegClass::GPR, Size, ssa0, Align);
  }

  /// FPR-class convenience wrapper around _LoadMemAutoTSO.
  Ref _LoadMemFPRAutoTSO(OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) {
    return _LoadMemAutoTSO(RegClass::FPR, Size, ssa0, Align);
  }

  /**
   * Load through a decoded AddressMode.
   *
   * The access is treated as TSO only when TSO is enabled for Class and the address
   * mode is not flagged NonTSO. SelectAddressMode() turns A into the base/index form
   * that is handed to the load op.
   */
  Ref _LoadMemAutoTSO(RegClass Class, OpSize Size, const AddressMode& A, OpSize Align = OpSize::i8Bit) {
    const bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
    const auto B = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, AtomicTSO, Class != RegClass::GPR, Size);

    if (AtomicTSO) {
      return _LoadMemTSO(Class, Size, B.Base, B.Index, Align, B.IndexType, B.IndexScale);
    } else {
      return
_LoadMem(Class, Size, B.Base, B.Index, Align, B.IndexType, B.IndexScale); } } Ref _LoadMemGPRAutoTSO(OpSize Size, const AddressMode& A, OpSize Align = OpSize::i8Bit) { return _LoadMemAutoTSO(RegClass::GPR, Size, A, Align); } Ref _LoadMemFPRAutoTSO(OpSize Size, const AddressMode& A, OpSize Align = OpSize::i8Bit) { return _LoadMemAutoTSO(RegClass::FPR, Size, A, Align); } AddressMode SelectPairAddressMode(AddressMode A, IR::OpSize Size) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iUnsized, "Invalid size!"); const auto SizeInt = IR::OpSizeToSize(Size); AddressMode Out {}; signed OffsetEl = A.Offset / SizeInt; if ((A.Offset % SizeInt) == 0 && OffsetEl >= -64 && OffsetEl < 64) { Out.Offset = A.Offset; A.Offset = 0; } Out.Base = LoadEffectiveAddress(this, A, GetGPROpSize(), true, false); return Out; } RefPair LoadMemPair(RegClass Class, OpSize Size, Ref Base, uint32_t Offset) { RefPair Values = AllocatePair(Class, Size); _LoadMemPair(Class, Size, Base, Offset, Values.Low, Values.High); return Values; } RefPair LoadMemPairFPR(OpSize Size, Ref Base, uint32_t Offset) { return LoadMemPair(RegClass::FPR, Size, Base, Offset); } RefPair _LoadMemPairAutoTSO(RegClass Class, OpSize Size, const AddressMode& A, OpSize Align = OpSize::i8Bit) { const bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO; // Use ldp if possible, otherwise fallback on two loads. 
if (!AtomicTSO && !A.Segment && Size >= OpSize::i32Bit && Size <= OpSize::i128Bit) { const auto B = SelectPairAddressMode(A, Size); return LoadMemPair(Class, Size, B.Base, B.Offset); } AddressMode HighA = A; HighA.Offset += 16; return { .Low = _LoadMemAutoTSO(Class, Size, A, Align), .High = _LoadMemAutoTSO(Class, Size, HighA, Align), }; } RefPair _LoadMemPairFPRAutoTSO(OpSize Size, const AddressMode& A, OpSize Align = OpSize::i8Bit) { return _LoadMemPairAutoTSO(RegClass::FPR, Size, A, Align); } Ref _StoreMemAutoTSO(RegClass Class, OpSize Size, const AddressMode& A, Ref Value, OpSize Align = OpSize::i8Bit) { const bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO; const auto B = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, AtomicTSO, Class != RegClass::GPR, Size); if (AtomicTSO) { return _StoreMemTSO(Class, Size, Value, B.Base, B.Index, Align, B.IndexType, B.IndexScale); } else { return _StoreMem(Class, Size, Value, B.Base, B.Index, Align, B.IndexType, B.IndexScale); } } Ref _StoreMemGPRAutoTSO(OpSize Size, const AddressMode& A, Ref Value, OpSize Align = OpSize::i8Bit) { return _StoreMemAutoTSO(RegClass::GPR, Size, A, Value, Align); } Ref _StoreMemFPRAutoTSO(OpSize Size, const AddressMode& A, Ref Value, OpSize Align = OpSize::i8Bit) { return _StoreMemAutoTSO(RegClass::FPR, Size, A, Value, Align); } void _StoreMemPairAutoTSO(RegClass Class, OpSize Size, const AddressMode& A, Ref Value1, Ref Value2, OpSize Align = OpSize::i8Bit) { const auto SizeInt = IR::OpSizeToSize(Size); const bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO; // Use stp if possible, otherwise fallback on two stores. 
if (!AtomicTSO && !A.Segment && Size >= OpSize::i32Bit && Size <= OpSize::i128Bit) {
      const auto B = SelectPairAddressMode(A, Size);
      _StoreMemPair(Class, Size, Value1, Value2, B.Base, B.Offset);
    } else {
      // Two single stores; the second element sits SizeInt bytes after the first.
      // NOTE(review): the Align parameter is not forwarded here (OpSize::i8Bit is passed
      // explicitly) - confirm that this is intentional.
      auto B = A;
      _StoreMemAutoTSO(Class, Size, B, Value1, OpSize::i8Bit);
      B.Offset += SizeInt;
      _StoreMemAutoTSO(Class, Size, B, Value2, OpSize::i8Bit);
    }
  }

  /// FPR-class convenience wrapper around _StoreMemPairAutoTSO.
  void _StoreMemPairFPRAutoTSO(OpSize Size, const AddressMode& A, Ref Value1, Ref Value2, OpSize Align = OpSize::i8Bit) {
    return _StoreMemPairAutoTSO(RegClass::FPR, Size, A, Value1, Value2, Align);
  }

  /**
   * Emit a _Pop of Size bytes through a caller-provided stack pointer handle.
   *
   * The popped value goes into a freshly allocated GPR which is returned. This overload
   * does not write the guest RSP register itself.
   */
  Ref Pop(IR::OpSize Size, Ref SP_RMW) {
    Ref Value = _AllocateGPR(false);
    _Pop(Size, SP_RMW, Value);
    return Value;
  }

  /**
   * Emit a _Pop of Size bytes from the guest stack.
   *
   * Loads RSP, wraps it in an _RMWHandle, pops into a freshly allocated GPR and
   * stores the handle back to RSP.
   */
  Ref Pop(IR::OpSize Size) {
    Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP));
    Ref Value = _AllocateGPR(false);

    _Pop(Size, SP, Value);

    // Store the new stack pointer
    StoreGPRRegister(X86State::REG_RSP, SP);
    return Value;
  }

  /**
   * Return Value unchanged when the operand is a direct or indirect GPR operand or an MMX
   * register (GPR index at or above REG_MM_0); otherwise return _VMov(Size, Value).
   */
  Ref VZeroExtendOperand(OpSize Size, X86Tables::DecodedOperand Op, Ref Value) {
    bool IsMMX = Op.IsGPR() && Op.Data.GPR.GPR >= X86State::REG_MM_0;
    bool AlreadyExtended = Op.IsGPRDirect() || Op.IsGPRIndirect() || IsMMX;
    return AlreadyExtended ? Value : _VMov(Size, Value);
  }

  /**
   * Emit a _Push of Value (Size bytes) onto the guest stack, write the resulting stack
   * pointer back to RSP and flush the register cache.
   */
  void Push(IR::OpSize Size, Ref Value) {
    auto OldSP = LoadGPRRegister(X86State::REG_RSP);
    auto NewSP = _Push(GetGPROpSize(), Size, Value, OldSP);
    StoreGPRRegister(X86State::REG_RSP, NewSP);
    FlushRegisterCache();
  }

  /**
   * A value that is either an IR node (R) or a 64-bit constant (C), tagged by IsConstant.
   *
   * Each operation folds directly on C when the value is constant and otherwise emits the
   * matching IR op through the emitter E.
   */
  struct ArithRef {
    IREmitter* E {};
    bool IsConstant {};

    // Only the member selected by IsConstant is meaningful.
    union {
      Ref R {};
      uint64_t C;
    };

    ArithRef() {}

    ArithRef(IREmitter* IREmit, Ref Reference)
      : E(IREmit)
      , IsConstant(false)
      , R(Reference) {}

    ArithRef(IREmitter* IREmit, uint64_t K)
      : E(IREmit)
      , IsConstant(true)
      , C(K) {}

    /// Negation: -C, or a 64-bit _Neg of R.
    ArithRef Neg() {
      return IsConstant ? ArithRef(E, -C) : ArithRef(E, E->_Neg(OpSize::i64Bit, R));
    }

    /// Bitwise AND with the constant K.
    ArithRef And(uint64_t K) {
      return IsConstant ? ArithRef(E, C & K) : ArithRef(E, E->_And(OpSize::i64Bit, R, E->Constant(K)));
    }

    /// Reverse subtraction: K minus this value.
    ArithRef Presub(uint64_t K) {
      return IsConstant ?
ArithRef(E, K - C) : ArithRef(E, E->Sub(OpSize::i64Bit, E->Constant(K), R)); } ArithRef Lshl(uint64_t Shift) { if (Shift == 0) { return *this; } else if (IsConstant) { return ArithRef(E, C << Shift); } else { return ArithRef(E, E->_Lshl(OpSize::i64Bit, R, E->Constant(Shift))); } } ArithRef Bfe(unsigned Start, unsigned Size) { if (IsConstant) { return ArithRef(E, (C >> Start) & ((1ull << Size) - 1)); } else { return ArithRef(E, E->_Bfe(OpSize::i64Bit, Size, Start, R)); } } ArithRef Sbfe(unsigned Start, unsigned Size) { if (IsConstant) { uint64_t SourceMask = Size == 64 ? ~0ULL : ((1ULL << Size) - 1); SourceMask <<= Start; int64_t NewConstant = (C & SourceMask) >> Start; NewConstant <<= 64 - Size; NewConstant >>= 64 - Size; return ArithRef(E, NewConstant); } else { return ArithRef(E, E->_Sbfe(OpSize::i64Bit, Size, Start, R)); } } Ref BfiInto(Ref Bitfield, unsigned Start, unsigned Size) { if (IsConstant && (Size > 0 && Size < 64)) { uint64_t SourceMask = (1ULL << Size) - 1; uint64_t SourceMaskShifted = SourceMask << Start; if (C == 0) { return E->_And(OpSize::i64Bit, Bitfield, E->_InlineConstant(~SourceMaskShifted)); } else if (C == SourceMask) { return E->_Or(OpSize::i64Bit, Bitfield, E->_InlineConstant(SourceMaskShifted)); } } if (IsConstant) { return E->_Bfi(OpSize::i64Bit, Size, Start, Bitfield, E->Constant(C)); } else { return E->_Bfi(OpSize::i64Bit, Size, Start, Bitfield, R); } } ArithRef MaskBit(OpSize Size) { if (IsConstant) { uint64_t ShiftMask = Size == OpSize::i64Bit ? 63 : 31; uint64_t Result = 1ull << (C & ShiftMask); if (ShiftMask == 31) { Result &= ((1ull << 32) - 1); } return ArithRef(E, Result); } else { return ArithRef(E, E->_Lshl(Size, E->Constant(1), R)); } } Ref Ref() { return IsConstant ? 
E->Constant(C) : R; } bool IsDefinitelyZero() const { return IsConstant && C == 0; } }; ArithRef ARef(Ref R) { uint64_t C; if (IsValueConstant(WrapNode(R), &C)) { return ARef(C); } else { return ArithRef(this, R); } } ArithRef ARef(uint64_t K) { return ArithRef(this, K); } ///< Segment telemetry tracking uint32_t SegmentsNeedReadCheck {~0U}; void CheckLegacySegmentWrite(Ref NewNode, uint32_t SegmentReg); void CheckLegacySegmentRead(Ref NewNode, uint32_t SegmentReg); }; constexpr inline void InstallToTable(auto& FinalTable, const auto& LocalTable) { for (const auto& Op : LocalTable) { auto OpNum = Op.Op; auto Dispatcher = Op.Ptr; for (uint8_t i = 0; i < Op.Count; ++i) { auto& TableOp = FinalTable[OpNum + i]; #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED if (TableOp.OpcodeDispatcher.OpDispatch) { ERROR_AND_DIE_FMT("Duplicate Entry {}", TableOp.Name); } #endif TableOp.OpcodeDispatcher.OpDispatch = Dispatcher; } } } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/Core/VSyscall/VSyscall.inc ================================================ // SPDX-License-Identifier: MIT // This is the vsyscall page for x86_64 guest code // This was compiled with nasm with the following source then exported to binary //BITS 64; // //align 4096, db 0xcc // ; __NR_gettimeofday // mov rax, 96 // syscall // ret // //align 1024, db 0xcc // ; __NR_time // mov rax, 201 // syscall // ret // //align 1024, db 0xcc // ; __NR_getcpu // mov rax, 309 // syscall // ret // //align 4096, db 0xcc // We only want one of these pages per FEX process // One page const static uint8_t VSyscallData[0x1000] = { 0xB8, 0x60, 0x00, 0x00, 0x00, 0x0F, 0x05, 0xC3, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xB8, 0xC9, 0x00, 0x00, 0x00, 0x0F, 0x05, 0xC3, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xB8, 0x35, 0x01, 0x00, 0x00, 0x0F, 0x05, 
0xC3, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 
0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC }; ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/BaseTables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/BaseTables.h" #include #include namespace FEXCore::X86Tables { using namespace InstFlags; enum Primary_LUT { ENTRY_06, ENTRY_07, ENTRY_0E, ENTRY_16, ENTRY_17, ENTRY_1E, ENTRY_1F, ENTRY_27, ENTRY_2F, ENTRY_37, ENTRY_3F, ENTRY_40, ENTRY_48, ENTRY_60, ENTRY_61, ENTRY_63, ENTRY_9A, ENTRY_A0, ENTRY_A1, ENTRY_A2, ENTRY_A3, ENTRY_CE, ENTRY_D4, ENTRY_D5, ENTRY_D6, ENTRY_EA, ENTRY_MAX, }; constexpr std::array Primary_ArchSelect_LUT = {{ // ENTRY_06 { {"PUSH ES", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_07 { {"POP ES", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_DEF) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_0E { {"PUSH CS", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_16 { {"PUSH SS", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | 
FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_17 { {"POP SS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_DEF) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_1E { {"PUSH DS", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_1F { {"POP DS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_DEF) | FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX> } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_27 { {"DAA", TYPE_INST, GenFlagsDstSize(SIZE_8BIT) | FLAGS_SF_DST_RAX, 0, { .OpDispatch = &IR::OpDispatchBuilder::DAAOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_2F { {"DAS", TYPE_INST, GenFlagsDstSize(SIZE_8BIT) | FLAGS_SF_DST_RAX, 0, { .OpDispatch = &IR::OpDispatchBuilder::DASOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_37 { {"AAA", TYPE_INST, GenFlagsDstSize(SIZE_16BIT) | FLAGS_SF_DST_RAX, 0, { .OpDispatch = &IR::OpDispatchBuilder::AAAOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_3F { {"AAS", TYPE_INST, GenFlagsDstSize(SIZE_16BIT) | FLAGS_SF_DST_RAX, 0, { .OpDispatch = &IR::OpDispatchBuilder::AASOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_40 { {"INC", TYPE_INST, FLAGS_SF_REX_IN_BYTE, 0, { 
.OpDispatch = &IR::OpDispatchBuilder::INCOp } }, // REX {"", TYPE_REX_PREFIX, FLAGS_NONE, 0}, }, // ENTRY_48 /* Every group in Primary_ArchSelect_LUT is a pair of alternative decodings for one primary opcode byte; BaseOps reaches a group through `.Indirect = Primary_ArchSelect_LUT[ENTRY_xx]` on a TYPE_ARCH_DISPATCHER row, using the Primary_LUT enumerators as indices. NOTE(review): the pair order looks like {32-bit decoding, 64-bit decoding} (INC/DEC vs. REX prefix, ARPL vs. MOVSXD) - confirm against the code that consumes TYPE_ARCH_DISPATCHER. */ { {"DEC", TYPE_INST, FLAGS_SF_REX_IN_BYTE, 0, { .OpDispatch = &IR::OpDispatchBuilder::DECOp } }, {"", TYPE_REX_PREFIX, FLAGS_NONE, 0}, }, // ENTRY_60 { {"PUSHA", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::PUSHAOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_61 { {"POPA", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS, 0, { .OpDispatch = &IR::OpDispatchBuilder::POPAOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_63 /* The only group in this table whose two elements are different named instructions with different handlers. */ { {"ARPL", TYPE_INST, GenFlagsSameSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0, { .OpDispatch = &IR::OpDispatchBuilder::ARPLOp } }, {"MOVSXD", TYPE_INST, GenFlagsDstSize(SIZE_64BIT) | FLAGS_MODRM, 0, { .OpDispatch = &IR::OpDispatchBuilder::MOVSXDOp } }, }, // ENTRY_9A /* Named but has no handler (OpDispatch is nullptr) in the first element. */ { {"CALLF", TYPE_INST, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_A0 /* ENTRY_A0..ENTRY_A3: both elements use the same MOVOffsetOp handler and flags; they differ only in the fourth field (4 vs. 8). NOTE(review): presumably the byte width of the FLAGS_MEM_OFFSET operand - confirm. */ { {"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX | FLAGS_MEM_OFFSET, 4, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, {"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX | FLAGS_MEM_OFFSET, 8, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, }, // ENTRY_A1 { {"MOV", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_MEM_OFFSET, 4, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, {"MOV", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_MEM_OFFSET, 8, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, }, // ENTRY_A2 { {"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_SRC_RAX | FLAGS_MEM_OFFSET, 4, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, {"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_SRC_RAX | FLAGS_MEM_OFFSET, 8, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, }, // ENTRY_A3 { {"MOV", TYPE_INST, FLAGS_SF_SRC_RAX | FLAGS_MEM_OFFSET, 4, { .OpDispatch = 
&IR::OpDispatchBuilder::MOVOffsetOp } }, {"MOV", TYPE_INST, FLAGS_SF_SRC_RAX | FLAGS_MEM_OFFSET, 8, { .OpDispatch = &IR::OpDispatchBuilder::MOVOffsetOp } }, }, // ENTRY_CE { {"INTO", TYPE_INST, FLAGS_NONE, 0, { .OpDispatch = &IR::OpDispatchBuilder::INTOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_D4 { {"AAM", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX, 1, { .OpDispatch = &IR::OpDispatchBuilder::AAMOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_D5 { {"AAD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX, 1, { .OpDispatch = &IR::OpDispatchBuilder::AADOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_D6 { {"SALC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX | FLAGS_SF_SRC_RAX, 0, { .OpDispatch = &IR::OpDispatchBuilder::SALCOp } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, // ENTRY_EA { {"JMPF", TYPE_INST, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, }}; /* BaseOps: decode table for the primary (single-byte) opcodes. The consteval lambda declares the BaseOpTable row list, expands it into Table with GenerateTable(Table.data(), BaseOpTable, std::size(BaseOpTable)), then calls IR::InstallToTable(Table, IR::OpDispatch_BaseOpTable) and returns Table as the initializer. Rows of TYPE_ARCH_DISPATCHER carry no instruction themselves; their .Indirect member refers to one group of Primary_ArchSelect_LUT. NOTE(review): the second field of each row (1, 2 or 8) looks like the number of consecutive opcode values the row covers (e.g. {0x50, 8} for PUSH) - confirm in GenerateTable. NOTE(review): InstallToTable presumably attaches the OpDispatch handlers - confirm. */ const std::array BaseOps = []() consteval { std::array Table{}; constexpr U8U8InfoStruct BaseOpTable[] = { // Prefixes // Operand size override {0x66, 1, X86InstInfo{"", TYPE_PREFIX, FLAGS_NONE, 0}}, // Address size override {0x67, 1, X86InstInfo{"", TYPE_PREFIX, FLAGS_NONE, 0}}, {0x26, 1, X86InstInfo{"ES", TYPE_LEGACY_PREFIX, FLAGS_NONE, 0}}, {0x2E, 1, X86InstInfo{"CS", TYPE_LEGACY_PREFIX, FLAGS_NONE, 0}}, {0x36, 1, X86InstInfo{"SS", TYPE_LEGACY_PREFIX, FLAGS_NONE, 0}}, {0x3E, 1, X86InstInfo{"DS", TYPE_LEGACY_PREFIX, FLAGS_NONE, 0}}, /* These are still invalid on 64bit. NOTE(review): this remark sits above FS/GS, which are typed TYPE_PREFIX (not TYPE_LEGACY_PREFIX like ES/CS/SS/DS above) and whose overrides remain architecturally meaningful in 64-bit mode; the comment likely meant "valid" - confirm. */ {0x64, 1, X86InstInfo{"FS", TYPE_PREFIX, FLAGS_NONE, 0}}, {0x65, 1, X86InstInfo{"GS", TYPE_PREFIX, FLAGS_NONE, 0}}, {0xF0, 1, X86InstInfo{"LOCK", TYPE_PREFIX, FLAGS_NONE, 0}}, {0xF2, 1, X86InstInfo{"REPNE", TYPE_PREFIX, FLAGS_NONE, 0}}, {0xF3, 1, X86InstInfo{"REP", TYPE_PREFIX, FLAGS_NONE, 0}}, // Instructions {0x00, 1, 
X86InstInfo{"ADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x01, 1, X86InstInfo{"ADD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_DISPLACE_SIZE_DIV_2, 0}}, {0x02, 1, X86InstInfo{"ADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x03, 1, X86InstInfo{"ADD", TYPE_INST, FLAGS_MODRM, 0}}, {0x04, 1, X86InstInfo{"ADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x05, 1, X86InstInfo{"ADD", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x06, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_06] }}}, {0x07, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_07] }}}, {0x08, 1, X86InstInfo{"OR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x09, 1, X86InstInfo{"OR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x0A, 1, X86InstInfo{"OR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x0B, 1, X86InstInfo{"OR", TYPE_INST, FLAGS_MODRM, 0}}, {0x0C, 1, X86InstInfo{"OR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x0D, 1, X86InstInfo{"OR", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x0E, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_0E] }}}, {0x10, 1, X86InstInfo{"ADC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x11, 1, X86InstInfo{"ADC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_DISPLACE_SIZE_DIV_2, 0}}, {0x12, 1, X86InstInfo{"ADC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x13, 1, X86InstInfo{"ADC", TYPE_INST, FLAGS_MODRM, 0}}, {0x14, 1, X86InstInfo{"ADC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x15, 1, X86InstInfo{"ADC", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x16, 1, 
X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_16] }}}, {0x17, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_17] }}}, {0x18, 1, X86InstInfo{"SBB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x19, 1, X86InstInfo{"SBB", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_DISPLACE_SIZE_DIV_2, 0}}, {0x1A, 1, X86InstInfo{"SBB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x1B, 1, X86InstInfo{"SBB", TYPE_INST, FLAGS_MODRM, 0}}, {0x1C, 1, X86InstInfo{"SBB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x1D, 1, X86InstInfo{"SBB", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x1E, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_1E] }}}, {0x1F, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_1F] }}}, {0x20, 1, X86InstInfo{"AND", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x21, 1, X86InstInfo{"AND", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x22, 1, X86InstInfo{"AND", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x23, 1, X86InstInfo{"AND", TYPE_INST, FLAGS_MODRM, 0}}, {0x24, 1, X86InstInfo{"AND", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x25, 1, X86InstInfo{"AND", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x27, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_27] }}}, {0x28, 1, X86InstInfo{"SUB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x29, 1, X86InstInfo{"SUB", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x2A, 1, X86InstInfo{"SUB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x2B, 1, X86InstInfo{"SUB", TYPE_INST, FLAGS_MODRM, 0}}, {0x2C, 1, 
X86InstInfo{"SUB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x2D, 1, X86InstInfo{"SUB", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x2F, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_2F] }}}, {0x30, 1, X86InstInfo{"XOR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x31, 1, X86InstInfo{"XOR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x32, 1, X86InstInfo{"XOR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x33, 1, X86InstInfo{"XOR", TYPE_INST, FLAGS_MODRM, 0}}, {0x34, 1, X86InstInfo{"XOR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x35, 1, X86InstInfo{"XOR", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x37, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_37] }}}, {0x38, 1, X86InstInfo{"CMP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x39, 1, X86InstInfo{"CMP", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x3A, 1, X86InstInfo{"CMP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x3B, 1, X86InstInfo{"CMP", TYPE_INST, FLAGS_MODRM, 0}}, {0x3C, 1, X86InstInfo{"CMP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0x3D, 1, X86InstInfo{"CMP", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x3F, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_3F] }}}, {0x40, 8, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_40] }}}, {0x48, 8, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_48] }}}, {0x50, 8, X86InstInfo{"PUSH", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SF_REX_IN_BYTE | FLAGS_DEBUG_MEM_ACCESS , 0}}, {0x58, 8, X86InstInfo{"POP", TYPE_INST, 
GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SF_REX_IN_BYTE | FLAGS_DEBUG_MEM_ACCESS , 0}}, {0x60, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_60] }}}, {0x61, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_61] }}}, {0x62, 1, X86InstInfo{"", TYPE_GROUP_EVEX, FLAGS_NONE, 0}}, {0x63, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_63] }}}, {0x68, 1, X86InstInfo{"PUSH", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_SRC_SEXT, 4}}, {0x69, 1, X86InstInfo{"IMUL", TYPE_INST, FLAGS_MODRM | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0x6A, 1, X86InstInfo{"PUSH", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_SRC_SEXT , 1}}, {0x6B, 1, X86InstInfo{"IMUL", TYPE_INST, FLAGS_MODRM | FLAGS_SRC_SEXT , 1}}, // This should just throw a GP {0x6C, 1, X86InstInfo{"INSB", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0x6D, 1, X86InstInfo{"INSW", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0x6E, 1, X86InstInfo{"OUTS", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0x6F, 1, X86InstInfo{"OUTS", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0x70, 1, X86InstInfo{"JO", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x71, 1, X86InstInfo{"JNO", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x72, 1, X86InstInfo{"JB", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x73, 1, X86InstInfo{"JNB", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x74, 1, X86InstInfo{"JZ", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x75, 1, X86InstInfo{"JNZ", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x76, 1, X86InstInfo{"JBE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | 
FLAGS_SRC_SEXT , 1}}, {0x77, 1, X86InstInfo{"JNBE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x78, 1, X86InstInfo{"JS", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x79, 1, X86InstInfo{"JNS", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7A, 1, X86InstInfo{"JP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7B, 1, X86InstInfo{"JNP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7C, 1, X86InstInfo{"JL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7D, 1, X86InstInfo{"JNL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7E, 1, X86InstInfo{"JLE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x7F, 1, X86InstInfo{"JNLE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, {0x84, 1, X86InstInfo{"TEST", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x85, 1, X86InstInfo{"TEST", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x86, 1, X86InstInfo{"XCHG", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x87, 1, X86InstInfo{"XCHG", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x88, 1, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x89, 1, X86InstInfo{"MOV", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x8A, 1, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM, 0}}, {0x8B, 1, X86InstInfo{"MOV", TYPE_INST, FLAGS_MODRM, 0}}, {0x8C, 1, X86InstInfo{"MOV", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0x8D, 1, X86InstInfo{"LEA", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_MODRM, 0}}, {0x8E, 1, X86InstInfo{"MOV", TYPE_INST, 
GenFlagsSameSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {0x8F, 1, X86InstInfo{"POP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_ZERO_REG | FLAGS_DEBUG_MEM_ACCESS, 0}}, {0x90, 8, X86InstInfo{"XCHG", TYPE_INST, FLAGS_SF_REX_IN_BYTE | FLAGS_SF_SRC_RAX, 0}}, {0x98, 1, X86InstInfo{"CDQE", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SF_SRC_RAX, 0}}, {0x99, 1, X86InstInfo{"CQO", TYPE_INST, FLAGS_SF_DST_RDX | FLAGS_SF_SRC_RAX, 0}}, // These three are all X87 instructions {0x9B, 1, X86InstInfo{"FWAIT", TYPE_INST, FLAGS_NONE, 0}}, {0x9C, 1, X86InstInfo{"PUSHF", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF), 0}}, {0x9D, 1, X86InstInfo{"POPF", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_BLOCK_END, 0}}, {0x9E, 1, X86InstInfo{"SAHF", TYPE_INST, FLAGS_NONE, 0}}, {0x9F, 1, X86InstInfo{"LAHF", TYPE_INST, FLAGS_NONE, 0}}, {0xA0, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_A0] }}}, {0xA1, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_A1] }}}, {0xA2, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_A2] }}}, {0xA3, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_A3] }}}, {0xA4, 1, X86InstInfo{"MOVSB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xA5, 1, X86InstInfo{"MOVS", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xA6, 1, X86InstInfo{"CMPSB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xA7, 1, X86InstInfo{"CMPS", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xA8, 1, X86InstInfo{"TEST", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX , 1}}, {0xA9, 1, X86InstInfo{"TEST", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {0xAA, 1, X86InstInfo{"STOS", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_SF_SRC_RAX, 0}}, {0xAB, 1, 
X86InstInfo{"STOS", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_SF_SRC_RAX, 0}}, {0xAC, 1, X86InstInfo{"LODS", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_DST_RAX | FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xAD, 1, X86InstInfo{"LODS", TYPE_INST, FLAGS_SF_DST_RAX | FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xAE, 1, X86InstInfo{"SCAS", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_SF_SRC_RAX, 0}}, {0xAF, 1, X86InstInfo{"SCAS", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_SF_SRC_RAX, 0}}, {0xB0, 8, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_SF_REX_IN_BYTE , 1}}, {0xB8, 8, X86InstInfo{"MOV", TYPE_INST, FLAGS_SF_REX_IN_BYTE | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_DISPLACE_SIZE_MUL_2, 4}}, {0xC2, 1, X86InstInfo{"RET", TYPE_INST, FLAGS_SETS_RIP | FLAGS_BLOCK_END, 2}}, {0xC3, 1, X86InstInfo{"RET", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_BLOCK_END , 0}}, {0xC8, 1, X86InstInfo{"ENTER", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_DEBUG_MEM_ACCESS , 3}}, {0xC9, 1, X86InstInfo{"LEAVE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_DEBUG_MEM_ACCESS , 0}}, {0xCA, 1, X86InstInfo{"RETF", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_BLOCK_END, 2}}, {0xCB, 1, X86InstInfo{"RETF", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_BLOCK_END, 0}}, {0xCC, 1, X86InstInfo{"INT3", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0xCD, 1, X86InstInfo{"INT", TYPE_INST, DEFAULT_SYSCALL_FLAGS, 1}}, {0xCE, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_CE] }}}, {0xCF, 1, X86InstInfo{"IRET", TYPE_INST, FLAGS_SETS_RIP | FLAGS_BLOCK_END, 0}}, {0xD4, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_D4] }}}, {0xD5, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Primary_ArchSelect_LUT[ENTRY_D5] }}}, {0xD6, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = 
Primary_ArchSelect_LUT[ENTRY_D6] }}}, {0xD7, 1, X86InstInfo{"XLAT", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS, 0}}, {0xE0, 1, X86InstInfo{"LOOPNE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_SF_SRC_RCX, 1}}, {0xE1, 1, X86InstInfo{"LOOPE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_SF_SRC_RCX, 1}}, {0xE2, 1, X86InstInfo{"LOOP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_SF_SRC_RCX, 1}}, {0xE3, 1, X86InstInfo{"JrCXZ", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT , 1}}, // Should just throw GP {0xE4, 2, X86InstInfo{"IN", TYPE_INST, FLAGS_BLOCK_END, 1}}, {0xE6, 2, X86InstInfo{"OUT", TYPE_INST, FLAGS_BLOCK_END, 1}}, {0xE8, 1, X86InstInfo{"CALL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_BLOCK_END | FLAGS_CALL , 4}}, {0xE9, 1, X86InstInfo{"JMP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_BLOCK_END , 4}}, {0xEB, 1, X86InstInfo{"JMP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_BLOCK_END , 1}}, // Should just throw GP {0xEC, 2, X86InstInfo{"IN", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0xEE, 2, X86InstInfo{"OUT", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0xF1, 1, X86InstInfo{"INT1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0xF4, 1, X86InstInfo{"HLT", TYPE_INST, FLAGS_BLOCK_END, 0}}, {0xF5, 1, X86InstInfo{"CMC", TYPE_INST, FLAGS_NONE, 0}}, {0xF8, 1, X86InstInfo{"CLC", TYPE_INST, FLAGS_NONE, 0}}, {0xF9, 1, X86InstInfo{"STC", TYPE_INST, FLAGS_NONE, 0}}, {0xFA, 1, X86InstInfo{"CLI", TYPE_INST, FLAGS_NONE, 0}}, {0xFB, 1, X86InstInfo{"STI", TYPE_INST, FLAGS_NONE, 0}}, {0xFC, 1, X86InstInfo{"CLD", TYPE_INST, FLAGS_NONE, 0}}, {0xFD, 1, X86InstInfo{"STD", TYPE_INST, FLAGS_NONE, 0}}, // Two Byte table {0x0F, 1, X86InstInfo{"", TYPE_SECONDARY_TABLE_PREFIX, FLAGS_NONE, 0}}, // x87 
table {0xD8, 8, X86InstInfo{"", TYPE_X87_TABLE_PREFIX, FLAGS_MODRM, 0}}, // ModRM table // MoreBytes field repurposed for valid bits mask {0x80, 1, X86InstInfo{"", TYPE_GROUP_1, FLAGS_MODRM, 0}}, {0x81, 1, X86InstInfo{"", TYPE_GROUP_1, FLAGS_MODRM, 1}}, {0x82, 1, X86InstInfo{"", TYPE_GROUP_1, FLAGS_MODRM, 2}}, {0x83, 1, X86InstInfo{"", TYPE_GROUP_1, FLAGS_MODRM, 3}}, {0xC0, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 0}}, {0xC1, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 1}}, {0xD0, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 2}}, {0xD1, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 3}}, {0xD2, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 4}}, {0xD3, 1, X86InstInfo{"", TYPE_GROUP_2, FLAGS_MODRM, 5}}, {0xF6, 1, X86InstInfo{"", TYPE_GROUP_3, FLAGS_MODRM, 0}}, {0xF7, 1, X86InstInfo{"", TYPE_GROUP_3, FLAGS_MODRM, 1}}, {0xFE, 1, X86InstInfo{"", TYPE_GROUP_4, FLAGS_MODRM, 0}}, {0xFF, 1, X86InstInfo{"", TYPE_GROUP_5, FLAGS_MODRM, 0}}, // Group 11 {0xC6, 1, X86InstInfo{"", TYPE_GROUP_11, FLAGS_MODRM, 0}}, {0xC7, 1, X86InstInfo{"", TYPE_GROUP_11, FLAGS_MODRM, 1}}, // VEX table {0xC4, 2, X86InstInfo{"", TYPE_VEX_TABLE_PREFIX, FLAGS_NONE, 0}}, }; GenerateTable(Table.data(), BaseOpTable, std::size(BaseOpTable)); IR::InstallToTable(Table, IR::OpDispatch_BaseOpTable); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/DDDTables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/DDDTables.h" #include namespace FEXCore::X86Tables { using namespace InstFlags; /* DDDNowOps: decode table for the 3DNow! instructions listed in DDDNowOpTable. The consteval lambda expands the rows into Table with GenerateTable(Table.data(), DDDNowOpTable, std::size(DDDNowOpTable)), then calls IR::InstallToTable(Table, IR::OpDispatch_DDDTable) and returns Table as the initializer. The first field of each row is an 8-bit opcode value and the second is always 1. Every row is a TYPE_INST with GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX and a final field of 0. */ constexpr std::array DDDNowOps = []() consteval { std::array Table{}; constexpr U8U8InfoStruct DDDNowOpTable[] = { {0x0C, 1, X86InstInfo{"PI2FW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x0D, 1, X86InstInfo{"PI2FD", TYPE_INST, 
GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x1C, 1, X86InstInfo{"PF2IW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x1D, 1, X86InstInfo{"PF2ID", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, // Inverse 3DNow! These two instructions are Geode product line specific // No CPUID for these, you're expected to read ID_CONFIG_MSR (1250h) bit 1 {0x86, 1, X86InstInfo{"PFRCPV", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x87, 1, X86InstInfo{"PFRSQRTV", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x8A, 1, X86InstInfo{"PFNACC", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x8E, 1, X86InstInfo{"PFPNACC", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x90, 1, X86InstInfo{"PFCMPGE", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x94, 1, X86InstInfo{"PFMIN", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x96, 1, X86InstInfo{"PFRCP", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x97, 1, X86InstInfo{"PFRSQRT", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x9A, 1, X86InstInfo{"PFSUB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x9E, 1, X86InstInfo{"PFADD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xA0, 1, X86InstInfo{"PFCMPGT", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xA4, 1, X86InstInfo{"PFMAX", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xA6, 1, 
X86InstInfo{"PFRCPIT1", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xA7, 1, X86InstInfo{"PFRSQIT1", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xAA, 1, X86InstInfo{"PFSUBR", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xAE, 1, X86InstInfo{"PFACC", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xB0, 1, X86InstInfo{"PFCMPEQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xB4, 1, X86InstInfo{"PFMUL", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xB6, 1, X86InstInfo{"PFRCPIT2", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xB7, 1, X86InstInfo{"PMULHRW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xBB, 1, X86InstInfo{"PSWAPD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xBF, 1, X86InstInfo{"PAVGUSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, }; GenerateTable(Table.data(), DDDNowOpTable, std::size(DDDNowOpTable)); IR::InstallToTable(Table, IR::OpDispatch_DDDTable); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/H0F38Tables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/H0F38Tables.h" #include #include namespace FEXCore::X86Tables { using namespace InstFlags; /* H0F38TableOps: table evaluated at compile time by an immediately-invoked consteval lambda. The lambda fills a local Table from the H0F38Table entries via GenerateTable, then passes it with IR::OpDispatch_H0F38Table to IR::InstallToTable and returns it. NOTE(review): each entry reads {index, count, X86InstInfo}; count is 1 everywhere here and presumably is the number of consecutive slots to fill - confirm against GenerateTable. */ constexpr std::array H0F38TableOps = []() consteval { std::array Table{}; /* OPD forms the table index: PF_38_* prefix bits shifted left by 8, OR-ed with the opcode byte. */ #define OPD(prefix, opcode) (((prefix) << 8) | opcode) /* Prefix selectors: NONE is 0; 66, F2 and F3 are separate bits (0, 1, 2), which lets the CRC32 entries combine PF_38_66 | PF_38_F2. */ constexpr uint16_t PF_38_NONE = 0; constexpr uint16_t
PF_38_66 = (1U << 0); constexpr uint16_t PF_38_F2 = (1U << 1); constexpr uint16_t PF_38_F3 = (1U << 2); constexpr U16U8InfoStruct H0F38Table[] = { {OPD(PF_38_NONE, 0x00), 1, X86InstInfo{"PSHUFB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x00), 1, X86InstInfo{"PSHUFB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x01), 1, X86InstInfo{"PHADDW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x01), 1, X86InstInfo{"PHADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x02), 1, X86InstInfo{"PHADDD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x02), 1, X86InstInfo{"PHADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x03), 1, X86InstInfo{"PHADDSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x03), 1, X86InstInfo{"PHADDSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x04), 1, X86InstInfo{"PMADDUBSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x04), 1, X86InstInfo{"PMADDUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x05), 1, X86InstInfo{"PHSUBW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x05), 1, X86InstInfo{"PHSUBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x06), 1, X86InstInfo{"PHSUBD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x06), 1, X86InstInfo{"PHSUBD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) |
FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x07), 1, X86InstInfo{"PHSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x07), 1, X86InstInfo{"PHSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x08), 1, X86InstInfo{"PSIGNB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x08), 1, X86InstInfo{"PSIGNB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x09), 1, X86InstInfo{"PSIGNW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x09), 1, X86InstInfo{"PSIGNW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x0A), 1, X86InstInfo{"PSIGND", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x0A), 1, X86InstInfo{"PSIGND", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x0B), 1, X86InstInfo{"PMULHRSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x0B), 1, X86InstInfo{"PMULHRSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x10), 1, X86InstInfo{"PBLENDVB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x14), 1, X86InstInfo{"BLENDVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x15), 1, X86InstInfo{"BLENDVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x17), 1, X86InstInfo{"PTEST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x1C), 1, X86InstInfo{"PABSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM |
FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x1C), 1, X86InstInfo{"PABSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x1D), 1, X86InstInfo{"PABSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x1D), 1, X86InstInfo{"PABSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0x1E), 1, X86InstInfo{"PABSD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {OPD(PF_38_66, 0x1E), 1, X86InstInfo{"PABSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x20), 1, X86InstInfo{"PMOVSXBW", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x21), 1, X86InstInfo{"PMOVSXBD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x22), 1, X86InstInfo{"PMOVSXBQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x23), 1, X86InstInfo{"PMOVSXWD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x24), 1, X86InstInfo{"PMOVSXWQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x25), 1, X86InstInfo{"PMOVSXDQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x28), 1, X86InstInfo{"PMULDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x29), 1, X86InstInfo{"PCMPEQQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x2A), 1, X86InstInfo{"MOVNTDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x2B), 1, X86InstInfo{"PACKUSDW", TYPE_INST,
GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x30), 1, X86InstInfo{"PMOVZXBW", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x31), 1, X86InstInfo{"PMOVZXBD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x32), 1, X86InstInfo{"PMOVZXBQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x33), 1, X86InstInfo{"PMOVZXWD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x34), 1, X86InstInfo{"PMOVZXWQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x35), 1, X86InstInfo{"PMOVZXDQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x37), 1, X86InstInfo{"PCMPGTQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x38), 1, X86InstInfo{"PMINSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x39), 1, X86InstInfo{"PMINSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3A), 1, X86InstInfo{"PMINUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3B), 1, X86InstInfo{"PMINUD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3C), 1, X86InstInfo{"PMAXSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3D), 1, X86InstInfo{"PMAXSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3E), 1, X86InstInfo{"PMAXUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x3F), 1, X86InstInfo{"PMAXUD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) |
FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x40), 1, X86InstInfo{"PMULLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0x41), 1, X86InstInfo{"PHMINPOSUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xC8), 1, X86InstInfo{"SHA1NEXTE", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xC9), 1, X86InstInfo{"SHA1MSG1", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xCA), 1, X86InstInfo{"SHA1MSG2", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xCB), 1, X86InstInfo{"SHA256RNDS2", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xCC), 1, X86InstInfo{"SHA256MSG1", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xCD), 1, X86InstInfo{"SHA256MSG2", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0xDB), 1, X86InstInfo{"AESIMC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0xDC), 1, X86InstInfo{"AESENC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0xDD), 1, X86InstInfo{"AESENCLAST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0xDE), 1, X86InstInfo{"AESDEC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_66, 0xDF), 1, X86InstInfo{"AESDECLAST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(PF_38_NONE, 0xF0), 1, X86InstInfo{"MOVBE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(PF_38_NONE, 0xF1), 1, X86InstInfo{"MOVBE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(PF_38_66, 0xF0), 1, X86InstInfo{"MOVBE", TYPE_INST,
FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(PF_38_66, 0xF1), 1, X86InstInfo{"MOVBE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(PF_38_F2, 0xF0), 1, X86InstInfo{"CRC32", TYPE_INST, GenFlagsSizes(SIZE_DEF, SIZE_8BIT) | FLAGS_MODRM, 0}}, {OPD(PF_38_F2, 0xF1), 1, X86InstInfo{"CRC32", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(PF_38_66 | PF_38_F2, 0xF0), 1, X86InstInfo{"CRC32", TYPE_INST, GenFlagsSizes(SIZE_DEF, SIZE_8BIT) | FLAGS_MODRM, 0}}, {OPD(PF_38_66 | PF_38_F2, 0xF1), 1, X86InstInfo{"CRC32", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(PF_38_66, 0xF6), 1, X86InstInfo{"ADCX", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY66, 0}}, {OPD(PF_38_F3, 0xF6), 1, X86InstInfo{"ADOX", TYPE_INST, FLAGS_MODRM, 0}}, }; #undef OPD GenerateTable(Table.data(), H0F38Table, std::size(H0F38Table)); IR::InstallToTable(Table, IR::OpDispatch_H0F38Table); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/H0F3ATables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/H0F3ATables.h" #include #include #include namespace FEXCore::X86Tables { using namespace InstFlags; /* OPD forms the table index: REX selector at bit 9, PF_3A_* prefix selector at bit 8, opcode byte in the low bits. */ #define OPD(REX, prefix, opcode) ((REX << 9) | (prefix << 8) | opcode) constexpr uint16_t PF_3A_NONE = 0; constexpr uint16_t PF_3A_66 = 1; /* Names for the elements of H0F3A_ArchSelect_LUT; ENTRY_MAX follows the last real entry. */ enum H0F3A_LUT { ENTRY_1_3A_66_16, ENTRY_1_3A_66_22, ENTRY_MAX, }; /* Pairs of X86InstInfo alternatives used by the TYPE_ARCH_DISPATCHER entries for REX=1 opcodes 0x16 and 0x22: the first alternative is TYPE_INVALID, the second is PEXTRQ / PINSRQ with its OpDispatch handler. NOTE(review): how an alternative is selected is not visible here - presumably by guest bitness; confirm. */ constexpr std::array H0F3A_ArchSelect_LUT = {{ // ENTRY_1_3A_66_16 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"PEXTRQ", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PExtrOp, IR::OpSize::i64Bit> }}, }, // ENTRY_1_3A_66_22 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"PINSRQ", TYPE_INST,
GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 1, { .OpDispatch = &IR::OpDispatchBuilder::PINSROp }}, }, }}; /* H0F3ATableOps: table evaluated at compile time by an immediately-invoked consteval lambda. Entries produced by TableGen are installed once with REX=0 and once with REX=1; the 0x16 and 0x22 opcodes are added separately per REX value further down. */ constexpr std::array H0F3ATableOps = []() consteval { std::array Table{}; /* TableGen returns the entry list as a std::array via std::to_array. NOTE(review): REX is used in the OPD calls but its declaration is not visible in this text - presumably a template parameter of this lambda, given the TableGen.template operator() calls with 0 and 1 below; confirm. */ auto TableGen = []() consteval { constexpr U16U8InfoStruct Table[] = { {OPD(REX, PF_3A_NONE, 0x0F), 1, X86InstInfo{"PALIGNR", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(REX, PF_3A_66, 0x08), 1, X86InstInfo{"ROUNDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x09), 1, X86InstInfo{"ROUNDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0A), 1, X86InstInfo{"ROUNDSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0B), 1, X86InstInfo{"ROUNDSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0C), 1, X86InstInfo{"BLENDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0D), 1, X86InstInfo{"BLENDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0E), 1, X86InstInfo{"PBLENDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x0F), 1, X86InstInfo{"PALIGNR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x14), 1, X86InstInfo{"PEXTRB", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x15), 1, X86InstInfo{"PEXTRW", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x17), 1, X86InstInfo{"EXTRACTPS", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM |
FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x20), 1, X86InstInfo{"PINSRB", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 1}}, {OPD(REX, PF_3A_66, 0x21), 1, X86InstInfo{"INSERTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x40), 1, X86InstInfo{"DPPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x41), 1, X86InstInfo{"DPPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x42), 1, X86InstInfo{"MPSADBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x44), 1, X86InstInfo{"PCLMULQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x60), 1, X86InstInfo{"PCMPESTRM", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x61), 1, X86InstInfo{"PCMPESTRI", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x62), 1, X86InstInfo{"PCMPISTRM", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0x63), 1, X86InstInfo{"PCMPISTRI", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_NONE, 0xCC), 1, X86InstInfo{"SHA1RNDS4", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(REX, PF_3A_66, 0xDF), 1, X86InstInfo{"AESKEYGENASSIST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, }; return std::to_array(Table); }; constexpr auto H0F3ATable_IgnoresREX0 = TableGen.template operator()<0>(); constexpr auto H0F3ATable_IgnoresREX1 = TableGen.template operator()<1>(); GenerateTable(Table.data(), H0F3ATable_IgnoresREX0.data(), H0F3ATable_IgnoresREX0.size()); GenerateTable(Table.data(),
H0F3ATable_IgnoresREX1.data(), H0F3ATable_IgnoresREX1.size()); /* Opcodes 0x16 and 0x22 differ by REX selector: with REX=0 they are the 32-bit PEXTRD / PINSRD. */ constexpr U16U8InfoStruct TableNeedsREX0[] = { {OPD(0, PF_3A_66, 0x16), 1, X86InstInfo{"PEXTRD", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(0, PF_3A_66, 0x22), 1, X86InstInfo{"PINSRD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 1}}, }; GenerateTable(Table.data(), TableNeedsREX0, std::size(TableNeedsREX0)); /* With REX=1 they are TYPE_ARCH_DISPATCHER entries that point into H0F3A_ArchSelect_LUT through .Indirect. */ constexpr U16U8InfoStruct TableNeedsREX1[] = { {OPD(1, PF_3A_66, 0x16), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = H0F3A_ArchSelect_LUT[ENTRY_1_3A_66_16] }}}, {OPD(1, PF_3A_66, 0x22), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = H0F3A_ArchSelect_LUT[ENTRY_1_3A_66_22] }}}, }; GenerateTable(Table.data(), TableNeedsREX1, std::size(TableNeedsREX1)); IR::InstallToTable(Table, IR::OpDispatch_H0F3ATableIgnoreREX); IR::InstallToTable(Table, IR::OpDispatch_H0F3ATableNeedsREX0); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/PrimaryGroupTables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/PrimaryGroupTables.h" #include #include namespace FEXCore::X86Tables { using namespace InstFlags; /* Names for the elements of PrimaryGroup_ArchSelect_LUT: one per reg value 0-7 of opcode 0x82. */ enum PrimaryGroup_LUT { ENTRY_1_82_0, ENTRY_1_82_1, ENTRY_1_82_2, ENTRY_1_82_3, ENTRY_1_82_4, ENTRY_1_82_5, ENTRY_1_82_6, ENTRY_1_82_7, ENTRY_MAX, }; /* Alternatives for opcode 0x82: the first element of each pair is the 8-bit ALU op (ADD, OR, ADC, SBB, AND, SUB, XOR, CMP) with its OpDispatch handler, the second is TYPE_INVALID. NOTE(review): how an alternative is selected is not visible here - confirm. */ constexpr std::array PrimaryGroup_ArchSelect_LUT = {{ { {"ADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::SecondaryALUOp }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"OR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST,
1, { .OpDispatch = &IR::OpDispatchBuilder::SecondaryALUOp }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"ADC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::ADCOp, 1> }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"SBB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::SBBOp, 1> }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"AND", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::SecondaryALUOp }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"SUB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::SecondaryALUOp }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"XOR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::SecondaryALUOp }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, { {"CMP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::CMPOp, 1> }}, {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, }, }}; /* PrimaryInstGroupOps: table evaluated at compile time by an immediately-invoked consteval lambda; its entries cover groups 1, 2, 3, 4, 5 and 11. NOTE(review): the second field of each entry is 1 except for the TYPE_INVALID fillers (6 and 5), so it presumably is the number of consecutive slots to fill - confirm against GenerateTable. */ constexpr std::array PrimaryInstGroupOps = []() consteval { std::array Table{}; /* OPD forms the table index: (group - TYPE_GROUP_1) shifted left by 6, the second argument (OpToIndex of the opcode at every use below) shifted left by 3, OR-ed with Reg (0-7 at every use below). */ #define OPD(group, prefix, Reg) (((group - FEXCore::X86Tables::TYPE_GROUP_1) << 6) | (prefix) << 3 | (Reg)) constexpr U16U8InfoStruct PrimaryGroupOpTable[] = { // GROUP_1 | 0x80 | reg {OPD(TYPE_GROUP_1, OpToIndex(0x80), 0), 1, X86InstInfo{"ADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 1), 1, X86InstInfo{"OR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM |
FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 2), 1, X86InstInfo{"ADC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 3), 1, X86InstInfo{"SBB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 4), 1, X86InstInfo{"AND", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 5), 1, X86InstInfo{"SUB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 6), 1, X86InstInfo{"XOR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x80), 7), 1, X86InstInfo{"CMP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 0), 1, X86InstInfo{"ADD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 1), 1, X86InstInfo{"OR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 2), 1, X86InstInfo{"ADC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 3), 1, X86InstInfo{"SBB", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 4), 1, X86InstInfo{"AND", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 5), 1, X86InstInfo{"SUB", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_1, OpToIndex(0x81), 6), 1, X86InstInfo{"XOR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}},
{OPD(TYPE_GROUP_1, OpToIndex(0x81), 7), 1, X86InstInfo{"CMP", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, // Duplicates the 0x80 opcode group {OPD(TYPE_GROUP_1, OpToIndex(0x82), 0), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_0] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 1), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_1] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 2), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_2] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 3), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_3] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 4), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_4] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 5), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_5] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 6), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_6] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x82), 7), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = PrimaryGroup_ArchSelect_LUT[ENTRY_1_82_7] }}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 0), 1, X86InstInfo{"ADD", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 1), 1, X86InstInfo{"OR", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 2), 1, X86InstInfo{"ADC", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 3), 1, X86InstInfo{"SBB", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 4),
1, X86InstInfo{"AND", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 5), 1, X86InstInfo{"SUB", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 6), 1, X86InstInfo{"XOR", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_1, OpToIndex(0x83), 7), 1, X86InstInfo{"CMP", TYPE_INST, FLAGS_SRC_SEXT | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, // GROUP 2 {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 0), 1, X86InstInfo{"ROL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 1), 1, X86InstInfo{"ROR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 2), 1, X86InstInfo{"RCL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 3), 1, X86InstInfo{"RCR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 4), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 5), 1, X86InstInfo{"SHR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 6), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC0), 7), 1, X86InstInfo{"SAR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 0), 1, X86InstInfo{"ROL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 1), 1, X86InstInfo{"ROR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 2), 1, X86InstInfo{"RCL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 3), 1, X86InstInfo{"RCR",
TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 4), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 5), 1, X86InstInfo{"SHR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 6), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xC1), 7), 1, X86InstInfo{"SAR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 0), 1, X86InstInfo{"ROL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 1), 1, X86InstInfo{"ROR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 2), 1, X86InstInfo{"RCL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 3), 1, X86InstInfo{"RCR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 4), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 5), 1, X86InstInfo{"SHR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 6), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD0), 7), 1, X86InstInfo{"SAR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 0), 1, X86InstInfo{"ROL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 1), 1, X86InstInfo{"ROR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 2), 1, X86InstInfo{"RCL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 3), 1, X86InstInfo{"RCR",
TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 4), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 5), 1, X86InstInfo{"SHR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 6), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD1), 7), 1, X86InstInfo{"SAR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 0), 1, X86InstInfo{"ROL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 1), 1, X86InstInfo{"ROR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 2), 1, X86InstInfo{"RCL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 3), 1, X86InstInfo{"RCR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 4), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 5), 1, X86InstInfo{"SHR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 6), 1, X86InstInfo{"SHL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD2), 7), 1, X86InstInfo{"SAR", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 0), 1, X86InstInfo{"ROL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 1), 1, X86InstInfo{"ROR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST |
FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 2), 1, X86InstInfo{"RCL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 3), 1, X86InstInfo{"RCR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 4), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 5), 1, X86InstInfo{"SHR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 6), 1, X86InstInfo{"SHL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, {OPD(TYPE_GROUP_2, OpToIndex(0xD3), 7), 1, X86InstInfo{"SAR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX, 0}}, // GROUP 3 {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 0), 1, X86InstInfo{"TEST", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 1), 1, X86InstInfo{"TEST", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 2), 1, X86InstInfo{"NOT", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 3), 1, X86InstInfo{"NEG", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 4), 1, X86InstInfo{"MUL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 5), 1, X86InstInfo{"IMUL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 6), 1, X86InstInfo{"DIV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF6), 7), 1, X86InstInfo{"IDIV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 0), 1, X86InstInfo{"TEST",
TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 1), 1, X86InstInfo{"TEST", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT64BIT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 2), 1, X86InstInfo{"NOT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 3), 1, X86InstInfo{"NEG", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 4), 1, X86InstInfo{"MUL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 5), 1, X86InstInfo{"IMUL", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 6), 1, X86InstInfo{"DIV", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_3, OpToIndex(0xF7), 7), 1, X86InstInfo{"IDIV", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, // GROUP 4 {OPD(TYPE_GROUP_4, OpToIndex(0xFE), 0), 1, X86InstInfo{"INC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_4, OpToIndex(0xFE), 1), 1, X86InstInfo{"DEC", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_4, OpToIndex(0xFE), 2), 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 5 {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 0), 1, X86InstInfo{"INC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 1), 1, X86InstInfo{"DEC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 2), 1, X86InstInfo{"CALL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_MODRM | FLAGS_BLOCK_END | FLAGS_CALL , 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 3), 1, X86InstInfo{"CALLF", TYPE_INST, FLAGS_SETS_RIP | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY | FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 4), 1, X86InstInfo{"JMP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) |
FLAGS_SETS_RIP | FLAGS_MODRM | FLAGS_BLOCK_END , 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 5), 1, X86InstInfo{"JMPF", TYPE_INST, FLAGS_SETS_RIP | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY | FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 6), 1, X86InstInfo{"PUSH", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_5, OpToIndex(0xFF), 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 11 {OPD(TYPE_GROUP_11, OpToIndex(0xC6), 0), 1, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT, 1}}, {OPD(TYPE_GROUP_11, OpToIndex(0xC6), 1), 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_11, OpToIndex(0xC6), 7), 1, X86InstInfo{"XABORT", TYPE_INST, FLAGS_MODRM, 1}}, {OPD(TYPE_GROUP_11, OpToIndex(0xC7), 0), 1, X86InstInfo{"MOV", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, {OPD(TYPE_GROUP_11, OpToIndex(0xC7), 1), 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_11, OpToIndex(0xC7), 7), 1, X86InstInfo{"XBEGIN", TYPE_INST, FLAGS_MODRM | FLAGS_SRC_SEXT | FLAGS_SETS_RIP | FLAGS_DISPLACE_SIZE_DIV_2, 4}}, }; GenerateTable(Table.data(), PrimaryGroupOpTable, std::size(PrimaryGroupOpTable)); IR::InstallToTable(Table, IR::OpDispatch_PrimaryGroupTables); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/SecondaryGroupTables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/SecondaryGroupTables.h" #include #include namespace FEXCore::X86Tables { using namespace InstFlags; constexpr uint16_t PF_NONE = 0; constexpr uint16_t PF_F3 = 1; constexpr uint16_t PF_66 = 2; constexpr uint16_t PF_F2 = 3; #define OPD(group, prefix, Reg) (((group - 
FEXCore::X86Tables::TYPE_GROUP_6) << 5) | (prefix) << 3 | (Reg)) enum SecondGroup_LUT { ENTRY_15_F3_0, ENTRY_15_F3_1, ENTRY_15_F3_2, ENTRY_15_F3_3, ENTRY_MAX, }; constexpr std::array SecondGroup_ArchSelect_LUT = {{ // ENTRY_15_F3_0 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"RDFSBASE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::ReadSegmentReg, IR::OpDispatchBuilder::Segment::FS> } }, }, // ENTRY_15_F3_1 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"RDGSBASE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::ReadSegmentReg, IR::OpDispatchBuilder::Segment::GS> } }, }, // ENTRY_15_F3_2 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"WRFSBASE", TYPE_INST, GenFlagsDstSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::WriteSegmentReg, IR::OpDispatchBuilder::Segment::FS> } }, }, // ENTRY_15_F3_3 { {"", TYPE_INVALID, FLAGS_NONE, 0, { .OpDispatch = nullptr } }, {"WRGSBASE", TYPE_INST, GenFlagsDstSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::WriteSegmentReg, IR::OpDispatchBuilder::Segment::GS> } }, }, }}; constexpr std::array SecondInstGroupOps = []() consteval { std::array Table{}; constexpr U16U8InfoStruct SecondaryExtensionOpTable[] = { // GROUP 1 // GROUP 2 // GROUP 3 // GROUP 4 // GROUP 5 // Pulls from other MODRM table // GROUP 6 {OPD(TYPE_GROUP_6, PF_NONE, 0), 1, X86InstInfo{"SLDT", TYPE_UNDEC, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 1), 1, X86InstInfo{"STR", TYPE_PRIV, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 2), 1, X86InstInfo{"LLDT", TYPE_PRIV, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 3), 1, X86InstInfo{"LTR", 
TYPE_INST, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 4), 1, X86InstInfo{"VERR", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 5), 1, X86InstInfo{"VERW", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_NONE, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 0), 1, X86InstInfo{"SLDT", TYPE_UNDEC, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 1), 1, X86InstInfo{"STR", TYPE_PRIV, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 2), 1, X86InstInfo{"LLDT", TYPE_PRIV, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 3), 1, X86InstInfo{"LTR", TYPE_INST, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 4), 1, X86InstInfo{"VERR", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 5), 1, X86InstInfo{"VERW", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_66, 0), 1, X86InstInfo{"SLDT", TYPE_UNDEC, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_66, 1), 1, X86InstInfo{"STR", TYPE_PRIV, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_66, 2), 1, X86InstInfo{"LLDT", TYPE_PRIV, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_66, 3), 1, X86InstInfo{"LTR", TYPE_INST, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_66, 4), 1, X86InstInfo{"VERR", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_66, 5), 1, X86InstInfo{"VERW", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_66, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_66, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 0), 1, X86InstInfo{"SLDT", TYPE_UNDEC, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 1), 1, X86InstInfo{"STR", TYPE_PRIV, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 2), 1, X86InstInfo{"LLDT", 
TYPE_PRIV, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 3), 1, X86InstInfo{"LTR", TYPE_INST, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 4), 1, X86InstInfo{"VERR", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 5), 1, X86InstInfo{"VERW", TYPE_UNDEC, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_6, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 7 {OPD(TYPE_GROUP_7, PF_NONE, 0), 1, X86InstInfo{"SGDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 1), 1, X86InstInfo{"SIDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 2), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 3), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 4), 1, X86InstInfo{"SMSW", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 6), 1, X86InstInfo{"LMSW", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_7, PF_NONE, 7), 1, X86InstInfo{"INVLPG", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 0), 1, X86InstInfo{"SGDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 1), 1, X86InstInfo{"SIDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 2), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 3), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 4), 1, X86InstInfo{"SMSW", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 6), 1, X86InstInfo{"LMSW", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_7, PF_F3, 7), 1, X86InstInfo{"INVLPG", 
TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_66, 0), 1, X86InstInfo{"SGDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_66, 1), 1, X86InstInfo{"SIDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_66, 2), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_66, 3), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_66, 4), 1, X86InstInfo{"SMSW", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_66, 6), 1, X86InstInfo{"LMSW", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_7, PF_66, 7), 1, X86InstInfo{"INVLPG", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 0), 1, X86InstInfo{"SGDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 1), 1, X86InstInfo{"SIDT", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 2), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 3), 1, X86InstInfo{"", TYPE_SECOND_GROUP_MODRM, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 4), 1, X86InstInfo{"SMSW", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 6), 1, X86InstInfo{"LMSW", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_7, PF_F2, 7), 1, X86InstInfo{"INVLPG", TYPE_SECOND_GROUP_MODRM, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, // GROUP 8 {OPD(TYPE_GROUP_8, PF_NONE, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_NONE, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_NONE, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, 
{OPD(TYPE_GROUP_8, PF_NONE, 4), 1, X86InstInfo{"BT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_NONE, 5), 1, X86InstInfo{"BTS", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_NONE, 6), 1, X86InstInfo{"BTR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_NONE, 7), 1, X86InstInfo{"BTC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F3, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F3, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F3, 4), 1, X86InstInfo{"BT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F3, 5), 1, X86InstInfo{"BTS", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F3, 6), 1, X86InstInfo{"BTR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F3, 7), 1, X86InstInfo{"BTC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_66, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_66, 4), 1, X86InstInfo{"BT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_66, 5), 1, X86InstInfo{"BTS", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_66, 6), 1, X86InstInfo{"BTR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_66, 7), 1, X86InstInfo{"BTC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 
0}}, {OPD(TYPE_GROUP_8, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_8, PF_F2, 4), 1, X86InstInfo{"BT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F2, 5), 1, X86InstInfo{"BTS", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F2, 6), 1, X86InstInfo{"BTR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, {OPD(TYPE_GROUP_8, PF_F2, 7), 1, X86InstInfo{"BTC", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 1}}, // GROUP 9 // AMD documentation is a bit broken for Group 9 // Claims the entire group has n/a applied for the prefix (Implies that the prefix is ignored) // RDRAND/RDSEED only work with no prefix (Other than 66h) // CMPXCHG8B/16B works with all prefixes // Tooling fails to decode CMPXCHG with prefix {OPD(TYPE_GROUP_9, PF_NONE, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 1), 1, X86InstInfo{"CMPXCHG8B/16B", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 6), 1, X86InstInfo{"RDRAND", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_NONE, 7), 1, X86InstInfo{"RDSEED", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 1), 1, X86InstInfo{"CMPXCHG8B/16B", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 3), 1, 
X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F3, 7), 1, X86InstInfo{"RDPID", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_66, 1), 1, X86InstInfo{"CMPXCHG8B/16B", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_66, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_66, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_66, 6), 1, X86InstInfo{"RDRAND", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_66, 7), 1, X86InstInfo{"RDSEED", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 1), 1, X86InstInfo{"CMPXCHG8B/16B", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_9, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 10 {OPD(TYPE_GROUP_10, PF_NONE, 0), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, 
{OPD(TYPE_GROUP_10, PF_NONE, 1), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 2), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 3), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 4), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 5), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 6), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_NONE, 7), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 0), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 1), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 2), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 3), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 4), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 5), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 6), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F3, 7), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 0), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 1), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 2), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 3), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 4), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 5), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 6), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_66, 7), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, 
{OPD(TYPE_GROUP_10, PF_F2, 0), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 1), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 2), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 3), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 4), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 5), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 6), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, {OPD(TYPE_GROUP_10, PF_F2, 7), 1, X86InstInfo{"UD1", TYPE_INST, FLAGS_BLOCK_END, 0}}, // GROUP 12 {OPD(TYPE_GROUP_12, PF_NONE, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_NONE, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_NONE, 2), 1, X86InstInfo{"PSRLW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_12, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_NONE, 4), 1, X86InstInfo{"PSRAW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_12, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_NONE, 6), 1, X86InstInfo{"PSLLW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_12, PF_NONE, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_66, 2), 1, X86InstInfo{"PSRLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | 
FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_12, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_66, 4), 1, X86InstInfo{"PSRAW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_12, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_66, 6), 1, X86InstInfo{"PSLLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_12, PF_66, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_12, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 13 {OPD(TYPE_GROUP_13, PF_NONE, 0), 1, 
X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_NONE, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_NONE, 2), 1, X86InstInfo{"PSRLD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_13, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_NONE, 4), 1, X86InstInfo{"PSRAD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_13, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_NONE, 6), 1, X86InstInfo{"PSLLD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_13, PF_NONE, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_66, 2), 1, X86InstInfo{"PSRLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_13, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_66, 4), 1, X86InstInfo{"PSRAD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_13, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_66, 6), 1, X86InstInfo{"PSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_13, PF_66, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, 
PF_F3, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_13, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 14 {OPD(TYPE_GROUP_14, PF_NONE, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_NONE, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_NONE, 2), 1, X86InstInfo{"PSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_14, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_NONE, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_NONE, 6), 1, X86InstInfo{"PSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | 
FLAGS_SF_MMX, 1}}, {OPD(TYPE_GROUP_14, PF_NONE, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_66, 2), 1, X86InstInfo{"PSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_14, PF_66, 3), 1, X86InstInfo{"PSRLDQ",TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_14, PF_66, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_66, 6), 1, X86InstInfo{"PSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_14, PF_66, 7), 1, X86InstInfo{"PSLLDQ",TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_GROUP_14, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 2), 1, X86InstInfo{"", 
TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_14, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 15 {OPD(TYPE_GROUP_15, PF_NONE, 0), 1, X86InstInfo{"FXSAVE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, // MMX/x87 {OPD(TYPE_GROUP_15, PF_NONE, 1), 1, X86InstInfo{"FXRSTOR", TYPE_INST, FLAGS_MODRM, 0}}, // MMX/x87 {OPD(TYPE_GROUP_15, PF_NONE, 2), 1, X86InstInfo{"LDMXCSR", TYPE_INST, GenFlagsSameSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_15, PF_NONE, 3), 1, X86InstInfo{"STMXCSR", TYPE_INST, GenFlagsSameSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_15, PF_NONE, 4), 1, X86InstInfo{"XSAVE", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_NONE, 5), 1, X86InstInfo{"LFENCE/XRSTOR", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_NONE, 6), 1, X86InstInfo{"MFENCE/XSAVEOPT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_NONE, 7), 1, X86InstInfo{"SFENCE/CLFLUSH", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_F3, 0), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = SecondGroup_ArchSelect_LUT[ENTRY_15_F3_0] }}}, {OPD(TYPE_GROUP_15, PF_F3, 1), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = SecondGroup_ArchSelect_LUT[ENTRY_15_F3_1] }}}, {OPD(TYPE_GROUP_15, PF_F3, 2), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = SecondGroup_ArchSelect_LUT[ENTRY_15_F3_2] }}}, {OPD(TYPE_GROUP_15, PF_F3, 3), 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = 
SecondGroup_ArchSelect_LUT[ENTRY_15_F3_3] }}}, {OPD(TYPE_GROUP_15, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F3, 5), 1, X86InstInfo{"INCSSPQ", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_15, PF_F3, 6), 1, X86InstInfo{"UMONITOR/CLRSSBSY", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_66, 6), 1, X86InstInfo{"CLWB", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_66, 7), 1, X86InstInfo{"CLFLUSHOPT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 6), 1, X86InstInfo{"UMWAIT", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPD(TYPE_GROUP_15, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP 16 // AMD documentation claims again that this entire group is n/a to prefix // Tooling once again fails to disassemble ones with the prefix. 
Disable until proven otherwise {OPD(TYPE_GROUP_16, PF_NONE, 0), 1, X86InstInfo{"PREFETCH NTA", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 1), 1, X86InstInfo{"PREFETCH T0", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 2), 1, X86InstInfo{"PREFETCH T1", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 3), 1, X86InstInfo{"PREFETCH T2", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 4), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 5), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 6), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_NONE, 7), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 0), 1, X86InstInfo{"PREFETCH NTA", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 1), 1, X86InstInfo{"PREFETCH T0", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 2), 1, X86InstInfo{"PREFETCH T1", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 3), 1, X86InstInfo{"PREFETCH T2", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 4), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 5), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 6), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F3, 7), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_66, 0), 1, X86InstInfo{"PREFETCH NTA", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_66, 1), 1, X86InstInfo{"PREFETCH T0", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_66, 2), 1, X86InstInfo{"PREFETCH T1", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_66, 3), 1, 
X86InstInfo{"PREFETCH T2", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_66, 4), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_66, 5), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_66, 6), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_66, 7), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 0), 1, X86InstInfo{"PREFETCH NTA", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 1), 1, X86InstInfo{"PREFETCH T0", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 2), 1, X86InstInfo{"PREFETCH T1", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 3), 1, X86InstInfo{"PREFETCH T2", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 4), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 5), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 6), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, {OPD(TYPE_GROUP_16, PF_F2, 7), 1, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM, 0}}, // GROUP 17 {OPD(TYPE_GROUP_17, PF_NONE, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_NONE, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 1), 1, X86InstInfo{"", 
TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F3, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 0), 1, X86InstInfo{"EXTRQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 2}}, {OPD(TYPE_GROUP_17, PF_66, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_66, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 0), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(TYPE_GROUP_17, PF_F2, 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // GROUP P // AMD documentation claims n/a for all instructions in Group P // It also 
claims that instructions /2, /4, /5, /6, /7 all alias to /0 // It claims that /3 is still Prefetch Mod // Tooling fails to decode past the /2 encoding but runs fine in hardware // Hardware also runs all the prefixes correctly {OPD(TYPE_GROUP_P, PF_NONE, 0), 1, X86InstInfo{"PREFETCH Ex", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 1), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 2), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 3), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 4), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 5), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 6), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_NONE, 7), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 0), 1, X86InstInfo{"PREFETCH Ex", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 1), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 2), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 3), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 4), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 5), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 6), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F3, 7), 1, 
X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 0), 1, X86InstInfo{"PREFETCH Ex", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 1), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 2), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 3), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 4), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 5), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 6), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_66, 7), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 0), 1, X86InstInfo{"PREFETCH Ex", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 1), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 2), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 3), 1, X86InstInfo{"PREFETCH Mod", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 4), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 5), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 6), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_GROUP_P, PF_F2, 7), 1, X86InstInfo{"PREFETCH Res", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY, 0}}, }; #undef OPD GenerateTable(Table.data(), SecondaryExtensionOpTable, 
std::size(SecondaryExtensionOpTable));

  IR::InstallToTable(Table, IR::OpDispatch_SecondaryGroupTables);
  return Table;
}();
}

================================================
FILE: FEXCore/Source/Interface/Core/X86Tables/SecondaryModRMTables.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-tables
$end_info$
*/

#include "Interface/Core/X86Tables/X86Tables.h"
#include "Interface/Core/OpcodeDispatcher/SecondaryModRMTables.h"

#include

namespace FEXCore::X86Tables {
using namespace InstFlags;

// Instruction-info table for the secondary ModRM-selected opcodes, produced at compile time.
//
// The immediately-invoked consteval lambda:
//   1. value-initialises a local array,
//   2. expands the compact entry list below into it with GenerateTable(),
//   3. hands the array to IR::InstallToTable() together with IR::OpDispatch_SecondaryModRMTables
//      (presumably to attach the dispatcher handlers to the entries - confirm in InstallToTable),
//   4. returns the finished array, which becomes the value of SecondModRMTableOps.
constexpr std::array SecondModRMTableOps = []() consteval {
  std::array Table{};

  // Compact description of the table contents.
  // NOTE(review): each entry looks like {first index, number of consecutive slots, info};
  // every entry here uses a count of 1 - confirm the field meaning against U8U8InfoStruct.
  //
  // The index is (group << 3) | slot. `group` is a dense 0..3 number, not the raw /digit:
  // the section comments below name the four groups /1, /2, /3 and /7 in that order.
  // `slot` (0..7) selects the instruction inside the group (presumably ModRM.rm - confirm
  // against the decoder). Unused slots are filled explicitly with TYPE_INVALID.
  constexpr U8U8InfoStruct SecondaryModRMExtensionOpTable[] = {
    // REG /1
    {((0 << 3) | 0), 1, X86InstInfo{"MONITOR", TYPE_INST, FLAGS_NONE, 0}},
    {((0 << 3) | 1), 1, X86InstInfo{"MWAIT", TYPE_INST, FLAGS_NONE, 0}},
    {((0 << 3) | 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((0 << 3) | 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((0 << 3) | 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((0 << 3) | 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((0 << 3) | 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((0 << 3) | 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},

    // REG /2
    {((1 << 3) | 0), 1, X86InstInfo{"XGETBV", TYPE_INST, FLAGS_NONE, 0}},
    {((1 << 3) | 1), 1, X86InstInfo{"XSETBV", TYPE_PRIV, FLAGS_NONE, 0}},
    {((1 << 3) | 2), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((1 << 3) | 3), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((1 << 3) | 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((1 << 3) | 5), 1, X86InstInfo{"XEND", TYPE_INST, FLAGS_NONE, 0}},
    {((1 << 3) | 6), 1, X86InstInfo{"XTEST", TYPE_INST, FLAGS_NONE, 0}},
    {((1 << 3) | 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},

    // REG /3
    {((2 << 3) | 0), 1, X86InstInfo{"VMRUN", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 1), 1, X86InstInfo{"VMMCALL", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 2), 1, X86InstInfo{"VMLOAD", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 3), 1, X86InstInfo{"VMSAVE", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 4), 1, X86InstInfo{"STGI", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 5), 1, X86InstInfo{"CLGI", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 6), 1, X86InstInfo{"SKINIT", TYPE_PRIV, FLAGS_NONE, 0}},
    {((2 << 3) | 7), 1, X86InstInfo{"INVLPGA", TYPE_INST, FLAGS_NONE, 0}},

    // REG /7
    {((3 << 3) | 0), 1, X86InstInfo{"SWAPGS", TYPE_INST, FLAGS_NONE, 0}},
    {((3 << 3) | 1), 1, X86InstInfo{"RDTSCP", TYPE_INST, FLAGS_NONE, 0}},
    {((3 << 3) | 2), 1, X86InstInfo{"MONITORX", TYPE_PRIV, FLAGS_NONE, 0}},
    {((3 << 3) | 3), 1, X86InstInfo{"MWAITX", TYPE_PRIV, FLAGS_NONE, 0}},
    // CLZERO is the only entry in this table that carries operand flags: RAX as source plus
    // the FLAGS_DEBUG_MEM_ACCESS marker.
    {((3 << 3) | 4), 1, X86InstInfo{"CLZERO", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SF_SRC_RAX | FLAGS_DEBUG_MEM_ACCESS, 0}},
    {((3 << 3) | 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((3 << 3) | 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
    {((3 << 3) | 7), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}},
  };

  GenerateTable(Table.data(), SecondaryModRMExtensionOpTable, std::size(SecondaryModRMExtensionOpTable));

  IR::InstallToTable(Table, IR::OpDispatch_SecondaryModRMTables);
  return Table;
}();
}

================================================
FILE: FEXCore/Source/Interface/Core/X86Tables/SecondaryTables.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
tags: frontend|x86-tables
$end_info$
*/

#include "Interface/Core/X86Tables/X86Tables.h"
#include "Interface/Core/OpcodeDispatcher/SecondaryTables.h"

#include
#include

namespace FEXCore::X86Tables {
using namespace InstFlags;

// Row indices into Secondary_ArchSelect_LUT.
// The ENTRY_xx suffix is the opcode of the TYPE_ARCH_DISPATCHER entry in TwoByteOpTable that
// points at that row (0x05, 0xA0, 0xA1, 0xA8, 0xA9). ENTRY_MAX comes last and therefore equals
// the number of rows (5).
enum Secondary_LUT {
  ENTRY_05,
  ENTRY_A0,
  ENTRY_A1,
  ENTRY_A8,
  ENTRY_A9,
  ENTRY_MAX,
};

// Pairs of alternative instruction definitions for opcodes whose definition is chosen
// indirectly: the TYPE_ARCH_DISPATCHER entries of TwoByteOpTable reference a row of this
// table through their `.Indirect` member. Rows are listed in Secondary_LUT order.
// NOTE(review): presumably element [0] of each pair is the 32-bit-mode definition and element
// [1] the 64-bit-mode one (the second variant is the one using SIZE_64BIT / SyscallOp) -
// confirm against the code that resolves TYPE_ARCH_DISPATCHER.
constexpr std::array Secondary_ArchSelect_LUT = {{
  // ENTRY_05: the first variant is bound to NOPOp, the second to SyscallOp.
  {
    {"SYSCALL", TYPE_INST, DEFAULT_SYSCALL_FLAGS, 0, { .OpDispatch = &IR::OpDispatchBuilder::NOPOp } },
    {"SYSCALL", TYPE_INST, DEFAULT_SYSCALL_FLAGS, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::SyscallOp, true> } },
  },
  // ENTRY_A0: both variants share the handler and differ only in the size flags.
  {
    {"PUSH FS", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX> } },
    {"PUSH FS", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX> } },
  },
  // ENTRY_A1
  {
    {"POP FS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_DEF) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX> } },
    {"POP FS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_64BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_FS_PREFIX> } },
  },
  // ENTRY_A8: same shape as ENTRY_A0, bound with FLAG_GS_PREFIX instead of FLAG_FS_PREFIX.
  {
    {"PUSH GS", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX> } },
    {"PUSH GS", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::PUSHSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX> } },
  },
  // ENTRY_A9: same shape as ENTRY_A1, bound with FLAG_GS_PREFIX instead of FLAG_FS_PREFIX.
  {
    {"POP GS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_DEF) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX> } },
    {"POP GS", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_64BIT) | FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .OpDispatch = &IR::OpDispatchBuilder::Bind<&IR::OpDispatchBuilder::POPSegmentOp, FEXCore::X86Tables::DecodeFlags::FLAG_GS_PREFIX> } },
  },
}};

constexpr std::array SecondBaseOps = []()
consteval { std::array Table{}; constexpr U8U8InfoStruct TwoByteOpTable[] = { // Instructions {0x00, 1, X86InstInfo{"", TYPE_GROUP_6, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x01, 1, X86InstInfo{"", TYPE_GROUP_7, FLAGS_NO_OVERLAY, 0}}, // These two load segment register data {0x02, 1, X86InstInfo{"LAR", TYPE_UNDEC, FLAGS_NO_OVERLAY, 0}}, {0x03, 1, X86InstInfo{"LSL", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x04, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x05, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_NONE, 0, { .Indirect = Secondary_ArchSelect_LUT[ENTRY_05] }}}, {0x06, 1, X86InstInfo{"CLTS", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x07, 1, X86InstInfo{"SYSRET", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x08, 1, X86InstInfo{"INVD", TYPE_PRIV, FLAGS_NO_OVERLAY, 0}}, {0x09, 1, X86InstInfo{"WBINVD", TYPE_PRIV, FLAGS_NO_OVERLAY, 0}}, {0x0A, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x0B, 1, X86InstInfo{"UD2", TYPE_INST, FLAGS_BLOCK_END | FLAGS_NO_OVERLAY, 0}}, {0x0C, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x0D, 1, X86InstInfo{"", TYPE_GROUP_P, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x0E, 1, X86InstInfo{"FEMMS", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x0F, 1, X86InstInfo{"", TYPE_3DNOW_TABLE, FLAGS_NO_OVERLAY, 0}}, {0x10, 1, X86InstInfo{"MOVUPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x11, 1, X86InstInfo{"MOVUPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x12, 1, X86InstInfo{"MOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x13, 1, X86InstInfo{"MOVLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x14, 1, X86InstInfo{"UNPCKLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x15, 1, X86InstInfo{"UNPCKHPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | 
FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x16, 1, X86InstInfo{"MOVLHPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x17, 1, X86InstInfo{"MOVHPS", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x18, 1, X86InstInfo{"", TYPE_GROUP_16, FLAGS_NO_OVERLAY, 0}}, {0x19, 7, X86InstInfo{"NOP", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x20, 2, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_NO_OVERLAY, 0}}, {0x22, 2, X86InstInfo{"MOV", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_NO_OVERLAY, 0}}, {0x24, 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x28, 1, X86InstInfo{"MOVAPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x29, 1, X86InstInfo{"MOVAPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2A, 1, X86InstInfo{"CVTPI2PS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_SRC, 0}}, {0x2B, 1, X86InstInfo{"MOVNTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2C, 1, X86InstInfo{"CVTTPS2PI", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_DST, 0}}, {0x2D, 1, X86InstInfo{"CVTPS2PI", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_DST, 0}}, {0x2E, 1, X86InstInfo{"UCOMISS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x2F, 1, X86InstInfo{"COMISS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x30, 1, X86InstInfo{"WRMSR", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x31, 1, X86InstInfo{"RDTSC", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x32, 1, X86InstInfo{"RDMSR", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x33, 1, X86InstInfo{"RDPMC", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x34, 1, 
X86InstInfo{"SYSENTER", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x35, 1, X86InstInfo{"SYSEXIT", TYPE_INST, FLAGS_NO_OVERLAY, 0}}, {0x36, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x37, 1, X86InstInfo{"GETSEC", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x38, 1, X86InstInfo{"", TYPE_0F38_TABLE, FLAGS_NO_OVERLAY, 0}}, {0x39, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x3A, 1, X86InstInfo{"", TYPE_0F3A_TABLE, FLAGS_NO_OVERLAY, 0}}, {0x3B, 3, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0x40, 1, X86InstInfo{"CMOVO", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x41, 1, X86InstInfo{"CMOVNO", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x42, 1, X86InstInfo{"CMOVB", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x43, 1, X86InstInfo{"CMOVNB", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x44, 1, X86InstInfo{"CMOVZ", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x45, 1, X86InstInfo{"CMOVNZ", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x46, 1, X86InstInfo{"CMOVBE", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x47, 1, X86InstInfo{"CMOVNBE", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x48, 1, X86InstInfo{"CMOVS", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x49, 1, X86InstInfo{"CMOVNS", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4A, 1, X86InstInfo{"CMOVP", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4B, 1, X86InstInfo{"CMOVNP", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4C, 1, X86InstInfo{"CMOVL", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4D, 1, X86InstInfo{"CMOVNL", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4E, 1, X86InstInfo{"CMOVLE", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x4F, 1, X86InstInfo{"CMOVNLE", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0x50, 1, X86InstInfo{"MOVMSKPS", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x51, 1, X86InstInfo{"SQRTPS", TYPE_INST, 
GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x52, 1, X86InstInfo{"RSQRTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x53, 1, X86InstInfo{"RCPPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x54, 1, X86InstInfo{"ANDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x55, 1, X86InstInfo{"ANDNPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x56, 1, X86InstInfo{"ORPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x57, 1, X86InstInfo{"XORPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x58, 1, X86InstInfo{"ADDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x59, 1, X86InstInfo{"MULPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5A, 1, X86InstInfo{"CVTPS2PD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5B, 1, X86InstInfo{"CVTDQ2PS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5C, 1, X86InstInfo{"SUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5D, 1, X86InstInfo{"MINPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5E, 1, X86InstInfo{"DIVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5F, 1, X86InstInfo{"MAXPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x60, 1, X86InstInfo{"PUNPCKLBW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x61, 1, X86InstInfo{"PUNPCKLWD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x62, 1, X86InstInfo{"PUNPCKLDQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, 
{0x63, 1, X86InstInfo{"PACKSSWB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x64, 1, X86InstInfo{"PCMPGTB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x65, 1, X86InstInfo{"PCMPGTW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x66, 1, X86InstInfo{"PCMPGTD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x67, 1, X86InstInfo{"PACKUSWB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x68, 1, X86InstInfo{"PUNPCKHBW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x69, 1, X86InstInfo{"PUNPCKHWD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x6A, 1, X86InstInfo{"PUNPCKHDQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x6B, 1, X86InstInfo{"PACKSSDW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x6C, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x6E, 1, X86InstInfo{"MOVD", TYPE_INST, GenFlagsDstSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_SRC_GPR | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x6F, 1, X86InstInfo{"MOVQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x70, 1, X86InstInfo{"PSHUFW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {0x71, 1, X86InstInfo{"", TYPE_GROUP_12, FLAGS_NO_OVERLAY, 0}}, {0x72, 1, X86InstInfo{"", TYPE_GROUP_13, FLAGS_NO_OVERLAY, 0}}, {0x73, 1, X86InstInfo{"", TYPE_GROUP_14, FLAGS_NO_OVERLAY, 0}}, {0x74, 1, X86InstInfo{"PCMPEQB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x75, 1, X86InstInfo{"PCMPEQW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | 
FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x76, 1, X86InstInfo{"PCMPEQD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x77, 1, X86InstInfo{"EMMS", TYPE_INST, FLAGS_NONE, 0}}, {0x78, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x7E, 1, X86InstInfo{"MOVD", TYPE_INST, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x7F, 1, X86InstInfo{"MOVQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0x80, 1, X86InstInfo{"JO", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x81, 1, X86InstInfo{"JNO", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x82, 1, X86InstInfo{"JB", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x83, 1, X86InstInfo{"JNB", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x84, 1, X86InstInfo{"JZ", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x85, 1, X86InstInfo{"JNZ", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x86, 1, X86InstInfo{"JBE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x87, 1, X86InstInfo{"JNBE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x88, 1, X86InstInfo{"JS", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | 
FLAGS_NO_OVERLAY, 4}}, {0x89, 1, X86InstInfo{"JNS", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8A, 1, X86InstInfo{"JP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8B, 1, X86InstInfo{"JNP", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8C, 1, X86InstInfo{"JL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8D, 1, X86InstInfo{"JNL", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8E, 1, X86InstInfo{"JLE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x8F, 1, X86InstInfo{"JNLE", TYPE_INST, GenFlagsSameSize(SIZE_64BITDEF) | FLAGS_SETS_RIP | FLAGS_SRC_SEXT | FLAGS_DISPLACE_SIZE_DIV_2 | FLAGS_NO_OVERLAY, 4}}, {0x90, 1, X86InstInfo{"SETO", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x91, 1, X86InstInfo{"SETNO", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x92, 1, X86InstInfo{"SETB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x93, 1, X86InstInfo{"SETNB", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x94, 1, X86InstInfo{"SETZ", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x95, 1, X86InstInfo{"SETNZ", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x96, 1, X86InstInfo{"SETBE", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | 
FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x97, 1, X86InstInfo{"SETNBE", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x98, 1, X86InstInfo{"SETS", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x99, 1, X86InstInfo{"SETNS", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9A, 1, X86InstInfo{"SETP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9B, 1, X86InstInfo{"SETNP", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9C, 1, X86InstInfo{"SETL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9D, 1, X86InstInfo{"SETNL", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9E, 1, X86InstInfo{"SETLE", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0x9F, 1, X86InstInfo{"SETNLE", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xA0, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .Indirect = Secondary_ArchSelect_LUT[ENTRY_A0] }}}, {0xA1, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .Indirect = Secondary_ArchSelect_LUT[ENTRY_A1] }}}, {0xA2, 1, X86InstInfo{"CPUID", TYPE_INST, FLAGS_SF_SRC_RAX | FLAGS_NO_OVERLAY, 0}}, {0xA3, 1, X86InstInfo{"BT", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xA4, 1, X86InstInfo{"SHLD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 1}}, {0xA5, 1, X86InstInfo{"SHLD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX | FLAGS_NO_OVERLAY, 0}}, {0xA6, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0xA8, 1, X86InstInfo{"", 
TYPE_ARCH_DISPATCHER, FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .Indirect = Secondary_ArchSelect_LUT[ENTRY_A8] }}}, {0xA9, 1, X86InstInfo{"", TYPE_ARCH_DISPATCHER, FLAGS_DEBUG_MEM_ACCESS | FLAGS_NO_OVERLAY, 0, { .Indirect = Secondary_ArchSelect_LUT[ENTRY_A9] }}}, {0xAA, 1, X86InstInfo{"RSM", TYPE_PRIV, FLAGS_NO_OVERLAY, 0}}, {0xAB, 1, X86InstInfo{"BTS", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xAC, 1, X86InstInfo{"SHRD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 1}}, {0xAD, 1, X86InstInfo{"SHRD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_SRC_RCX | FLAGS_NO_OVERLAY, 0}}, {0xAE, 1, X86InstInfo{"", TYPE_GROUP_15, FLAGS_NO_OVERLAY, 0}}, {0xAF, 1, X86InstInfo{"IMUL", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0xB0, 1, X86InstInfo{"CMPXCHG", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xB1, 1, X86InstInfo{"CMPXCHG", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xB2, 1, X86InstInfo{"LSS", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0xB3, 1, X86InstInfo{"BTR", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xB4, 1, X86InstInfo{"LFS", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0xB5, 1, X86InstInfo{"LGS", TYPE_INVALID, FLAGS_NO_OVERLAY, 0}}, {0xB6, 1, X86InstInfo{"MOVZX", TYPE_INST, GenFlagsSrcSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0xB7, 1, X86InstInfo{"MOVZX", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0xB8, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xB9, 1, X86InstInfo{"", TYPE_GROUP_10, FLAGS_NO_OVERLAY, 0}}, {0xBA, 1, X86InstInfo{"", TYPE_GROUP_8, FLAGS_NO_OVERLAY, 0}}, {0xBB, 1, X86InstInfo{"BTC", TYPE_INST, FLAGS_DEBUG_MEM_ACCESS | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xBC, 1, X86InstInfo{"BSF", TYPE_INST, FLAGS_MODRM | FLAGS_NO_OVERLAY66, 0}}, {0xBD, 1, X86InstInfo{"BSR", TYPE_INST, 
FLAGS_MODRM | FLAGS_NO_OVERLAY66, 0}}, {0xBE, 1, X86InstInfo{"MOVSX", TYPE_INST, GenFlagsSrcSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0xBF, 1, X86InstInfo{"MOVSX", TYPE_INST, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_NO_OVERLAY, 0}}, {0xC0, 1, X86InstInfo{"XADD", TYPE_INST, GenFlagsSameSize(SIZE_8BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {0xC1, 1, X86InstInfo{"XADD", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_NO_OVERLAY, 0}}, {0xC2, 1, X86InstInfo{"CMPPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC3, 1, X86InstInfo{"MOVNTI", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST, 0}}, {0xC4, 1, X86InstInfo{"PINSRW", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX | FLAGS_SF_SRC_GPR, 1}}, {0xC5, 1, X86InstInfo{"PEXTRW", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 1}}, {0xC6, 1, X86InstInfo{"SHUFPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC7, 1, X86InstInfo{"", TYPE_GROUP_9, FLAGS_NO_OVERLAY, 0}}, {0xC8, 8, X86InstInfo{"BSWAP", TYPE_INST, FLAGS_SF_REX_IN_BYTE | FLAGS_NO_OVERLAY, 0}}, {0xD0, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xD1, 1, X86InstInfo{"PSRLW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD2, 1, X86InstInfo{"PSRLD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD3, 1, X86InstInfo{"PSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD4, 1, X86InstInfo{"PADDQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD5, 1, X86InstInfo{"PMULLW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD6, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 
0}}, {0xD7, 1, X86InstInfo{"PMOVMSKB", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR | FLAGS_SF_MMX_SRC, 0}}, {0xD8, 1, X86InstInfo{"PSUBUSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xD9, 1, X86InstInfo{"PSUBUSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDA, 1, X86InstInfo{"PMINUB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDB, 1, X86InstInfo{"PAND", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDC, 1, X86InstInfo{"PADDUSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDD, 1, X86InstInfo{"PADDUSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDE, 1, X86InstInfo{"PMAXUB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xDF, 1, X86InstInfo{"PANDN", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE0, 1, X86InstInfo{"PAVGB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE1, 1, X86InstInfo{"PSRAW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE2, 1, X86InstInfo{"PSRAD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE3, 1, X86InstInfo{"PAVGW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE4, 1, X86InstInfo{"PMULHUW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE5, 1, X86InstInfo{"PMULHW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE6, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE7, 
1, X86InstInfo{"MOVNTQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE8, 1, X86InstInfo{"PSUBSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xE9, 1, X86InstInfo{"PSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xEA, 1, X86InstInfo{"PMINSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xEB, 1, X86InstInfo{"POR", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xEC, 1, X86InstInfo{"PADDSB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xED, 1, X86InstInfo{"PADDSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xEE, 1, X86InstInfo{"PMAXSW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xEF, 1, X86InstInfo{"PXOR", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF0, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF1, 1, X86InstInfo{"PSLLW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF2, 1, X86InstInfo{"PSLLD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF3, 1, X86InstInfo{"PSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF4, 1, X86InstInfo{"PMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF5, 1, X86InstInfo{"PMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF6, 1, X86InstInfo{"PSADBW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF7, 1, X86InstInfo{"MASKMOVQ", TYPE_INST, 
GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF8, 1, X86InstInfo{"PSUBB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xF9, 1, X86InstInfo{"PSUBW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFA, 1, X86InstInfo{"PSUBD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFB, 1, X86InstInfo{"PSUBQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFC, 1, X86InstInfo{"PADDB", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFD, 1, X86InstInfo{"PADDW", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFE, 1, X86InstInfo{"PADDD", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX, 0}}, {0xFF, 1, X86InstInfo{"UD0", TYPE_INST, FLAGS_BLOCK_END, 0}}, #ifndef _WIN32 // FEX reserved instructions // Unused x86 encoding instruction. {0x3E, 1, X86InstInfo{"CALLBACKRET", TYPE_INST, FLAGS_BLOCK_END | FLAGS_NO_OVERLAY | FLAGS_SETS_RIP, 0}}, // This was originally used by VIA to jump to its alternative instruction set. 
Used for OP_THUNK {0x3F, 1, X86InstInfo{"ALTINST", TYPE_INST, FLAGS_BLOCK_END | FLAGS_NO_OVERLAY | FLAGS_SETS_RIP, 0}}, #endif }; GenerateTable(Table.data(), TwoByteOpTable, std::size(TwoByteOpTable)); IR::InstallToTable(Table, IR::OpDispatch_TwoByteOpTable); return Table; }();
// RepModOps: opcode-info table produced entirely at compile time by an
// immediately-invoked consteval lambda and returned by value.
// Each RepModOpTable row is {first opcode, row span, X86InstInfo}. The span
// appears to be the number of consecutive opcodes the row describes: every
// row starts where the previous one ends and together they cover 0x00-0xFF.
// The rows are expanded into Table by GenerateTableWithCopy, which is also
// handed SecondBaseOps.data(); afterwards IR::InstallToTable is called with
// IR::OpDispatch_SecondaryRepModTables.
// NOTE(review): TYPE_COPY_OTHER rows are presumably filled from SecondBaseOps
// by GenerateTableWithCopy - confirm against that helper.
// NOTE(review): the mnemonics (MOVSS, CVTSI2SS, POPCNT, ...) suggest this is
// the REP (F3) prefixed view of the two-byte opcode map - confirm with the decoder.
// NOTE(review): the last X86InstInfo field looks like the number of trailing
// immediate bytes (1 for PSHUFHW and CMPSS) - confirm in X86Tables.h.
constexpr std::array RepModOps = []() consteval { std::array Table{}; constexpr U8U8InfoStruct RepModOpTable[] = { {0x0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x10, 1, X86InstInfo{"MOVSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x11, 1, X86InstInfo{"MOVSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x12, 1, X86InstInfo{"MOVSLDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x13, 3, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x16, 1, X86InstInfo{"MOVSHDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x17, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x19, 7, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x20, 4, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x24, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x2A, 1, X86InstInfo{"CVTSI2SS", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 0}}, {0x2B, 1, X86InstInfo{"MOVNTSS", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2C, 1, X86InstInfo{"CVTTSS2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x2D, 1, X86InstInfo{"CVTSS2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x2E, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x30, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x40, 16, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x50, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x51, 1, X86InstInfo{"SQRTSS", TYPE_INST, 
GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x52, 1, X86InstInfo{"RSQRTSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x53, 1, X86InstInfo{"RCPSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x54, 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x58, 1, X86InstInfo{"ADDSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x59, 1, X86InstInfo{"MULSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5A, 1, X86InstInfo{"CVTSS2SD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5B, 1, X86InstInfo{"CVTTPS2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5C, 1, X86InstInfo{"SUBSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5D, 1, X86InstInfo{"MINSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5E, 1, X86InstInfo{"DIVSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5F, 1, X86InstInfo{"MAXSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x60, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x68, 7, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x6F, 1, X86InstInfo{"MOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x70, 1, X86InstInfo{"PSHUFHW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0x71, 3, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x74, 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x78, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x7E, 1, X86InstInfo{"MOVQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x7F, 1, X86InstInfo{"MOVDQU", TYPE_INST, 
GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x80, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x90, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xA0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB0, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB8, 1, X86InstInfo{"POPCNT", TYPE_INST, FLAGS_MODRM, 0}}, {0xB9, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBA, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBB, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xBC, 1, X86InstInfo{"TZCNT", TYPE_INST, FLAGS_MODRM, 0}}, {0xBD, 1, X86InstInfo{"LZCNT", TYPE_INST, FLAGS_MODRM, 0}}, {0xBE, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC0, 2, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xC2, 1, X86InstInfo{"CMPSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC3, 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC8, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xD0, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xD6, 1, X86InstInfo{"MOVQ2DQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_SRC, 0}}, {0xD7, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xD8, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE0, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE6, 1, X86InstInfo{"CVTDQ2PD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE7, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE8, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF0, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF8, 7, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xFF, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, }; GenerateTableWithCopy(Table.data(), RepModOpTable, std::size(RepModOpTable), SecondBaseOps.data()); IR::InstallToTable(Table, IR::OpDispatch_SecondaryRepModTables); return 
Table; }();
// RepNEModOps: opcode-info table produced entirely at compile time by an
// immediately-invoked consteval lambda and returned by value.
// Each RepNEModOpTable row is {first opcode, row span, X86InstInfo}. The span
// appears to be the number of consecutive opcodes the row describes: every
// row starts where the previous one ends and together they cover 0x00-0xFF.
// The rows are expanded into Table by GenerateTableWithCopy, which is also
// handed SecondBaseOps.data(); afterwards IR::InstallToTable is called with
// IR::OpDispatch_SecondaryRepNEModTables.
// NOTE(review): TYPE_COPY_OTHER rows are presumably filled from SecondBaseOps
// by GenerateTableWithCopy - confirm against that helper.
// NOTE(review): the mnemonics (MOVSD, CVTSI2SD, LDDQU, ...) suggest this is
// the REPNE (F2) prefixed view of the two-byte opcode map - confirm with the decoder.
// NOTE(review): the last X86InstInfo field looks like the number of trailing
// immediate bytes (1 for PSHUFLW/CMPSD, 2 for the 0x78 INSERTQ form) - confirm in X86Tables.h.
constexpr std::array RepNEModOps = []() consteval { std::array Table{}; constexpr U8U8InfoStruct RepNEModOpTable[] = { {0x0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x10, 1, X86InstInfo{"MOVSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x11, 1, X86InstInfo{"MOVSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x12, 1, X86InstInfo{"MOVDDUP", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x13, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x19, 7, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x20, 4, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x24, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x2A, 1, X86InstInfo{"CVTSI2SD", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 0}}, {0x2B, 1, X86InstInfo{"MOVNTSD", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2C, 1, X86InstInfo{"CVTTSD2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x2D, 1, X86InstInfo{"CVTSD2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x2E, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x30, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x40, 16, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x50, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x51, 1, X86InstInfo{"SQRTSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x52, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x58, 1, X86InstInfo{"ADDSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x59, 1, X86InstInfo{"MULSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5A, 1, X86InstInfo{"CVTSD2SS", TYPE_INST, 
GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5B, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x5C, 1, X86InstInfo{"SUBSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5D, 1, X86InstInfo{"MINSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5E, 1, X86InstInfo{"DIVSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5F, 1, X86InstInfo{"MAXSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x60, 16, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x70, 1, X86InstInfo{"PSHUFLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0x71, 3, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x74, 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x78, 1, X86InstInfo{"INSERTQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS,2}}, {0x79, 1, X86InstInfo{"INSERTQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0}}, {0x7A, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x7C, 1, X86InstInfo{"HADDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x7D, 1, X86InstInfo{"HSUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x7E, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x80, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x90, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xA0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB0, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB8, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xB9, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBA, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBB, 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC0, 2, X86InstInfo{"", 
TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xC2, 1, X86InstInfo{"CMPSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC3, 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC8, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xD0, 1, X86InstInfo{"ADDSUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD1, 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xD6, 1, X86InstInfo{"MOVDQ2Q", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_DST, 0}}, {0xD7, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xD8, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE0, 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE6, 1, X86InstInfo{"CVTPD2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE7, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xE8, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF0, 1, X86InstInfo{"LDDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS,0}}, {0xF1, 7, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF8, 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, }; GenerateTableWithCopy(Table.data(), RepNEModOpTable, std::size(RepNEModOpTable), SecondBaseOps.data()); IR::InstallToTable(Table, IR::OpDispatch_SecondaryRepNEModTables); return Table; }();
// OpSizeModOps: opcode-info table produced entirely at compile time by an
// immediately-invoked consteval lambda and returned by value.
// Each OpSizeModOpTable row is {first opcode, row span, X86InstInfo}. The span
// appears to be the number of consecutive opcodes the row describes: every
// row starts where the previous one ends and together they cover 0x00-0xFF.
// The rows are expanded into Table by GenerateTableWithCopy, which is also
// handed SecondBaseOps.data(); afterwards IR::InstallToTable is called with
// IR::OpDispatch_SecondaryOpSizeModTables.
// NOTE(review): TYPE_COPY_OTHER rows are presumably filled from SecondBaseOps
// by GenerateTableWithCopy - confirm against that helper.
// NOTE(review): the mnemonics (MOVUPD, MOVDQA, PADDD on 128-bit operands, ...)
// suggest this is the operand-size (66) prefixed view of the two-byte opcode
// map - confirm with the decoder.
// NOTE(review): the last X86InstInfo field looks like the number of trailing
// immediate bytes (1 for PSHUFD, CMPPD, PINSRW, PEXTRW, SHUFPD) - confirm in X86Tables.h.
constexpr std::array OpSizeModOps = []() consteval { std::array Table{}; constexpr U8U8InfoStruct OpSizeModOpTable[] = { {0x0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x10, 1, X86InstInfo{"MOVUPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x11, 1, X86InstInfo{"MOVUPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x12, 1, X86InstInfo{"MOVLPD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {0x13, 1, 
X86InstInfo{"MOVLPD", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x14, 1, X86InstInfo{"UNPCKLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x15, 1, X86InstInfo{"UNPCKHPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x16, 1, X86InstInfo{"MOVHPD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {0x17, 1, X86InstInfo{"MOVHPD", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x18, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x19, 7, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x20, 4, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x24, 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x28, 1, X86InstInfo{"MOVAPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x29, 1, X86InstInfo{"MOVAPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2A, 1, X86InstInfo{"CVTPI2PD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_SRC, 0}}, {0x2B, 1, X86InstInfo{"MOVNTPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x2C, 1, X86InstInfo{"CVTTPD2PI", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_DST, 0}}, {0x2D, 1, X86InstInfo{"CVTPD2PI", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MMX_DST, 0}}, {0x2E, 1, X86InstInfo{"UCOMISD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x2F, 1, X86InstInfo{"COMISD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x30, 16, 
X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x40, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x50, 1, X86InstInfo{"MOVMSKPD", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0x51, 1, X86InstInfo{"SQRTPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x52, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x54, 1, X86InstInfo{"ANDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x55, 1, X86InstInfo{"ANDNPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x56, 1, X86InstInfo{"ORPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x57, 1, X86InstInfo{"XORPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x58, 1, X86InstInfo{"ADDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x59, 1, X86InstInfo{"MULPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5A, 1, X86InstInfo{"CVTPD2PS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5B, 1, X86InstInfo{"CVTPS2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5C, 1, X86InstInfo{"SUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5D, 1, X86InstInfo{"MINPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5E, 1, X86InstInfo{"DIVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x5F, 1, X86InstInfo{"MAXPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x60, 1, X86InstInfo{"PUNPCKLBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x61, 1, X86InstInfo{"PUNPCKLWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, 
{0x62, 1, X86InstInfo{"PUNPCKLDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x63, 1, X86InstInfo{"PACKSSWB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x64, 1, X86InstInfo{"PCMPGTB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x65, 1, X86InstInfo{"PCMPGTW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x66, 1, X86InstInfo{"PCMPGTD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x67, 1, X86InstInfo{"PACKUSWB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x68, 1, X86InstInfo{"PUNPCKHBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x69, 1, X86InstInfo{"PUNPCKHWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x6A, 1, X86InstInfo{"PUNPCKHDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x6B, 1, X86InstInfo{"PACKSSDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x6C, 1, X86InstInfo{"PUNPCKLQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x6D, 1, X86InstInfo{"PUNPCKHQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x6E, 1, X86InstInfo{"MOVD", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_SRC_GPR | FLAGS_XMM_FLAGS, 0}}, {0x6F, 1, X86InstInfo{"MOVDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x70, 1, X86InstInfo{"PSHUFD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0x71, 3, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x74, 1, X86InstInfo{"PCMPEQB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x75, 1, X86InstInfo{"PCMPEQW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, 
{0x76, 1, X86InstInfo{"PCMPEQD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x77, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x78, 1, X86InstInfo{"", TYPE_GROUP_17, FLAGS_NONE, 0}}, {0x79, 1, X86InstInfo{"EXTRQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0}}, {0x7A, 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0x7C, 1, X86InstInfo{"HADDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x7D, 1, X86InstInfo{"HSUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0x7E, 1, X86InstInfo{"MOVD", TYPE_INST, GenFlagsSrcSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 0}}, {0x7F, 1, X86InstInfo{"MOVDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0x80, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0x90, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xA0, 16, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB0, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xB8, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xB9, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBA, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xBB, 5, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC0, 2, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xC2, 1, X86InstInfo{"CMPPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC3, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC4, 1, X86InstInfo{"PINSRW", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_SRC_GPR | FLAGS_XMM_FLAGS, 1}}, {0xC5, 1, X86InstInfo{"PEXTRW", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {0xC6, 1, X86InstInfo{"SHUFPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | 
FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {0xC7, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xC8, 8, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, {0xD0, 1, X86InstInfo{"ADDSUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD1, 1, X86InstInfo{"PSRLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD2, 1, X86InstInfo{"PSRLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD3, 1, X86InstInfo{"PSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD4, 1, X86InstInfo{"PADDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD5, 1, X86InstInfo{"PMULLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD6, 1, X86InstInfo{"MOVQ", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0xD7, 1, X86InstInfo{"PMOVMSKB", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {0xD8, 1, X86InstInfo{"PSUBUSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xD9, 1, X86InstInfo{"PSUBUSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDA, 1, X86InstInfo{"PMINUB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDB, 1, X86InstInfo{"PAND", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDC, 1, X86InstInfo{"PADDUSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDD, 1, X86InstInfo{"PADDUSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDE, 1, X86InstInfo{"PMAXUB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xDF, 1, X86InstInfo{"PANDN", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | 
FLAGS_XMM_FLAGS, 0}}, {0xE0, 1, X86InstInfo{"PAVGB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE1, 1, X86InstInfo{"PSRAW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE2, 1, X86InstInfo{"PSRAD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE3, 1, X86InstInfo{"PAVGW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE4, 1, X86InstInfo{"PMULHUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE5, 1, X86InstInfo{"PMULHW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE6, 1, X86InstInfo{"CVTTPD2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE7, 1, X86InstInfo{"MOVNTDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {0xE8, 1, X86InstInfo{"PSUBSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xE9, 1, X86InstInfo{"PSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xEA, 1, X86InstInfo{"PMINSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xEB, 1, X86InstInfo{"POR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xEC, 1, X86InstInfo{"PADDSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xED, 1, X86InstInfo{"PADDSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xEE, 1, X86InstInfo{"PMAXSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xEF, 1, X86InstInfo{"PXOR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF0, 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {0xF1, 1, X86InstInfo{"PSLLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 
0}}, {0xF2, 1, X86InstInfo{"PSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF3, 1, X86InstInfo{"PSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF4, 1, X86InstInfo{"PMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF5, 1, X86InstInfo{"PMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF6, 1, X86InstInfo{"PSADBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF7, 1, X86InstInfo{"MASKMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS, 0}}, {0xF8, 1, X86InstInfo{"PSUBB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xF9, 1, X86InstInfo{"PSUBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFA, 1, X86InstInfo{"PSUBD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFB, 1, X86InstInfo{"PSUBQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFC, 1, X86InstInfo{"PADDB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFD, 1, X86InstInfo{"PADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFE, 1, X86InstInfo{"PADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {0xFF, 1, X86InstInfo{"", TYPE_COPY_OTHER, FLAGS_NONE, 0}}, }; GenerateTableWithCopy(Table.data(), OpSizeModOpTable, std::size(OpSizeModOpTable), SecondBaseOps.data()); IR::InstallToTable(Table, IR::OpDispatch_SecondaryOpSizeModTables); return Table; }(); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/VEXTables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include 
"Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher/VEXTables.h" #include namespace FEXCore::X86Tables { using namespace InstFlags; namespace AVX128 { using namespace IR; #define OPD(map_select, pp, opcode) (((map_select - 1) << 10) | (pp << 8) | (opcode)) constexpr DispatchTableEntry BaseTable[] = { {OPD(1, 0b00, 0x10), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b01, 0x10), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x10), 1, &OpDispatchBuilder::AVX128_VMOVSS}, {OPD(1, 0b11, 0x10), 1, &OpDispatchBuilder::AVX128_VMOVSD}, {OPD(1, 0b00, 0x11), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b01, 0x11), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x11), 1, &OpDispatchBuilder::AVX128_VMOVSS}, {OPD(1, 0b11, 0x11), 1, &OpDispatchBuilder::AVX128_VMOVSD}, {OPD(1, 0b00, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP}, {OPD(1, 0b01, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVLP}, {OPD(1, 0b10, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVSLDUP}, {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::AVX128_VMOVDDUP}, {OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP}, {OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP}, {OPD(1, 0b00, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i32Bit>}, {OPD(1, 0b01, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i64Bit>}, {OPD(1, 0b00, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i32Bit>}, {OPD(1, 0b01, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i64Bit>}, {OPD(1, 0b00, 0x16), 1, &OpDispatchBuilder::AVX128_VMOVHP}, {OPD(1, 0b01, 0x16), 1, &OpDispatchBuilder::AVX128_VMOVHP}, {OPD(1, 0b10, 0x16), 1, &OpDispatchBuilder::AVX128_VMOVSHDUP}, {OPD(1, 0b00, 0x17), 1, &OpDispatchBuilder::AVX128_VMOVHP}, {OPD(1, 0b01, 0x17), 1, &OpDispatchBuilder::AVX128_VMOVHP}, {OPD(1, 0b00, 0x28), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b01, 0x28), 1, 
&OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b00, 0x29), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b01, 0x29), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x2A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR, OpSize::i32Bit>}, {OPD(1, 0b11, 0x2A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR, OpSize::i64Bit>}, {OPD(1, 0b00, 0x2B), 1, &OpDispatchBuilder::AVX128_MOVVectorNT}, {OPD(1, 0b01, 0x2B), 1, &OpDispatchBuilder::AVX128_MOVVectorNT}, {OPD(1, 0b10, 0x2C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_CVTFPR_To_GPR, OpSize::i32Bit, false>}, {OPD(1, 0b11, 0x2C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_CVTFPR_To_GPR, OpSize::i64Bit, false>}, {OPD(1, 0b10, 0x2D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_CVTFPR_To_GPR, OpSize::i32Bit, true>}, {OPD(1, 0b11, 0x2D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_CVTFPR_To_GPR, OpSize::i64Bit, true>}, {OPD(1, 0b00, 0x2E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_UCOMISx, OpSize::i32Bit>}, {OPD(1, 0b01, 0x2E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_UCOMISx, OpSize::i64Bit>}, {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_UCOMISx, OpSize::i32Bit>}, {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_UCOMISx, OpSize::i64Bit>}, {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_MOVMSK, OpSize::i32Bit>}, {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_MOVMSK, OpSize::i64Bit>}, {OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i32Bit>}, {OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFSQRT, OpSize::i64Bit>}, {OPD(1, 0b10, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFSQRTSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 
0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFSQRTSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x52), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFRSQRT, OpSize::i32Bit>}, {OPD(1, 0b10, 0x52), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFRSQRTSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b00, 0x53), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VFRECP, OpSize::i32Bit>}, {OPD(1, 0b10, 0x53), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFRECPSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b00, 0x54), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b01, 0x54), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b00, 0x55), 1, &OpDispatchBuilder::AVX128_VANDN}, {OPD(1, 0b01, 0x55), 1, &OpDispatchBuilder::AVX128_VANDN}, {OPD(1, 0b00, 0x56), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b01, 0x56), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b00, 0x57), 1, &OpDispatchBuilder::AVX128_VectorXOR}, {OPD(1, 0b01, 0x57), 1, &OpDispatchBuilder::AVX128_VectorXOR}, {OPD(1, 0b00, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFADD, OpSize::i32Bit>}, {OPD(1, 0b01, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFADD, OpSize::i64Bit>}, {OPD(1, 0b10, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFADDSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFADDSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, 
IR::OP_VFMUL, OpSize::i32Bit>}, {OPD(1, 0b01, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFMUL, OpSize::i64Bit>}, {OPD(1, 0b10, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMULSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMULSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float, OpSize::i64Bit, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float, OpSize::i32Bit, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalar_CVT_Float_To_Float, OpSize::i64Bit, OpSize::i32Bit>}, {OPD(1, 0b11, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalar_CVT_Float_To_Float, OpSize::i32Bit, OpSize::i64Bit>}, {OPD(1, 0b00, 0x5B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float, OpSize::i32Bit, false>}, {OPD(1, 0b01, 0x5B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int, OpSize::i32Bit, true>}, {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int, OpSize::i32Bit, false>}, {OPD(1, 0b00, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFSUB, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFSUBSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFSUBSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFMIN, 
OpSize::i32Bit>}, {OPD(1, 0b01, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFMIN, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMINSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMINSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFDIV, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFDIV, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFDIVSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFDIVSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b00, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFMAX, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFMAX, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMAXSCALARINSERT, OpSize::i32Bit>}, {OPD(1, 0b11, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorScalarInsertALU, IR::OP_VFMAXSCALARINSERT, OpSize::i64Bit>}, {OPD(1, 0b01, 0x60), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i8Bit>}, {OPD(1, 0b01, 0x61), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i16Bit>}, {OPD(1, 0b01, 0x62), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i32Bit>}, {OPD(1, 0b01, 0x63), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPACKSS, OpSize::i16Bit>}, {OPD(1, 0b01, 0x64), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPGT, 
OpSize::i8Bit>}, {OPD(1, 0b01, 0x65), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPGT, OpSize::i16Bit>}, {OPD(1, 0b01, 0x66), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPGT, OpSize::i32Bit>}, {OPD(1, 0b01, 0x67), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPACKUS, OpSize::i16Bit>}, {OPD(1, 0b01, 0x68), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i8Bit>}, {OPD(1, 0b01, 0x69), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i16Bit>}, {OPD(1, 0b01, 0x6A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i32Bit>}, {OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPACKSS, OpSize::i32Bit>}, {OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKL, OpSize::i64Bit>}, {OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPUNPCKH, OpSize::i64Bit>}, {OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR}, {OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPERMILImm, OpSize::i32Bit>}, {OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, false>}, {OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSHUFW, true>}, {OPD(1, 0b01, 0x74), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, OpSize::i8Bit>}, {OPD(1, 0b01, 0x75), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, OpSize::i16Bit>}, {OPD(1, 0b01, 0x76), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, OpSize::i32Bit>}, {OPD(1, 0b00, 0x77), 1, &OpDispatchBuilder::AVX128_VZERO}, {OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHADDP, IR::OP_VFADDP, 
OpSize::i64Bit>}, {OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHADDP, IR::OP_VFADDP, OpSize::i32Bit>}, {OPD(1, 0b01, 0x7D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHSUBP, OpSize::i64Bit>}, {OPD(1, 0b11, 0x7D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHSUBP, OpSize::i32Bit>}, {OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::AVX128_MOVBetweenGPR_FPR}, {OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::AVX128_MOVQ}, {OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::AVX128_VMOVAPS}, {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFCMP, OpSize::i32Bit>}, {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFCMP, OpSize::i64Bit>}, {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalarFCMP, OpSize::i32Bit>}, {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalarFCMP, OpSize::i64Bit>}, {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::AVX128_VPINSRW}, {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_PExtr, OpSize::i16Bit>}, {OPD(1, 0b00, 0xC6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VSHUF, OpSize::i32Bit>}, {OPD(1, 0b01, 0xC6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VSHUF, OpSize::i64Bit>}, {OPD(1, 0b01, 0xD0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VADDSUBP, OpSize::i64Bit>}, {OPD(1, 0b11, 0xD0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VADDSUBP, OpSize::i32Bit>}, {OPD(1, 0b01, 0xD1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i16Bit, IROps::OP_VUSHRSWIDE>}, // VPSRL {OPD(1, 0b01, 0xD2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i32Bit, IROps::OP_VUSHRSWIDE>}, // VPSRL {OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i64Bit, 
IROps::OP_VUSHRSWIDE>}, // VPSRL {OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VADD, OpSize::i64Bit>}, {OPD(1, 0b01, 0xD5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VMUL, OpSize::i16Bit>}, {OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::AVX128_MOVQ}, {OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::AVX128_MOVMSKB}, {OPD(1, 0b01, 0xD8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUQSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xD9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUQSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xDA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMIN, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b01, 0xDC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUQADD, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUQADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xDE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMAX, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDF), 1, &OpDispatchBuilder::AVX128_VANDN}, {OPD(1, 0b01, 0xE0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VURAVG, OpSize::i8Bit>}, {OPD(1, 0b01, 0xE1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i16Bit, IROps::OP_VSSHRSWIDE>}, // VPSRA {OPD(1, 0b01, 0xE2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i32Bit, IROps::OP_VSSHRSWIDE>}, // VPSRA {OPD(1, 0b01, 0xE3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VURAVG, OpSize::i16Bit>}, {OPD(1, 0b01, 0xE4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMULHW, false>}, {OPD(1, 0b01, 0xE5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMULHW, true>}, 
{OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int, OpSize::i64Bit, false>}, {OPD(1, 0b10, 0xE6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float, OpSize::i32Bit, true>}, {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int, OpSize::i64Bit, true>}, {OPD(1, 0b01, 0xE7), 1, &OpDispatchBuilder::AVX128_MOVVectorNT}, {OPD(1, 0b01, 0xE8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSQSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSQSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMIN, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b01, 0xEC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSQADD, OpSize::i8Bit>}, {OPD(1, 0b01, 0xED), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSQADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMAX, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEF), 1, &OpDispatchBuilder::AVX128_VectorXOR}, {OPD(1, 0b11, 0xF0), 1, &OpDispatchBuilder::AVX128_MOVVectorUnaligned}, {OPD(1, 0b01, 0xF1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i16Bit, IROps::OP_VUSHLSWIDE>}, // VPSLL {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i32Bit, IROps::OP_VUSHLSWIDE>}, // VPSLL {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftWideImpl, OpSize::i64Bit, IROps::OP_VUSHLSWIDE>}, // VPSLL {OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMULL, OpSize::i32Bit, false>}, {OPD(1, 0b01, 0xF5), 1, 
&OpDispatchBuilder::AVX128_VPMADDWD}, {OPD(1, 0b01, 0xF6), 1, &OpDispatchBuilder::AVX128_VPSADBW}, {OPD(1, 0b01, 0xF7), 1, &OpDispatchBuilder::AVX128_MASKMOV}, {OPD(1, 0b01, 0xF8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xF9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xFA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0xFB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSUB, OpSize::i64Bit>}, {OPD(1, 0b01, 0xFC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VADD, OpSize::i8Bit>}, {OPD(1, 0b01, 0xFD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xFE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VADD, OpSize::i32Bit>}, {OPD(2, 0b01, 0x00), 1, &OpDispatchBuilder::AVX128_VPSHUFB}, {OPD(2, 0b01, 0x01), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHADDP, IR::OP_VADDP, OpSize::i16Bit>}, {OPD(2, 0b01, 0x02), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VHADDP, IR::OP_VADDP, OpSize::i32Bit>}, {OPD(2, 0b01, 0x03), 1, &OpDispatchBuilder::AVX128_VPHADDSW}, {OPD(2, 0b01, 0x04), 1, &OpDispatchBuilder::AVX128_VPMADDUBSW}, {OPD(2, 0b01, 0x05), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPHSUB, OpSize::i16Bit>}, {OPD(2, 0b01, 0x06), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPHSUB, OpSize::i32Bit>}, {OPD(2, 0b01, 0x07), 1, &OpDispatchBuilder::AVX128_VPHSUBSW}, {OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSIGN, OpSize::i8Bit>}, {OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSIGN, OpSize::i16Bit>}, {OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPSIGN, OpSize::i32Bit>}, {OPD(2, 0b01, 0x0B), 1, 
&OpDispatchBuilder::AVX128_VPMULHRSW}, {OPD(2, 0b01, 0x0C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPERMILReg, OpSize::i32Bit>}, {OPD(2, 0b01, 0x0D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPERMILReg, OpSize::i64Bit>}, {OPD(2, 0b01, 0x0E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VTESTP, OpSize::i32Bit>}, {OPD(2, 0b01, 0x0F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VTESTP, OpSize::i64Bit>}, {OPD(2, 0b01, 0x13), 1, &OpDispatchBuilder::AVX128_VCVTPH2PS}, {OPD(2, 0b01, 0x16), 1, &OpDispatchBuilder::AVX128_VPERMD}, {OPD(2, 0b01, 0x17), 1, &OpDispatchBuilder::AVX128_PTest}, {OPD(2, 0b01, 0x18), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i32Bit>}, {OPD(2, 0b01, 0x19), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i64Bit>}, {OPD(2, 0b01, 0x1A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i128Bit>}, {OPD(2, 0b01, 0x1C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VABS, OpSize::i8Bit>}, {OPD(2, 0b01, 0x1D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VABS, OpSize::i16Bit>}, {OPD(2, 0b01, 0x1E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorUnary, IR::OP_VABS, OpSize::i32Bit>}, {OPD(2, 0b01, 0x20), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i16Bit, true>}, {OPD(2, 0b01, 0x21), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i32Bit, true>}, {OPD(2, 0b01, 0x22), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i64Bit, true>}, {OPD(2, 0b01, 0x23), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i16Bit, OpSize::i32Bit, true>}, {OPD(2, 0b01, 0x24), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i16Bit, OpSize::i64Bit, 
true>}, {OPD(2, 0b01, 0x25), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i32Bit, OpSize::i64Bit, true>}, {OPD(2, 0b01, 0x28), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMULL, OpSize::i32Bit, true>}, {OPD(2, 0b01, 0x29), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPEQ, OpSize::i64Bit>}, {OPD(2, 0b01, 0x2A), 1, &OpDispatchBuilder::AVX128_MOVVectorNT}, {OPD(2, 0b01, 0x2B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPACKUS, OpSize::i32Bit>}, {OPD(2, 0b01, 0x2C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VMASKMOV, OpSize::i32Bit, false>}, {OPD(2, 0b01, 0x2D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VMASKMOV, OpSize::i64Bit, false>}, {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VMASKMOV, OpSize::i32Bit, true>}, {OPD(2, 0b01, 0x2F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VMASKMOV, OpSize::i64Bit, true>}, {OPD(2, 0b01, 0x30), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i16Bit, false>}, {OPD(2, 0b01, 0x31), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i32Bit, false>}, {OPD(2, 0b01, 0x32), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i8Bit, OpSize::i64Bit, false>}, {OPD(2, 0b01, 0x33), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i16Bit, OpSize::i32Bit, false>}, {OPD(2, 0b01, 0x34), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i16Bit, OpSize::i64Bit, false>}, {OPD(2, 0b01, 0x35), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ExtendVectorElements, OpSize::i32Bit, OpSize::i64Bit, false>}, {OPD(2, 0b01, 0x36), 1, &OpDispatchBuilder::AVX128_VPERMD}, {OPD(2, 0b01, 0x37), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VCMPGT, OpSize::i64Bit>}, 
{OPD(2, 0b01, 0x38), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMIN, OpSize::i8Bit>}, {OPD(2, 0b01, 0x39), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMIN, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMIN, OpSize::i16Bit>}, {OPD(2, 0b01, 0x3B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMIN, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMAX, OpSize::i8Bit>}, {OPD(2, 0b01, 0x3D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VSMAX, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMAX, OpSize::i16Bit>}, {OPD(2, 0b01, 0x3F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VUMAX, OpSize::i32Bit>}, {OPD(2, 0b01, 0x40), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VMUL, OpSize::i32Bit>}, {OPD(2, 0b01, 0x41), 1, &OpDispatchBuilder::AVX128_PHMINPOSUW}, {OPD(2, 0b01, 0x45), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VariableShiftImpl, IROps::OP_VUSHR>}, // VPSRLV {OPD(2, 0b01, 0x46), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VariableShiftImpl, IROps::OP_VSSHR>}, // VPSRAVD {OPD(2, 0b01, 0x47), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VariableShiftImpl, IROps::OP_VUSHL>}, // VPSLLV {OPD(2, 0b01, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i32Bit>}, {OPD(2, 0b01, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i64Bit>}, {OPD(2, 0b01, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i128Bit>}, {OPD(2, 0b01, 0x78), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i8Bit>}, {OPD(2, 0b01, 0x79), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBROADCAST, OpSize::i16Bit>}, {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMASKMOV, false>}, {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPMASKMOV, true>}, {OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPGATHER, OpSize::i32Bit>}, {OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPGATHER, OpSize::i64Bit>}, {OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPGATHER, OpSize::i32Bit>}, {OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPGATHER, OpSize::i64Bit>}, {OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, true, 1, 3, 2>}, // VFMADDSUB {OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, false, 1, 3, 2>}, // VFMSUBADD {OPD(2, 0b01, 0x98), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 1, 3, 2>}, // VFMADD {OPD(2, 0b01, 0x99), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 1, 3, 2>}, // VFMADD {OPD(2, 0b01, 0x9A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 1, 3, 2>}, // VFMSUB {OPD(2, 0b01, 0x9B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 1, 3, 2>}, // VFMSUB {OPD(2, 0b01, 0x9C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 1, 3, 2>}, // VFNMADD {OPD(2, 0b01, 0x9D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 1, 3, 2>}, // VFNMADD {OPD(2, 0b01, 0x9E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 1, 3, 2>}, // VFNMSUB {OPD(2, 0b01, 0x9F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 1, 3, 2>}, // VFNMSUB {OPD(2, 0b01, 
0xA8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 2, 1, 3>}, // VFMADD {OPD(2, 0b01, 0xA9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 2, 1, 3>}, // VFMADD {OPD(2, 0b01, 0xAA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 2, 1, 3>}, // VFMSUB {OPD(2, 0b01, 0xAB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 2, 1, 3>}, // VFMSUB {OPD(2, 0b01, 0xAC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 2, 1, 3>}, // VFNMADD {OPD(2, 0b01, 0xAD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 2, 1, 3>}, // VFNMADD {OPD(2, 0b01, 0xAE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 2, 1, 3>}, // VFNMSUB {OPD(2, 0b01, 0xAF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 2, 1, 3>}, // VFNMSUB {OPD(2, 0b01, 0xB8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 2, 3, 1>}, // VFMADD {OPD(2, 0b01, 0xB9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 2, 3, 1>}, // VFMADD {OPD(2, 0b01, 0xBA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 2, 3, 1>}, // VFMSUB {OPD(2, 0b01, 0xBB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 2, 3, 1>}, // VFMSUB {OPD(2, 0b01, 0xBC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 2, 3, 1>}, // VFNMADD {OPD(2, 0b01, 0xBD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 2, 3, 1>}, // VFNMADD {OPD(2, 0b01, 0xBE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 2, 3, 1>}, // VFNMSUB {OPD(2, 0b01, 0xBF), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 2, 3, 1>}, // VFNMSUB {OPD(2, 0b01, 0xA6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, true, 2, 1, 3>}, // VFMADDSUB {OPD(2, 0b01, 0xA7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, false, 2, 1, 3>}, // VFMSUBADD {OPD(2, 0b01, 0xB6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, true, 2, 3, 1>}, // VFMADDSUB {OPD(2, 0b01, 0xB7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, false, 2, 3, 1>}, // VFMSUBADD {OPD(2, 0b01, 0xDB), 1, &OpDispatchBuilder::AVX128_VAESImc}, {OPD(2, 0b01, 0xDC), 1, &OpDispatchBuilder::AVX128_VAESEnc}, {OPD(2, 0b01, 0xDD), 1, &OpDispatchBuilder::AVX128_VAESEncLast}, {OPD(2, 0b01, 0xDE), 1, &OpDispatchBuilder::AVX128_VAESDec}, {OPD(2, 0b01, 0xDF), 1, &OpDispatchBuilder::AVX128_VAESDecLast}, {OPD(3, 0b01, 0x00), 1, &OpDispatchBuilder::AVX128_VPERMQ}, {OPD(3, 0b01, 0x01), 1, &OpDispatchBuilder::AVX128_VPERMQ}, {OPD(3, 0b01, 0x02), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBLEND, OpSize::i32Bit>}, {OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPERMILImm, OpSize::i32Bit>}, {OPD(3, 0b01, 0x05), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VPERMILImm, OpSize::i64Bit>}, {OPD(3, 0b01, 0x06), 1, &OpDispatchBuilder::AVX128_VPERM2}, {OPD(3, 0b01, 0x08), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorRound, OpSize::i32Bit>}, {OPD(3, 0b01, 0x09), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorRound, OpSize::i64Bit>}, {OPD(3, 0b01, 0x0A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalarRound, OpSize::i32Bit>}, {OPD(3, 0b01, 0x0B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_InsertScalarRound, OpSize::i64Bit>}, {OPD(3, 0b01, 0x0C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBLEND, OpSize::i32Bit>}, {OPD(3, 0b01, 0x0D), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBLEND, OpSize::i64Bit>}, {OPD(3, 0b01, 0x0E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VBLEND, OpSize::i16Bit>}, {OPD(3, 0b01, 0x0F), 1, &OpDispatchBuilder::AVX128_VPALIGNR}, {OPD(3, 0b01, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_PExtr, OpSize::i8Bit>}, {OPD(3, 0b01, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_PExtr, OpSize::i16Bit>}, {OPD(3, 0b01, 0x16), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_PExtr, OpSize::i32Bit>}, {OPD(3, 0b01, 0x17), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_PExtr, OpSize::i32Bit>}, {OPD(3, 0b01, 0x18), 1, &OpDispatchBuilder::AVX128_VINSERT}, {OPD(3, 0b01, 0x19), 1, &OpDispatchBuilder::AVX128_VEXTRACT128}, {OPD(3, 0b01, 0x1D), 1, &OpDispatchBuilder::AVX128_VCVTPS2PH}, {OPD(3, 0b01, 0x20), 1, &OpDispatchBuilder::AVX128_VPINSRB}, {OPD(3, 0b01, 0x21), 1, &OpDispatchBuilder::AVX128_VINSERTPS}, {OPD(3, 0b01, 0x22), 1, &OpDispatchBuilder::AVX128_VPINSRDQ}, {OPD(3, 0b01, 0x38), 1, &OpDispatchBuilder::AVX128_VINSERT}, {OPD(3, 0b01, 0x39), 1, &OpDispatchBuilder::AVX128_VEXTRACT128}, {OPD(3, 0b01, 0x40), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VDPP, OpSize::i32Bit>}, {OPD(3, 0b01, 0x41), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VDPP, OpSize::i64Bit>}, {OPD(3, 0b01, 0x42), 1, &OpDispatchBuilder::AVX128_VMPSADBW}, {OPD(3, 0b01, 0x44), 1, &OpDispatchBuilder::AVX128_VPCLMULQDQ}, {OPD(3, 0b01, 0x46), 1, &OpDispatchBuilder::AVX128_VPERM2}, {OPD(3, 0b01, 0x4A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorVariableBlend, OpSize::i32Bit>}, {OPD(3, 0b01, 0x4B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorVariableBlend, OpSize::i64Bit>}, {OPD(3, 0b01, 0x4C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorVariableBlend, OpSize::i8Bit>}, {OPD(3, 0b01, 0x60), 1, &OpDispatchBuilder::AVX128_VPCMPESTRM}, {OPD(3, 0b01, 0x61), 1, 
&OpDispatchBuilder::AVX128_VPCMPESTRI},
  {OPD(3, 0b01, 0x62), 1, &OpDispatchBuilder::AVX128_VPCMPISTRM},
  {OPD(3, 0b01, 0x63), 1, &OpDispatchBuilder::AVX128_VPCMPISTRI},
  {OPD(3, 0b01, 0xDF), 1, &OpDispatchBuilder::AVX128_VAESKeyGenAssist},
};
#undef OPD

// Key encoding for the VEX group table below:
//   bits 4 and up : group, as an offset from X86Tables::TYPE_VEX_GROUP_12
//   bit 3         : pp (the entries below only pass 0 or 1)
//   bits 0-2      : opcode (the entries below pass 3-bit binary literals;
//                   presumably the ModRM.reg opcode extension - TODO confirm)
#define OPD(group, pp, opcode) (((group - X86Tables::TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode))

// Dispatch entries for the instructions in VEX groups 12 through 15.
// Each entry is {OPD key, 1, handler}.
// NOTE(review): the middle field is 1 for every entry here; presumably the
// number of consecutive keys the entry covers - confirm against the
// DispatchTableEntry definition.
// Groups 12, 13 and 14 bind the immediate-shift handler with 16-bit, 32-bit
// and 64-bit element sizes respectively.
constexpr DispatchTableEntry TableGroupOps[] {
  // VPSRLI, 16-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i16Bit, IROps::OP_VUSHRI>},
  // VPSLLI, 16-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i16Bit, IROps::OP_VSHLI>},
  // VPSRAI, 16-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b100), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i16Bit, IROps::OP_VSSHRI>},

  // VPSRLI, 32-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i32Bit, IROps::OP_VUSHRI>},
  // VPSLLI, 32-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i32Bit, IROps::OP_VSHLI>},
  // VPSRAI, 32-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b100), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i32Bit, IROps::OP_VSSHRI>},

  // VPSRLI, 64-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i64Bit, IROps::OP_VUSHRI>},
  // VPSRLDQ, handled by the double-shift implementation with direction RIGHT
  {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b011), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ShiftDoubleImm, OpDispatchBuilder::ShiftDirection::RIGHT>},
  // VPSLLI, 64-bit elements
  {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorShiftImmImpl, OpSize::i64Bit, IROps::OP_VSHLI>},
  // VPSLLDQ, handled by the double-shift implementation with direction LEFT
  {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b111), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_ShiftDoubleImm, OpDispatchBuilder::ShiftDirection::LEFT>},

  // LDMXCSR / STMXCSR: use the regular implementation (the handlers without
  // the AVX128_ prefix). It just happens to be in the VEX table.
  {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b010), 1, &OpDispatchBuilder::LDMXCSR},
  {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b011), 1, &OpDispatchBuilder::STMXCSR},
};
#undef OPD
}

namespace AVX256 {
using namespace IR;
// Key encoding for the table below:
//   bits 10 and up : map_select - 1
//   bits 8-9       : pp (entries pass 2-bit binary literals)
//   bits 0-7       : opcode (entries pass values up to two hex digits)
#define OPD(map_select, pp, opcode) (((map_select - 1) << 10) | (pp << 8) | (opcode))
constexpr DispatchTableEntry BaseTable[] = {
  {OPD(1, 0b00, 0x10), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
  {OPD(1, 0b01, 0x10), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
  {OPD(1, 0b10, 0x10), 1, &OpDispatchBuilder::VMOVSSOp},
  {OPD(1, 0b11, 0x10), 1, &OpDispatchBuilder::VMOVSDOp},
  {OPD(1, 0b00, 0x11), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
  {OPD(1, 0b01, 0x11), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp},
  {OPD(1, 0b10, 0x11), 1, &OpDispatchBuilder::VMOVSSOp},
  {OPD(1, 0b11, 0x11), 1, &OpDispatchBuilder::VMOVSDOp},
  {OPD(1, 0b00, 0x12), 1, &OpDispatchBuilder::VMOVLPOp},
  {OPD(1, 0b01, 0x12), 1, &OpDispatchBuilder::VMOVLPOp},
  {OPD(1, 0b10, 0x12), 1, &OpDispatchBuilder::VMOVSLDUPOp},
  {OPD(1, 0b11, 0x12), 1, &OpDispatchBuilder::VMOVDDUPOp},
  {OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::VMOVLPOp},
  {OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::VMOVLPOp},
  {OPD(1, 0b00, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i32Bit>},
  {OPD(1, 0b01, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i64Bit>},
  {OPD(1, 0b00, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i32Bit>},
  {OPD(1, 0b01, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i64Bit>},
  {OPD(1, 0b00, 0x16), 1, &OpDispatchBuilder::VMOVHPOp},
  {OPD(1, 0b01, 0x16), 1, &OpDispatchBuilder::VMOVHPOp},
  {OPD(1, 0b10, 0x16), 1, &OpDispatchBuilder::VMOVSHDUPOp},
  {OPD(1, 0b00, 0x17), 1, &OpDispatchBuilder::VMOVHPOp},
  {OPD(1, 0b01, 0x17), 1,
&OpDispatchBuilder::VMOVHPOp}, {OPD(1, 0b00, 0x28), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b01, 0x28), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b00, 0x29), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b01, 0x29), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b10, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR}, {OPD(1, 0b11, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR}, {OPD(1, 0b00, 0x2B), 1, &OpDispatchBuilder::MOVVectorNTOp}, {OPD(1, 0b01, 0x2B), 1, &OpDispatchBuilder::MOVVectorNTOp}, {OPD(1, 0b10, 0x2C), 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {OPD(1, 0b11, 0x2C), 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {OPD(1, 0b10, 0x2D), 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {OPD(1, 0b11, 0x2D), 1, &OpDispatchBuilder::CVTFPR_To_GPR}, {OPD(1, 0b00, 0x2E), 1, &OpDispatchBuilder::UCOMISxOp}, {OPD(1, 0b01, 0x2E), 1, &OpDispatchBuilder::UCOMISxOp}, {OPD(1, 0b00, 0x2F), 1, &OpDispatchBuilder::UCOMISxOp}, {OPD(1, 0b01, 0x2F), 1, &OpDispatchBuilder::UCOMISxOp}, {OPD(1, 0b00, 0x50), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0x50), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i64Bit>}, {OPD(1, 0b00, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VFSQRT, OpSize::i32Bit>}, {OPD(1, 0b01, 0x51), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VFSQRT, OpSize::i64Bit>}, {OPD(1, 0b10, 0x51), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, {OPD(1, 0b11, 0x51), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, {OPD(1, 0b00, 0x52), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>}, {OPD(1, 0b10, 0x52), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, {OPD(1, 0b00, 0x53), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>}, {OPD(1, 0b10, 0x53), 1, &OpDispatchBuilder::AVXVectorScalarUnaryInsertALUOp}, 
{OPD(1, 0b00, 0x54), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b01, 0x54), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b00, 0x55), 1, &OpDispatchBuilder::VANDNOp}, {OPD(1, 0b01, 0x55), 1, &OpDispatchBuilder::VANDNOp}, {OPD(1, 0b00, 0x56), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b01, 0x56), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b00, 0x57), 1, &OpDispatchBuilder::AVXVectorXOROp}, {OPD(1, 0b01, 0x57), 1, &OpDispatchBuilder::AVXVectorXOROp}, {OPD(1, 0b00, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFADD, OpSize::i32Bit>}, {OPD(1, 0b01, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFADD, OpSize::i64Bit>}, {OPD(1, 0b10, 0x58), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x58), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b00, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMUL, OpSize::i32Bit>}, {OPD(1, 0b01, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMUL, OpSize::i64Bit>}, {OPD(1, 0b10, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x59), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b00, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Vector_CVT_Float_To_Float, OpSize::i64Bit, OpSize::i32Bit, true>}, {OPD(1, 0b01, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Vector_CVT_Float_To_Float, OpSize::i32Bit, OpSize::i64Bit, true>}, {OPD(1, 0b10, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float}, {OPD(1, 0b11, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float}, {OPD(1, 0b00, 0x5B), 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, {OPD(1, 0b01, 0x5B), 1, 
&OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b00, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFSUB, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5C), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x5C), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b00, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMIN, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMIN, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5D), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x5D), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b00, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFDIV, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFDIV, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5E), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x5E), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b00, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMAX, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFMAX, OpSize::i64Bit>}, {OPD(1, 0b10, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b11, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp}, {OPD(1, 0b01, 0x60), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i8Bit>}, {OPD(1, 0b01, 0x61), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0x62), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0x63), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VPACKSSOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0x64), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPGT, OpSize::i8Bit>}, {OPD(1, 0b01, 0x65), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPGT, OpSize::i16Bit>}, {OPD(1, 0b01, 0x66), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPGT, OpSize::i32Bit>}, {OPD(1, 0b01, 0x67), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPACKUSOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0x68), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i8Bit>}, {OPD(1, 0b01, 0x69), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0x6A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPACKSSOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKLOp, OpSize::i64Bit>}, {OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPUNPCKHOp, OpSize::i64Bit>}, {OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>}, {OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b10, 0x6F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp}, {OPD(1, 0b01, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSHUFWOp, OpSize::i32Bit, true>}, {OPD(1, 0b10, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSHUFWOp, OpSize::i16Bit, false>}, {OPD(1, 0b11, 0x70), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSHUFWOp, OpSize::i16Bit, true>}, {OPD(1, 0b01, 0x74), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPEQ, OpSize::i8Bit>}, {OPD(1, 0b01, 0x75), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPEQ, OpSize::i16Bit>}, {OPD(1, 0b01, 0x76), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPEQ, OpSize::i32Bit>}, {OPD(1, 0b00, 0x77), 1, &OpDispatchBuilder::VZEROOp}, {OPD(1, 0b01, 0x7C), 1, &OpDispatchBuilder::VHADDPOp}, {OPD(1, 0b11, 0x7C), 1, &OpDispatchBuilder::VHADDPOp}, {OPD(1, 0b01, 0x7D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VHSUBPOp, OpSize::i64Bit>}, {OPD(1, 0b11, 0x7D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VHSUBPOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVBetweenGPR_FPR, OpDispatchBuilder::VectorOpType::AVX>}, {OPD(1, 0b10, 0x7E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>}, {OPD(1, 0b01, 0x7F), 1, &OpDispatchBuilder::VMOVAPS_VMOVAPDOp}, {OPD(1, 0b10, 0x7F), 1, &OpDispatchBuilder::VMOVUPS_VMOVUPDOp}, {OPD(1, 0b00, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp}, {OPD(1, 0b01, 0xC2), 1, &OpDispatchBuilder::AVXVFCMPOp}, {OPD(1, 0b10, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp}, {OPD(1, 0b11, 0xC2), 1, &OpDispatchBuilder::AVXInsertScalarFCMPOp}, {OPD(1, 0b01, 0xC4), 1, &OpDispatchBuilder::VPINSRWOp}, {OPD(1, 0b01, 0xC5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i16Bit>}, {OPD(1, 0b00, 0xC6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VSHUFOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0xC6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VSHUFOp, OpSize::i64Bit>}, {OPD(1, 0b01, 0xD0), 1, &OpDispatchBuilder::VADDSUBPOp}, {OPD(1, 0b11, 0xD0), 1, &OpDispatchBuilder::VADDSUBPOp}, {OPD(1, 0b01, 0xD1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLDOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0xD2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLDOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0xD3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLDOp, OpSize::i64Bit>}, {OPD(1, 0b01, 0xD4), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, OpSize::i64Bit>}, {OPD(1, 0b01, 0xD5), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VMUL, OpSize::i16Bit>}, {OPD(1, 0b01, 0xD6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVQOp, OpDispatchBuilder::VectorOpType::AVX>}, {OPD(1, 0b01, 0xD7), 1, &OpDispatchBuilder::MOVMSKOpOne}, {OPD(1, 0b01, 0xD8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xD9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xDA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMIN, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VAND, OpSize::i128Bit>}, {OPD(1, 0b01, 0xDC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQADD, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUQADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xDE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMAX, OpSize::i8Bit>}, {OPD(1, 0b01, 0xDF), 1, &OpDispatchBuilder::VANDNOp}, {OPD(1, 0b01, 0xE0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VURAVG, OpSize::i8Bit>}, {OPD(1, 0b01, 0xE1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRAOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0xE2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRAOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0xE3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VURAVG, OpSize::i16Bit>}, {OPD(1, 0b01, 0xE4), 1, &OpDispatchBuilder::VPMULHWOp}, {OPD(1, 0b01, 0xE5), 1, &OpDispatchBuilder::VPMULHWOp}, {OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b10, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b01, 0xE7), 1, &OpDispatchBuilder::MOVVectorNTOp}, {OPD(1, 0b01, 0xE8), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSQSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSQSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMIN, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VOR, OpSize::i128Bit>}, {OPD(1, 0b01, 0xEC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSQADD, OpSize::i8Bit>}, {OPD(1, 0b01, 0xED), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSQADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMAX, OpSize::i16Bit>}, {OPD(1, 0b01, 0xEF), 1, &OpDispatchBuilder::AVXVectorXOROp}, {OPD(1, 0b11, 0xF0), 1, &OpDispatchBuilder::MOVVectorUnalignedOp}, {OPD(1, 0b01, 0xF1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLOp, OpSize::i16Bit>}, {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLOp, OpSize::i32Bit>}, {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLOp, OpSize::i64Bit>}, {OPD(1, 0b01, 0xF4), 1, &OpDispatchBuilder::VPMULLOp}, {OPD(1, 0b01, 0xF5), 1, &OpDispatchBuilder::VPMADDWDOp}, {OPD(1, 0b01, 0xF6), 1, &OpDispatchBuilder::VPSADBWOp}, {OPD(1, 0b01, 0xF7), 1, &OpDispatchBuilder::MASKMOVOp}, {OPD(1, 0b01, 0xF8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSUB, OpSize::i8Bit>}, {OPD(1, 0b01, 0xF9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSUB, OpSize::i16Bit>}, {OPD(1, 0b01, 0xFA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0xFB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSUB, OpSize::i64Bit>}, {OPD(1, 0b01, 0xFC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, 
OpSize::i8Bit>}, {OPD(1, 0b01, 0xFD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, OpSize::i16Bit>}, {OPD(1, 0b01, 0xFE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VADD, OpSize::i32Bit>}, /* Entries whose first OPD argument (map_select) is 2 start here. */ {OPD(2, 0b01, 0x00), 1, &OpDispatchBuilder::VPSHUFBOp}, {OPD(2, 0b01, 0x01), 1, &OpDispatchBuilder::VHADDPOp}, {OPD(2, 0b01, 0x02), 1, &OpDispatchBuilder::VHADDPOp}, {OPD(2, 0b01, 0x03), 1, &OpDispatchBuilder::VPHADDSWOp}, {OPD(2, 0b01, 0x04), 1, &OpDispatchBuilder::VPMADDUBSWOp}, {OPD(2, 0b01, 0x05), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPHSUBOp, OpSize::i16Bit>}, {OPD(2, 0b01, 0x06), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPHSUBOp, OpSize::i32Bit>}, {OPD(2, 0b01, 0x07), 1, &OpDispatchBuilder::VPHSUBSWOp}, {OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::VPSIGN}, {OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::VPSIGN}, {OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::VPSIGN}, {OPD(2, 0b01, 0x0B), 1, &OpDispatchBuilder::VPMULHRSWOp}, {OPD(2, 0b01, 0x0C), 1, &OpDispatchBuilder::VPERMILRegOp}, {OPD(2, 0b01, 0x0D), 1, &OpDispatchBuilder::VPERMILRegOp}, {OPD(2, 0b01, 0x0E), 1, &OpDispatchBuilder::VTESTPOp}, {OPD(2, 0b01, 0x0F), 1, &OpDispatchBuilder::VTESTPOp}, {OPD(2, 0b01, 0x13), 1, &OpDispatchBuilder::VCVTPH2PSOp}, {OPD(2, 0b01, 0x16), 1, &OpDispatchBuilder::VPERMDOp}, {OPD(2, 0b01, 0x17), 1, &OpDispatchBuilder::PTestOp}, {OPD(2, 0b01, 0x18), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i32Bit>}, {OPD(2, 0b01, 0x19), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i64Bit>}, {OPD(2, 0b01, 0x1A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i128Bit>}, {OPD(2, 0b01, 0x1C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VABS, OpSize::i8Bit>}, {OPD(2, 0b01, 0x1D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VABS, OpSize::i16Bit>}, {OPD(2, 0b01, 0x1E), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorUnaryOp, IR::OP_VABS, OpSize::i32Bit>}, {OPD(2, 0b01, 0x20), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x21), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x22), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x23), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x24), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x25), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x28), 1, &OpDispatchBuilder::VPMULLOp}, {OPD(2, 0b01, 0x29), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPEQ, OpSize::i64Bit>}, {OPD(2, 0b01, 0x2A), 1, &OpDispatchBuilder::MOVVectorNTOp}, {OPD(2, 0b01, 0x2B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPACKUSOp, OpSize::i32Bit>}, {OPD(2, 0b01, 0x2C), 1, &OpDispatchBuilder::VMASKMOVOp}, {OPD(2, 0b01, 0x2D), 1, &OpDispatchBuilder::VMASKMOVOp}, {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::VMASKMOVOp}, {OPD(2, 0b01, 0x2F), 1, &OpDispatchBuilder::VMASKMOVOp}, {OPD(2, 0b01, 0x30), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x31), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x32), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x33), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x34), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x35), 1, &OpDispatchBuilder::ExtendVectorElements}, {OPD(2, 0b01, 0x36), 1, &OpDispatchBuilder::VPERMDOp}, {OPD(2, 0b01, 0x37), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VCMPGT, OpSize::i64Bit>}, {OPD(2, 0b01, 0x38), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMIN, OpSize::i8Bit>}, {OPD(2, 0b01, 0x39), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMIN, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMIN, OpSize::i16Bit>}, {OPD(2, 0b01, 
0x3B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMIN, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMAX, OpSize::i8Bit>}, {OPD(2, 0b01, 0x3D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VSMAX, OpSize::i32Bit>}, {OPD(2, 0b01, 0x3E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMAX, OpSize::i16Bit>}, {OPD(2, 0b01, 0x3F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VUMAX, OpSize::i32Bit>}, {OPD(2, 0b01, 0x40), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VMUL, OpSize::i32Bit>}, {OPD(2, 0b01, 0x41), 1, &OpDispatchBuilder::PHMINPOSUWOp}, {OPD(2, 0b01, 0x45), 1, &OpDispatchBuilder::VPSRLVOp}, {OPD(2, 0b01, 0x46), 1, &OpDispatchBuilder::VPSRAVDOp}, {OPD(2, 0b01, 0x47), 1, &OpDispatchBuilder::VPSLLVOp}, {OPD(2, 0b01, 0x58), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i32Bit>}, {OPD(2, 0b01, 0x59), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i64Bit>}, {OPD(2, 0b01, 0x5A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i128Bit>}, {OPD(2, 0b01, 0x78), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i8Bit>}, {OPD(2, 0b01, 0x79), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VBROADCASTOp, OpSize::i16Bit>}, {OPD(2, 0b01, 0x8C), 1, &OpDispatchBuilder::VPMASKMOVOp}, {OPD(2, 0b01, 0x8E), 1, &OpDispatchBuilder::VPMASKMOVOp}, {OPD(2, 0b01, 0x90), 1, &OpDispatchBuilder::VPGATHER}, {OPD(2, 0b01, 0x91), 1, &OpDispatchBuilder::VPGATHER}, {OPD(2, 0b01, 0x92), 1, &OpDispatchBuilder::VPGATHER}, {OPD(2, 0b01, 0x93), 1, &OpDispatchBuilder::VPGATHER}, /* FMA entries. The three trailing integer template arguments are 1, 3, 2 for opcodes 0x96-0x9F, 2, 1, 3 for 0xA6-0xAF and 2, 3, 1 for 0xB6-0xBF; presumably they select operand order for the 132/213/231 encodings -- confirm against VFMAImpl/VFMAddSubImpl. For VFMAImpl the bool argument is false on even opcodes and true on odd ones; presumably packed versus scalar -- TODO confirm. */ {OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, true, 1, 3, 2>}, // VFMADDSUB {OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, false, 1, 3, 2>}, // VFMSUBADD {OPD(2, 0b01, 
0x98), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, false, 1, 3, 2>}, // VFMADD {OPD(2, 0b01, 0x99), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, true, 1, 3, 2>}, // VFMADD {OPD(2, 0b01, 0x9A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, false, 1, 3, 2>}, // VFMSUB {OPD(2, 0b01, 0x9B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, true, 1, 3, 2>}, // VFMSUB {OPD(2, 0b01, 0x9C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, false, 1, 3, 2>}, // VFNMADD {OPD(2, 0b01, 0x9D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, true, 1, 3, 2>}, // VFNMADD {OPD(2, 0b01, 0x9E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, false, 1, 3, 2>}, // VFNMSUB {OPD(2, 0b01, 0x9F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, true, 1, 3, 2>}, // VFNMSUB {OPD(2, 0b01, 0xA8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, false, 2, 1, 3>}, // VFMADD {OPD(2, 0b01, 0xA9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, true, 2, 1, 3>}, // VFMADD {OPD(2, 0b01, 0xAA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, false, 2, 1, 3>}, // VFMSUB {OPD(2, 0b01, 0xAB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, true, 2, 1, 3>}, // VFMSUB {OPD(2, 0b01, 0xAC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, false, 2, 1, 3>}, // VFNMADD {OPD(2, 0b01, 0xAD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, true, 2, 1, 3>}, // VFNMADD {OPD(2, 0b01, 0xAE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, false, 2, 1, 3>}, // VFNMSUB {OPD(2, 0b01, 0xAF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, true, 2, 1, 3>}, // VFNMSUB {OPD(2, 0b01, 0xB8), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, false, 2, 3, 1>}, // VFMADD {OPD(2, 0b01, 0xB9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLA, true, 2, 3, 1>}, // VFMADD {OPD(2, 0b01, 0xBA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, false, 2, 3, 1>}, // VFMSUB {OPD(2, 0b01, 0xBB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFMLS, true, 2, 3, 1>}, // VFMSUB {OPD(2, 0b01, 0xBC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, false, 2, 3, 1>}, // VFNMADD {OPD(2, 0b01, 0xBD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLA, true, 2, 3, 1>}, // VFNMADD {OPD(2, 0b01, 0xBE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, false, 2, 3, 1>}, // VFNMSUB {OPD(2, 0b01, 0xBF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAImpl, IR::OP_VFNMLS, true, 2, 3, 1>}, // VFNMSUB /* NOTE(review): the 0xA6/0xA7/0xB6/0xB7 entries are listed after 0xBF instead of in numeric opcode order; confirm that the code consuming this table does not require sorted keys. */ {OPD(2, 0b01, 0xA6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, true, 2, 1, 3>}, // VFMADDSUB {OPD(2, 0b01, 0xA7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, false, 2, 1, 3>}, // VFMSUBADD {OPD(2, 0b01, 0xB6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, true, 2, 3, 1>}, // VFMADDSUB {OPD(2, 0b01, 0xB7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VFMAddSubImpl, false, 2, 3, 1>}, // VFMSUBADD {OPD(2, 0b01, 0xDB), 1, &OpDispatchBuilder::AESImcOp}, {OPD(2, 0b01, 0xDC), 1, &OpDispatchBuilder::VAESEncOp}, {OPD(2, 0b01, 0xDD), 1, &OpDispatchBuilder::VAESEncLastOp}, {OPD(2, 0b01, 0xDE), 1, &OpDispatchBuilder::VAESDecOp}, {OPD(2, 0b01, 0xDF), 1, &OpDispatchBuilder::VAESDecLastOp}, /* Entries whose first OPD argument (map_select) is 3 start here. */ {OPD(3, 0b01, 0x00), 1, &OpDispatchBuilder::VPERMQOp}, {OPD(3, 0b01, 0x01), 1, &OpDispatchBuilder::VPERMQOp}, {OPD(3, 0b01, 0x02), 1, &OpDispatchBuilder::VPBLENDDOp}, {OPD(3, 0b01, 0x04), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPERMILImmOp, OpSize::i32Bit>}, {OPD(3, 0b01, 0x05), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::VPERMILImmOp, OpSize::i64Bit>}, {OPD(3, 0b01, 0x06), 1, &OpDispatchBuilder::VPERM2Op}, {OPD(3, 0b01, 0x08), 1, &OpDispatchBuilder::AVXVectorRound}, {OPD(3, 0b01, 0x09), 1, &OpDispatchBuilder::AVXVectorRound}, {OPD(3, 0b01, 0x0A), 1, &OpDispatchBuilder::AVXInsertScalarRound}, {OPD(3, 0b01, 0x0B), 1, &OpDispatchBuilder::AVXInsertScalarRound}, {OPD(3, 0b01, 0x0C), 1, &OpDispatchBuilder::VPBLENDDOp}, {OPD(3, 0b01, 0x0D), 1, &OpDispatchBuilder::VBLENDPDOp}, {OPD(3, 0b01, 0x0E), 1, &OpDispatchBuilder::VPBLENDWOp}, {OPD(3, 0b01, 0x0F), 1, &OpDispatchBuilder::VPALIGNROp}, {OPD(3, 0b01, 0x14), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i8Bit>}, {OPD(3, 0b01, 0x15), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i16Bit>}, {OPD(3, 0b01, 0x16), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i32Bit>}, {OPD(3, 0b01, 0x17), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::PExtrOp, OpSize::i32Bit>}, {OPD(3, 0b01, 0x18), 1, &OpDispatchBuilder::VINSERTOp}, {OPD(3, 0b01, 0x19), 1, &OpDispatchBuilder::VEXTRACT128Op}, {OPD(3, 0b01, 0x1D), 1, &OpDispatchBuilder::VCVTPS2PHOp}, {OPD(3, 0b01, 0x20), 1, &OpDispatchBuilder::VPINSRBOp}, {OPD(3, 0b01, 0x21), 1, &OpDispatchBuilder::VINSERTPSOp}, {OPD(3, 0b01, 0x22), 1, &OpDispatchBuilder::VPINSRDQOp}, {OPD(3, 0b01, 0x38), 1, &OpDispatchBuilder::VINSERTOp}, {OPD(3, 0b01, 0x39), 1, &OpDispatchBuilder::VEXTRACT128Op}, {OPD(3, 0b01, 0x40), 1, &OpDispatchBuilder::VDPPOp}, {OPD(3, 0b01, 0x41), 1, &OpDispatchBuilder::VDPPOp}, {OPD(3, 0b01, 0x42), 1, &OpDispatchBuilder::VMPSADBWOp}, {OPD(3, 0b01, 0x44), 1, &OpDispatchBuilder::VPCLMULQDQOp}, {OPD(3, 0b01, 0x46), 1, &OpDispatchBuilder::VPERM2Op}, {OPD(3, 0b01, 0x4A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorVariableBlend, OpSize::i32Bit>}, {OPD(3, 0b01, 0x4B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorVariableBlend, OpSize::i64Bit>}, {OPD(3, 0b01, 0x4C), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorVariableBlend, OpSize::i8Bit>}, {OPD(3, 0b01, 0x60), 1, &OpDispatchBuilder::VPCMPESTRMOp}, {OPD(3, 0b01, 0x61), 1, &OpDispatchBuilder::VPCMPESTRIOp}, {OPD(3, 0b01, 0x62), 1, &OpDispatchBuilder::VPCMPISTRMOp}, {OPD(3, 0b01, 0x63), 1, &OpDispatchBuilder::VPCMPISTRIOp}, {OPD(3, 0b01, 0xDF), 1, &OpDispatchBuilder::AESKeyGenAssist}, }; #undef OPD /* Key for the group-op table that follows: (group - TYPE_VEX_GROUP_12) shifted left by 4, pp shifted left by 3, opcode OR'd into the low bits. The macro does no masking; the entries below only pass pp values of 0 or 1 and three-bit opcode values, so the fields do not overlap. */ #define OPD(group, pp, opcode) (((group - X86Tables::TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode)) /* Dispatch entries for VEX groups 12 through 15. Groups 12-14 bind VPSRLIOp/VPSLLIOp/VPSRAIOp with 16/32/64-bit element sizes (group 14 has no VPSRAIOp entry) plus VPSRLDQOp/VPSLLDQOp; group 15 binds LDMXCSR/STMXCSR. The middle field is 1 in every entry; its meaning is defined by DispatchTableEntry, which is not visible in this chunk. */ constexpr DispatchTableEntry TableGroupOps[] { {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLIOp, OpSize::i16Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLIOp, OpSize::i16Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_12, 1, 0b100), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRAIOp, OpSize::i16Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLIOp, OpSize::i32Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLIOp, OpSize::i32Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_13, 1, 0b100), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRAIOp, OpSize::i32Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b010), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSRLIOp, OpSize::i64Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b011), 1, &OpDispatchBuilder::VPSRLDQOp}, {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b110), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VPSLLIOp, OpSize::i64Bit>}, {OPD(X86Tables::TYPE_VEX_GROUP_14, 1, 0b111), 1, &OpDispatchBuilder::VPSLLDQOp}, {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b010), 1, &OpDispatchBuilder::LDMXCSR}, {OPD(X86Tables::TYPE_VEX_GROUP_15, 0, 0b011), 1, &OpDispatchBuilder::STMXCSR}, }; #undef OPD } auto BaseTableLambda = [](const auto RuntimeTable) consteval { std::array Table{}; #define OPD(map_select, pp, opcode) (((map_select - 1) << 
10) | (pp << 8) | (opcode)) constexpr U16U8InfoStruct VEXTable[] = { // Map 0 (Reserved) // VEX Map 1 {OPD(1, 0b00, 0x10), 1, X86InstInfo{"VMOVUPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x10), 1, X86InstInfo{"VMOVUPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x10), 1, X86InstInfo{"VMOVSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x10), 1, X86InstInfo{"VMOVSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x11), 1, X86InstInfo{"VMOVUPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x11), 1, X86InstInfo{"VMOVUPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x11), 1, X86InstInfo{"VMOVSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x11), 1, X86InstInfo{"VMOVSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x12), 1, X86InstInfo{"VMOVLPS",TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b01, 0x12), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b10, 0x12), 1, X86InstInfo{"VMOVSLDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0x12), 1, X86InstInfo{"VMOVDDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x13), 1, X86InstInfo{"VMOVLPS", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b01, 0x13), 1, X86InstInfo{"VMOVLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b00, 0x14), 1, X86InstInfo{"VUNPCKLPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x14), 1, X86InstInfo{"VUNPCKLPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x15), 1, X86InstInfo{"VUNPCKHPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x15), 1, X86InstInfo{"VUNPCKHPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x16), 1, X86InstInfo{"VMOV(L)HPS",TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b01, 0x16), 1, X86InstInfo{"VMOVHPD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_1ST_SRC | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b10, 0x16), 1, X86InstInfo{"VMOVSHDUP", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x17), 1, X86InstInfo{"VMOVHPS", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b01, 0x17), 1, X86InstInfo{"VMOVHPD", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b00, 0x50), 1, X86InstInfo{"VMOVMSKPS", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {OPD(1, 0b01, 0x50), 
1, X86InstInfo{"VMOVMSKPD", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR, 0}}, {OPD(1, 0b00, 0x51), 1, X86InstInfo{"VSQRTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x51), 1, X86InstInfo{"VSQRTPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x51), 1, X86InstInfo{"VSQRTSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x51), 1, X86InstInfo{"VSQRTSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x52), 1, X86InstInfo{"VRSQRTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x52), 1, X86InstInfo{"VRSQRTSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x53), 1, X86InstInfo{"VRCPPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x53), 1, X86InstInfo{"VRCPSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x54), 1, X86InstInfo{"VANDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x54), 1, X86InstInfo{"VANDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x55), 1, X86InstInfo{"VANDNPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x55), 1, X86InstInfo{"VANDNPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x56), 1, X86InstInfo{"VORPS", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x56), 1, X86InstInfo{"VORPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x57), 1, X86InstInfo{"VXORPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x57), 1, X86InstInfo{"VXORPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x60), 1, X86InstInfo{"VPUNPCKLBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x61), 1, X86InstInfo{"VPUNPCKLWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x62), 1, X86InstInfo{"VPUNPCKLDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x63), 1, X86InstInfo{"VPACKSSWB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x64), 1, X86InstInfo{"VPCMPGTB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x65), 1, X86InstInfo{"VPCMPGTW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x66), 1, X86InstInfo{"VPCMPGTD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x67), 1, X86InstInfo{"VPACKUSWB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x70), 1, X86InstInfo{"VPSHUFD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b10, 0x70), 1, X86InstInfo{"VPSHUFHW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b11, 0x70), 1, 
X86InstInfo{"VPSHUFLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b01, 0x71), 1, X86InstInfo{"", TYPE_VEX_GROUP_12, FLAGS_NONE, 0}}, // VEX Group 12 {OPD(1, 0b01, 0x72), 1, X86InstInfo{"", TYPE_VEX_GROUP_13, FLAGS_NONE, 0}}, // VEX Group 13 {OPD(1, 0b01, 0x73), 1, X86InstInfo{"", TYPE_VEX_GROUP_14, FLAGS_NONE, 0}}, // VEX Group 14 {OPD(1, 0b01, 0x74), 1, X86InstInfo{"VPCMPEQB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x75), 1, X86InstInfo{"VPCMPEQW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x76), 1, X86InstInfo{"VPCMPEQD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x77), 1, X86InstInfo{"VZERO*", TYPE_INST, GenFlagsDstSize(SIZE_128BIT), 0}}, {OPD(1, 0b00, 0xC2), 1, X86InstInfo{"VCMPccPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b01, 0xC2), 1, X86InstInfo{"VCMPccPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b10, 0xC2), 1, X86InstInfo{"VCMPccSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_L_IGNORE | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b11, 0xC2), 1, X86InstInfo{"VCMPccSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_L_IGNORE | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b01, 0xC4), 1, X86InstInfo{"VPINSRW", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_SF_SRC_GPR | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(1, 0b01, 0xC5), 1, X86InstInfo{"VPEXTRW", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(1, 0b00, 0xC6), 1, X86InstInfo{"VSHUFPS", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(1, 0b01, 0xC6), 1, X86InstInfo{"VSHUFPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, // The above ops are defined from `Table A-17. VEX Opcode Map 1, Low Nibble = [0h:7h]` of AMD Architecture programmer's manual Volume 3 // This table doesn't state which VEX.pp is for which instruction // XXX: Confirm all the above encoding opcodes {OPD(1, 0b00, 0x28), 1, X86InstInfo{"VMOVAPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x28), 1, X86InstInfo{"VMOVAPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x29), 1, X86InstInfo{"VMOVAPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x29), 1, X86InstInfo{"VMOVAPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x2A), 1, X86InstInfo{"VCVTSI2SS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x2A), 1, X86InstInfo{"VCVTSI2SD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x2B), 1, X86InstInfo{"VMOVNTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x2B), 1, X86InstInfo{"VMOVNTPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x2C), 1, X86InstInfo{"VCVTTSS2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x2C), 1, X86InstInfo{"VCVTTSD2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | 
FLAGS_SF_DST_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b10, 0x2D), 1, X86InstInfo{"VCVTSS2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x2D), 1, X86InstInfo{"VCVTSD2SI", TYPE_INST, FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x2E), 1, X86InstInfo{"VUCOMISS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b01, 0x2E), 1, X86InstInfo{"VUCOMISD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x2F), 1, X86InstInfo{"VCOMISS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b01, 0x2F), 1, X86InstInfo{"VCOMISD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x58), 1, X86InstInfo{"VADDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x58), 1, X86InstInfo{"VADDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x58), 1, X86InstInfo{"VADDSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x58), 1, X86InstInfo{"VADDSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x59), 1, X86InstInfo{"VMULPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x59), 1, X86InstInfo{"VMULPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x59), 1, X86InstInfo{"VMULSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | 
FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x59), 1, X86InstInfo{"VMULSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x5A), 1, X86InstInfo{"VCVTPS2PD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5A), 1, X86InstInfo{"VCVTPD2PS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5A), 1, X86InstInfo{"VCVTSS2SD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_L_IGNORE | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0x5A), 1, X86InstInfo{"VCVTSD2SS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_L_IGNORE |FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x5B), 1, X86InstInfo{"VCVTDQ2PS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5B), 1, X86InstInfo{"VCVTPS2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5B), 1, X86InstInfo{"VCVTTPS2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0x5C), 1, X86InstInfo{"VSUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5C), 1, X86InstInfo{"VSUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5C), 1, X86InstInfo{"VSUBSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x5C), 1, X86InstInfo{"VSUBSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x5D), 1, X86InstInfo{"VMINPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | 
FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5D), 1, X86InstInfo{"VMINPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5D), 1, X86InstInfo{"VMINSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x5D), 1, X86InstInfo{"VMINSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x5E), 1, X86InstInfo{"VDIVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5E), 1, X86InstInfo{"VDIVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5E), 1, X86InstInfo{"VDIVSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x5E), 1, X86InstInfo{"VDIVSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b00, 0x5F), 1, X86InstInfo{"VMAXPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x5F), 1, X86InstInfo{"VMAXPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x5F), 1, X86InstInfo{"VMAXSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b11, 0x5F), 1, X86InstInfo{"VMAXSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(1, 0b01, 0x68), 1, X86InstInfo{"VPUNPCKHBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x69), 1, X86InstInfo{"VPUNPCKHWD", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x6A), 1, X86InstInfo{"VPUNPCKHDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x6B), 1, X86InstInfo{"VPACKSSDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x6C), 1, X86InstInfo{"VPUNPCKLQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x6D), 1, X86InstInfo{"VPUNPCKHQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x6E), 1, X86InstInfo{"VMOV*", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0 | FLAGS_SF_SRC_GPR, 0}}, {OPD(1, 0b01, 0x6F), 1, X86InstInfo{"VMOVDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x6F), 1, X86InstInfo{"VMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x7C), 1, X86InstInfo{"VHADDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0x7C), 1, X86InstInfo{"VHADDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x7D), 1, X86InstInfo{"VHSUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0x7D), 1, X86InstInfo{"VHSUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x7E), 1, X86InstInfo{"VMOV*", TYPE_INST, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_VEX_L_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x7E), 1, X86InstInfo{"VMOVQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0x7F), 1, 
X86InstInfo{"VMOVDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0x7F), 1, X86InstInfo{"VMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b00, 0xAE), 1, X86InstInfo{"", TYPE_VEX_GROUP_15, FLAGS_NONE, 0}}, // VEX Group 15 {OPD(1, 0b01, 0xAE), 1, X86InstInfo{"", TYPE_VEX_GROUP_15, FLAGS_NONE, 0}}, // VEX Group 15 {OPD(1, 0b10, 0xAE), 1, X86InstInfo{"", TYPE_VEX_GROUP_15, FLAGS_NONE, 0}}, // VEX Group 15 {OPD(1, 0b11, 0xAE), 1, X86InstInfo{"", TYPE_VEX_GROUP_15, FLAGS_NONE, 0}}, // VEX Group 15 {OPD(1, 0b01, 0xD0), 1, X86InstInfo{"VADDSUBPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0xD0), 1, X86InstInfo{"VADDSUBPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD1), 1, X86InstInfo{"VPSRLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD2), 1, X86InstInfo{"VPSRLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD3), 1, X86InstInfo{"VPSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD4), 1, X86InstInfo{"VPADDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD5), 1, X86InstInfo{"VPMULLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD6), 1, X86InstInfo{"VMOVQ", TYPE_INST, GenFlagsSameSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_L_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD7), 1, X86InstInfo{"VPMOVMSKB", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_SF_DST_GPR | FLAGS_SF_MOD_REG_ONLY, 0}}, 
{OPD(1, 0b01, 0xD8), 1, X86InstInfo{"VPSUBUSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xD9), 1, X86InstInfo{"VPSUBUSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDA), 1, X86InstInfo{"VPMINUB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDB), 1, X86InstInfo{"VPAND", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDC), 1, X86InstInfo{"VPADDUSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDD), 1, X86InstInfo{"VPADDUSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDE), 1, X86InstInfo{"VPMAXUB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xDF), 1, X86InstInfo{"VPANDN", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE0), 1, X86InstInfo{"VPAVGB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE1), 1, X86InstInfo{"VPSRAW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE2), 1, X86InstInfo{"VPSRAD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE3), 1, X86InstInfo{"VPAVGW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE4), 1, X86InstInfo{"VPMULHUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE5), 1, X86InstInfo{"VPMULHW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | 
FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE6), 1, X86InstInfo{"VCVTTPD2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b10, 0xE6), 1, X86InstInfo{"VCVTDQ2PD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0xE6), 1, X86InstInfo{"VCVTPD2DQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE7), 1, X86InstInfo{"VMOVNTDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE8), 1, X86InstInfo{"VPSUBSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xE9), 1, X86InstInfo{"VPSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xEA), 1, X86InstInfo{"VPMINSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xEB), 1, X86InstInfo{"VPOR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xEC), 1, X86InstInfo{"VPADDSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xED), 1, X86InstInfo{"VPADDSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xEE), 1, X86InstInfo{"VPMAXSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xEF), 1, X86InstInfo{"VPXOR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b11, 0xF0), 1, X86InstInfo{"VLDDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF1), 1, X86InstInfo{"VPSLLW", TYPE_INST, 
GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF2), 1, X86InstInfo{"VPSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF3), 1, X86InstInfo{"VPSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF4), 1, X86InstInfo{"VPMULUDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF5), 1, X86InstInfo{"VPMADDWD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF6), 1, X86InstInfo{"VPSADBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF7), 1, X86InstInfo{"VMASKMOVDQU", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(1, 0b01, 0xF8), 1, X86InstInfo{"VPSUBB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xF9), 1, X86InstInfo{"VPSUBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xFA), 1, X86InstInfo{"VPSUBD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xFB), 1, X86InstInfo{"VPSUBQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xFC), 1, X86InstInfo{"VPADDB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xFD), 1, X86InstInfo{"VPADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(1, 0b01, 0xFE), 1, X86InstInfo{"VPADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 
0}}, // VEX Map 2 {OPD(2, 0b01, 0x00), 1, X86InstInfo{"VPSHUFB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x01), 1, X86InstInfo{"VPHADDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x02), 1, X86InstInfo{"VPHADDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x03), 1, X86InstInfo{"VPHADDSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x04), 1, X86InstInfo{"VPMADDUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x05), 1, X86InstInfo{"VPHSUBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x06), 1, X86InstInfo{"VPHSUBD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x07), 1, X86InstInfo{"VPHSUBSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x08), 1, X86InstInfo{"VPSIGNB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x09), 1, X86InstInfo{"VPSIGNW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0A), 1, X86InstInfo{"VPSIGND", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0B), 1, X86InstInfo{"VPMULHRSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0C), 1, X86InstInfo{"VPERMILPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0D), 1, X86InstInfo{"VPERMILPD", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0E), 1, X86InstInfo{"VTESTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x0F), 1, X86InstInfo{"VTESTPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x13), 1, X86InstInfo{"VCVTPH2PS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x16), 1, X86InstInfo{"VPERMPS", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x17), 1, X86InstInfo{"VPTEST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x18), 1, X86InstInfo{"VBROADCASTSS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x19), 1, X86InstInfo{"VBROADCASTSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x1A), 1, X86InstInfo{"VBROADCASTF128", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_SF_MOD_MEM_ONLY | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x1C), 1, X86InstInfo{"VPABSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x1D), 1, X86InstInfo{"VPABSW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x1E), 1, X86InstInfo{"VPABSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x20), 1, X86InstInfo{"VPMOVSXBW", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x21), 1, X86InstInfo{"VPMOVSXBD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 
0}}, {OPD(2, 0b01, 0x22), 1, X86InstInfo{"VPMOVSXBQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x23), 1, X86InstInfo{"VPMOVSXWD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x24), 1, X86InstInfo{"VPMOVSXWQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x25), 1, X86InstInfo{"VPMOVSXDQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x28), 1, X86InstInfo{"VPMULDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x29), 1, X86InstInfo{"VPCMPEQQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2A), 1, X86InstInfo{"VMOVNTDQA", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2B), 1, X86InstInfo{"VPACKUSDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2C), 1, X86InstInfo{"VMASKMOVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2D), 1, X86InstInfo{"VMASKMOVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2E), 1, X86InstInfo{"VMASKMOVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x2F), 1, X86InstInfo{"VMASKMOVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x30), 1, X86InstInfo{"VPMOVZXBW", TYPE_INST, 
GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x31), 1, X86InstInfo{"VPMOVZXBD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x32), 1, X86InstInfo{"VPMOVZXBQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_16BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x33), 1, X86InstInfo{"VPMOVZXWD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x34), 1, X86InstInfo{"VPMOVZXWQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x35), 1, X86InstInfo{"VPMOVZXDQ", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x36), 1, X86InstInfo{"VPERMD", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x37), 1, X86InstInfo{"VPCMPGTQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x38), 1, X86InstInfo{"VPMINSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x39), 1, X86InstInfo{"VPMINSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3A), 1, X86InstInfo{"VPMINUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3B), 1, X86InstInfo{"VPMINUD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3C), 1, X86InstInfo{"VPMAXSB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3D), 1, X86InstInfo{"VPMAXSD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3E), 1, X86InstInfo{"VPMAXUW", 
TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x3F), 1, X86InstInfo{"VPMAXUD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x40), 1, X86InstInfo{"VPMULLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x41), 1, X86InstInfo{"VPHMINPOSUW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 0}}, {OPD(2, 0b01, 0x45), 1, X86InstInfo{"VPSRLV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x46), 1, X86InstInfo{"VPSRAVD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x47), 1, X86InstInfo{"VPSLLV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x58), 1, X86InstInfo{"VPBROADCASTD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x59), 1, X86InstInfo{"VPBROADCASTQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x5A), 1, X86InstInfo{"VBROADCASTI128", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_SF_MOD_MEM_ONLY | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x78), 1, X86InstInfo{"VPBROADCASTB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x79), 1, X86InstInfo{"VPBROADCASTW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x8C), 1, X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x8E), 1, 
X86InstInfo{"VPMASKMOV", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_MEM_ONLY | FLAGS_SF_MOD_DST | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x90), 1, X86InstInfo{"VPGATHERDD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x91), 1, X86InstInfo{"VPGATHERQD/Q", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x92), 1, X86InstInfo{"VGATHERDPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x93), 1, X86InstInfo{"VGATHERQPS/D", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_2ND_SRC | FLAGS_VEX_VSIB | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x96), 1, X86InstInfo{"VFMADDSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x97), 1, X86InstInfo{"VFMSUBADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x98), 1, X86InstInfo{"VFMADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x99), 1, X86InstInfo{"VFMADD132_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0x9A), 1, X86InstInfo{"VFMSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x9B), 1, X86InstInfo{"VFMSUB132_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0x9C), 1, X86InstInfo{"VFNMADD132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x9D), 1, 
X86InstInfo{"VFNMADD132_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0x9E), 1, X86InstInfo{"VFNMSUB132", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0x9F), 1, X86InstInfo{"VFNMSUB132_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xA8), 1, X86InstInfo{"VFMADD213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xA9), 1, X86InstInfo{"VFMADD213_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xAA), 1, X86InstInfo{"VFMSUB213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xAB), 1, X86InstInfo{"VFMSUB213_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xAC), 1, X86InstInfo{"VFNMADD213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xAD), 1, X86InstInfo{"VFNMADD213_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xAE), 1, X86InstInfo{"VFNMSUB213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xAF), 1, X86InstInfo{"VFNMSUB213_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xB8), 1, X86InstInfo{"VFMADD231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xB9), 1, 
X86InstInfo{"VFMADD231_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xBA), 1, X86InstInfo{"VFMSUB231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xBB), 1, X86InstInfo{"VFMSUB231_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xBC), 1, X86InstInfo{"VFNMADD231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xBD), 1, X86InstInfo{"VFNMADD231_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xBE), 1, X86InstInfo{"VFNMSUB231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xBF), 1, X86InstInfo{"VFNMSUB231_S", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_IGNORE, 0}}, {OPD(2, 0b01, 0xA6), 1, X86InstInfo{"VFMADDSUB213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xA7), 1, X86InstInfo{"VFMSUBADD213", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xB6), 1, X86InstInfo{"VFMADDSUB231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xB7), 1, X86InstInfo{"VFMSUBADD231", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xDB), 1, X86InstInfo{"VAESIMC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xDC), 1, X86InstInfo{"VAESENC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | 
FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xDD), 1, X86InstInfo{"VAESENCLAST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xDE), 1, X86InstInfo{"VAESDEC", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b01, 0xDF), 1, X86InstInfo{"VAESDECLAST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 0}}, {OPD(2, 0b00, 0xF2), 1, X86InstInfo{"ANDN", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_1ST_SRC, 0}}, {OPD(2, 0b00, 0xF3), 1, X86InstInfo{"", TYPE_VEX_GROUP_17, FLAGS_NONE, 0}}, // VEX Group 17 {OPD(2, 0b01, 0xF3), 1, X86InstInfo{"", TYPE_VEX_GROUP_17, FLAGS_NONE, 0}}, // VEX Group 17 {OPD(2, 0b10, 0xF3), 1, X86InstInfo{"", TYPE_VEX_GROUP_17, FLAGS_NONE, 0}}, // VEX Group 17 {OPD(2, 0b11, 0xF3), 1, X86InstInfo{"", TYPE_VEX_GROUP_17, FLAGS_NONE, 0}}, // VEX Group 17 {OPD(2, 0b00, 0xF5), 1, X86InstInfo{"BZHI", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_2ND_SRC, 0}}, // AMD reference manual is incorrect. PEXT actually maps to 0b10, not 0b01. 
{OPD(2, 0b10, 0xF5), 1, X86InstInfo{"PEXT", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC, 0}}, {OPD(2, 0b11, 0xF5), 1, X86InstInfo{"PDEP", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC, 0}}, {OPD(2, 0b11, 0xF6), 1, X86InstInfo{"MULX", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC, 0}}, {OPD(2, 0b00, 0xF7), 1, X86InstInfo{"BEXTR", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_2ND_SRC, 0}}, {OPD(2, 0b01, 0xF7), 1, X86InstInfo{"SHLX", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_2ND_SRC, 0}}, {OPD(2, 0b10, 0xF7), 1, X86InstInfo{"SARX", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_2ND_SRC, 0}}, {OPD(2, 0b11, 0xF7), 1, X86InstInfo{"SHRX", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_2ND_SRC, 0}}, // VEX Map 3 {OPD(3, 0b01, 0x00), 1, X86InstInfo{"VPERMQ", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_1 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x01), 1, X86InstInfo{"VPERMPD", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_1 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x02), 1, X86InstInfo{"VPBLENDD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x04), 1, X86InstInfo{"VPERMILPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x05), 1, X86InstInfo{"VPERMILPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x06), 1, X86InstInfo{"VPERM2F128", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_REX_W_0 | FLAGS_XMM_FLAGS | FLAGS_VEX_L_1, 1}}, {OPD(3, 0b01, 0x08), 1, X86InstInfo{"VROUNDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x09), 1, X86InstInfo{"VROUNDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0A), 
1, X86InstInfo{"VROUNDSS", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0B), 1, X86InstInfo{"VROUNDSD", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_64BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0C), 1, X86InstInfo{"VBLENDPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0D), 1, X86InstInfo{"VBLENDPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0E), 1, X86InstInfo{"VPBLENDW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x0F), 1, X86InstInfo{"VPALIGNR", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x14), 1, X86InstInfo{"VPEXTRB", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x15), 1, X86InstInfo{"VPEXTRW", TYPE_INST, GenFlagsSizes(SIZE_16BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x16), 1, X86InstInfo{"VPEXTRD", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x17), 1, X86InstInfo{"VEXTRACTPS", TYPE_INST, GenFlagsSizes(SIZE_32BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_SF_MOD_DST | FLAGS_SF_DST_GPR | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x18), 1, X86InstInfo{"VINSERTF128", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x19), 1, X86InstInfo{"VEXTRACTF128", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_SF_MOD_DST | 
FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x1D), 1, X86InstInfo{"VCVTPS2PH", TYPE_INST, GenFlagsSizes(SIZE_64BIT, SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x20), 1, X86InstInfo{"VPINSRB", TYPE_INST, GenFlagsDstSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 1}}, {OPD(3, 0b01, 0x21), 1, X86InstInfo{"VINSERTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x22), 1, X86InstInfo{"VPINSR{D,Q}", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_SF_SRC_GPR, 1}}, {OPD(3, 0b01, 0x38), 1, X86InstInfo{"VINSERTI128", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x39), 1, X86InstInfo{"VEXTRACTI128", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_SF_MOD_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x40), 1, X86InstInfo{"VDPPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x41), 1, X86InstInfo{"VDPPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(3, 0b01, 0x42), 1, X86InstInfo{"VMPSADBW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x44), 1, X86InstInfo{"VPCLMULQDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x46), 1, X86InstInfo{"VPERM2I128", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_L_1 | FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x4A), 1, X86InstInfo{"VBLENDVPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | 
FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x4B), 1, X86InstInfo{"VBLENDVPD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x4C), 1, X86InstInfo{"VPBLENDVB", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_REX_W_0 | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b01, 0x5C), 1, X86InstInfo{"VFMADDSUBPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x5D), 1, X86InstInfo{"VFMADDSUBPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x5E), 1, X86InstInfo{"VFMSUBADDPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x5F), 1, X86InstInfo{"VFMSUBADDPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x60), 1, X86InstInfo{"VPCMPESTRM", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(3, 0b01, 0x61), 1, X86InstInfo{"VPCMPESTRI", TYPE_INST, GenFlagsSizes(SIZE_128BIT, SIZE_32BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(3, 0b01, 0x62), 1, X86InstInfo{"VPCMPISTRM", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(3, 0b01, 0x63), 1, X86InstInfo{"VPCMPISTRI", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_XMM_FLAGS | FLAGS_VEX_L_0, 1}}, {OPD(3, 0b01, 0x68), 1, X86InstInfo{"VFMADDPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x69), 1, X86InstInfo{"VFMADDPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6A), 1, X86InstInfo{"VFMADDSS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6B), 1, X86InstInfo{"VFMADDSD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6C), 1, X86InstInfo{"VFMSUBPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6D), 1, X86InstInfo{"VFMSUBPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6E), 1, X86InstInfo{"VFMSUBSS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x6F), 1, 
X86InstInfo{"VFMSUBSD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x78), 1, X86InstInfo{"VFNMADDPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x79), 1, X86InstInfo{"VFNMADDPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7A), 1, X86InstInfo{"VFNMADDSS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7B), 1, X86InstInfo{"VFNMADDSD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7C), 1, X86InstInfo{"VFNMSUBPS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7D), 1, X86InstInfo{"VFNMSUBPD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7E), 1, X86InstInfo{"VFNMSUBSS", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0x7F), 1, X86InstInfo{"VFNMSUBSD", TYPE_UNDEC, FLAGS_NONE, 0}}, ///< FMA4 {OPD(3, 0b01, 0xDF), 1, X86InstInfo{"VAESKEYGENASSIST", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_L_0 | FLAGS_XMM_FLAGS, 1}}, {OPD(3, 0b11, 0xF0), 1, X86InstInfo{"RORX", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_L_0, 1}}, // VEX Map 4 - 31 (Reserved) }; #undef OPD GenerateTable(Table.data(), VEXTable, std::size(VEXTable)); IR::InstallToTable(Table, IR::OpDispatch_VEXTable); IR::InstallToTable(Table, RuntimeTable); return Table; }; auto GroupTableLambda = [](const auto RuntimeTable) consteval { std::array Table{}; #define OPD(group, pp, opcode) (((group - TYPE_VEX_GROUP_12) << 4) | (pp << 3) | (opcode)) constexpr U8U8InfoStruct VEXGroupTable[] = { {OPD(TYPE_VEX_GROUP_12, 1, 0b010), 1, X86InstInfo{"VPSRLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_12, 1, 0b100), 1, X86InstInfo{"VPSRAW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_12, 1, 0b110), 1, X86InstInfo{"VPSLLW", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, 
{OPD(TYPE_VEX_GROUP_13, 1, 0b010), 1, X86InstInfo{"VPSRLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_13, 1, 0b100), 1, X86InstInfo{"VPSRAD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_13, 1, 0b110), 1, X86InstInfo{"VPSLLD", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_14, 1, 0b010), 1, X86InstInfo{"VPSRLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_14, 1, 0b011), 1, X86InstInfo{"VPSRLDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_14, 1, 0b110), 1, X86InstInfo{"VPSLLQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_14, 1, 0b111), 1, X86InstInfo{"VPSLLDQ", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_SF_MOD_REG_ONLY | FLAGS_VEX_DST | FLAGS_XMM_FLAGS, 1}}, {OPD(TYPE_VEX_GROUP_15, 0, 0b010), 1, X86InstInfo{"VLDMXCSR", TYPE_INST, GenFlagsSameSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_L_0 | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_VEX_GROUP_15, 0, 0b011), 1, X86InstInfo{"VSTMXCSR", TYPE_INST, GenFlagsSameSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_VEX_L_0 | FLAGS_SF_MOD_MEM_ONLY, 0}}, {OPD(TYPE_VEX_GROUP_17, 0, 0b001), 1, X86InstInfo{"BLSR", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_DST, 0}}, {OPD(TYPE_VEX_GROUP_17, 0, 0b010), 1, X86InstInfo{"BLSMSK", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_DST, 0}}, {OPD(TYPE_VEX_GROUP_17, 0, 0b011), 1, X86InstInfo{"BLSI", TYPE_INST, FLAGS_MODRM | FLAGS_VEX_DST, 0}}, }; #undef OPD GenerateTable(Table.data(), VEXGroupTable, 
std::size(VEXGroupTable)); IR::InstallToTable(Table, IR::OpDispatch_VEXGroupTable); IR::InstallToTable(Table, RuntimeTable); return Table; }; const std::array VEXTableOps = BaseTableLambda(std::to_array(AVX256::BaseTable)); const std::array VEXTableGroupOps = GroupTableLambda(std::to_array(AVX256::TableGroupOps)); const std::array VEXTableOps_AVX128 = BaseTableLambda(std::to_array(AVX128::BaseTable)); const std::array VEXTableGroupOps_AVX128 = GroupTableLambda(std::to_array(AVX128::TableGroupOps)); } ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/X86Tables.h ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #pragma once #include #include #include #include #include #include #include namespace FEXCore::IR { class OpDispatchBuilder; } namespace FEXCore::X86Tables { struct X86InstInfo; namespace DecodeFlags { constexpr uint32_t FLAG_OPERAND_SIZE = (1 << 0); constexpr uint32_t FLAG_ADDRESS_SIZE = (1 << 1); constexpr uint32_t FLAG_LOCK = (1 << 2); constexpr uint32_t FLAG_LEGACY_PREFIX = (1 << 3); constexpr uint32_t FLAG_REX_PREFIX = (1 << 4); constexpr uint32_t FLAG_VSIB_BYTE = (1 << 5); constexpr uint32_t FLAG_OPTION_AVX_W = (1 << 6); constexpr uint32_t FLAG_REX_WIDENING = (1 << 7); constexpr uint32_t FLAG_REX_XGPR_B = (1 << 8); constexpr uint32_t FLAG_REX_XGPR_X = (1 << 9); constexpr uint32_t FLAG_REX_XGPR_R = (1 << 10); constexpr uint32_t FLAG_NO_PREFIX = (0b000 << 11); constexpr uint32_t FLAG_ES_PREFIX = (0b001 << 11); constexpr uint32_t FLAG_CS_PREFIX = (0b010 << 11); constexpr uint32_t FLAG_SS_PREFIX = (0b011 << 11); constexpr uint32_t FLAG_DS_PREFIX = (0b100 << 11); constexpr uint32_t FLAG_FS_PREFIX = (0b101 << 11); constexpr uint32_t FLAG_GS_PREFIX = (0b110 << 11); constexpr uint32_t FLAG_SEGMENTS = (0b111 << 11); constexpr uint32_t FLAG_FORCE_TSO = (1 << 14); constexpr uint32_t FLAG_DECODED_MODRM = (1 << 15); constexpr uint32_t 
FLAG_DECODED_SIB = (1 << 16); constexpr uint32_t FLAG_REP_PREFIX = (1 << 17); constexpr uint32_t FLAG_REPNE_PREFIX = (1 << 18); // Size flags constexpr uint32_t FLAG_SIZE_DST_OFF = 19; constexpr uint32_t FLAG_SIZE_SRC_OFF = FLAG_SIZE_DST_OFF + 3; constexpr uint32_t SIZE_MASK = 0b111; constexpr uint32_t SIZE_DEF = 0b000; // This should be invalid past decoding constexpr uint32_t SIZE_8BIT = 0b001; constexpr uint32_t SIZE_16BIT = 0b010; constexpr uint32_t SIZE_32BIT = 0b011; constexpr uint32_t SIZE_64BIT = 0b100; constexpr uint32_t SIZE_128BIT = 0b101; constexpr uint32_t SIZE_256BIT = 0b110; constexpr uint32_t FLAG_OPADDR_OFF = (FLAG_SIZE_SRC_OFF + 3); constexpr uint32_t FLAG_OPADDR_STACKSIZE = 4; // Two level deep stack constexpr uint32_t FLAG_OPADDR_FLAG_SIZE = 2; constexpr uint32_t FLAG_OPADDR_MASK = (((1 << FLAG_OPADDR_STACKSIZE) - 1) << FLAG_OPADDR_OFF); // 00 = NONE constexpr uint32_t FLAG_OPERAND_SIZE_LAST = 0b01; constexpr uint32_t FLAG_WIDENING_SIZE_LAST = 0b10; constexpr uint32_t GetSizeDstFlags(uint32_t Flags) { return (Flags >> FLAG_SIZE_DST_OFF) & SIZE_MASK; } constexpr uint32_t GetSizeSrcFlags(uint32_t Flags) { return (Flags >> FLAG_SIZE_SRC_OFF) & SIZE_MASK; } constexpr uint32_t GenSizeDstSize(uint32_t Size) { return Size << FLAG_SIZE_DST_OFF; } constexpr uint32_t GenSizeSrcSize(uint32_t Size) { return Size << FLAG_SIZE_SRC_OFF; } constexpr uint32_t GetOpAddr(uint32_t Flags, uint32_t Index) { return (((Flags & FLAG_OPADDR_MASK) >> FLAG_OPADDR_OFF) >> (Index * 2)) & ((1 << FLAG_OPADDR_FLAG_SIZE) - 1); } inline void PushOpAddr(uint32_t* Flags, uint32_t Flag) { uint32_t TmpFlags = *Flags; uint32_t BottomOfStack = ((TmpFlags & FLAG_OPADDR_MASK) >> FLAG_OPADDR_OFF) & ((1 << FLAG_OPADDR_FLAG_SIZE) - 1); TmpFlags &= ~(FLAG_OPADDR_MASK); TmpFlags |= (BottomOfStack << (FLAG_OPADDR_OFF + FLAG_OPADDR_FLAG_SIZE)) | (Flag << FLAG_OPADDR_OFF); *Flags = TmpFlags; } inline void PopOpAddrIf(uint32_t* Flags, uint32_t Flag) { uint32_t TmpFlags = *Flags; uint32_t 
BottomOfStack = ((TmpFlags & FLAG_OPADDR_MASK) >> FLAG_OPADDR_OFF) & ((1 << FLAG_OPADDR_FLAG_SIZE) - 1); // Only pop the stack if the bottom flag is the one we care about // Necessary for escape prefixes that overlap regular prefixes if (BottomOfStack != Flag) { return; } uint32_t TopOfStack = ((TmpFlags & FLAG_OPADDR_MASK) >> (FLAG_OPADDR_OFF + FLAG_OPADDR_FLAG_SIZE)) & ((1 << FLAG_OPADDR_FLAG_SIZE) - 1); TmpFlags &= ~(FLAG_OPADDR_MASK); TmpFlags |= (TopOfStack << FLAG_OPADDR_OFF); *Flags = TmpFlags; } } // namespace DecodeFlags struct DecodedOperand { enum class OpType : uint8_t { Nothing, GPR, GPRDirect, GPRIndirect, GPRIndirectRelocation, RIPRelative, RIPRelativeRelocation, Literal, LiteralRelocation, SIB, SIBRelocation }; bool IsNone() const { return Type == OpType::Nothing; } bool IsGPR() const { return Type == OpType::GPR; } bool IsGPRDirect() const { return Type == OpType::GPRDirect; } bool IsGPRIndirect() const { return Type == OpType::GPRIndirect; } bool IsGPRIndirectRelocation() const { return Type == OpType::GPRIndirectRelocation; } bool IsRIPRelative() const { return Type == OpType::RIPRelative; } bool IsRIPRelativeRelocation() const { return Type == OpType::RIPRelativeRelocation; } bool IsLiteral() const { return Type == OpType::Literal; } bool IsLiteralRelocation() const { return Type == OpType::LiteralRelocation; } bool IsSIB() const { return Type == OpType::SIB; } bool IsSIBRelocation() const { return Type == OpType::SIBRelocation; } uint64_t Literal() const { LOGMAN_THROW_A_FMT(IsLiteral(), "Precondition: must be a literal"); return Data.Literal.Value; } union TypeUnion { struct GPRType { bool HighBits; uint8_t GPR; auto operator<=>(const GPRType&) const = default; } GPR; struct { int64_t Displacement; uint8_t GPR; } GPRIndirect; // Shared with GPRIndirectRelocation struct { int64_t Value; } RIPLiteral; // Shared with RIPLiteralRelocation struct LiteralType { uint64_t Value; uint8_t Size; } Literal; struct { int64_t EntrypointOffset; } 
LiteralRelocation; struct { int64_t Offset; uint8_t Scale; uint8_t Index; // ~0 invalid uint8_t Base; // ~0 invalid } SIB; // Shared with SIBRelocation }; TypeUnion Data; OpType Type; }; struct DecodedInst { uint64_t PC; DecodedOperand Dest; DecodedOperand Src[3]; // Constains the dispatcher handler pointer const X86InstInfo* TableInfo; uint32_t Flags; uint16_t OP; uint8_t OPRaw; uint8_t ModRM; uint8_t SIB; uint8_t InstSize; int8_t REXIndex; }; union ModRMDecoded { uint8_t Hex {}; struct { uint8_t rm : 3; uint8_t reg : 3; uint8_t mod : 2; }; }; union SIBDecoded { uint8_t Hex {}; struct { uint8_t base : 3; uint8_t index : 3; uint8_t scale : 2; }; }; enum InstType { TYPE_UNKNOWN, TYPE_LEGACY_PREFIX, TYPE_PREFIX, TYPE_REX_PREFIX, TYPE_SECONDARY_TABLE_PREFIX, TYPE_X87_TABLE_PREFIX, TYPE_VEX_TABLE_PREFIX, TYPE_INST, TYPE_X87 = TYPE_INST, TYPE_INVALID, TYPE_COPY_OTHER, // Changes `X86InstInfo::OpcodeDispatcher` member to use the `Indirect` version. // Points to a 2 member array of X86InstInfo to choose instruction description based on executing bitness. 
TYPE_ARCH_DISPATCHER, // Must be in order // Groups 1, 1a, 2, 3, 4, 5, 11 are for the primary op table // Groups 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, p are for the secondary op table TYPE_GROUP_1, TYPE_GROUP_1A, TYPE_GROUP_2, TYPE_GROUP_3, TYPE_GROUP_4, TYPE_GROUP_5, TYPE_GROUP_11, // Must be in order // Groups 6-p Are for the secondary op table TYPE_GROUP_6, TYPE_GROUP_7, TYPE_GROUP_8, TYPE_GROUP_9, TYPE_GROUP_10, TYPE_GROUP_12, TYPE_GROUP_13, TYPE_GROUP_14, TYPE_GROUP_15, TYPE_GROUP_16, TYPE_GROUP_17, TYPE_GROUP_P, // The secondary op extension table allows further extensions // Group 7 allows additional extensions to this table TYPE_SECOND_GROUP_MODRM, TYPE_VEX_GROUP_12, TYPE_VEX_GROUP_13, TYPE_VEX_GROUP_14, TYPE_VEX_GROUP_15, TYPE_VEX_GROUP_17, TYPE_GROUP_EVEX, // Exists in the table but isn't decoded correctly TYPE_UNDEC = TYPE_INVALID, TYPE_MMX = TYPE_INVALID, TYPE_PRIV = TYPE_INVALID, TYPE_0F38_TABLE = TYPE_INVALID, TYPE_0F3A_TABLE = TYPE_INVALID, TYPE_3DNOW_TABLE = TYPE_INVALID, }; namespace InstFlags { using InstFlagType = uint64_t; constexpr InstFlagType FLAGS_NONE = 0; // The secondary Opcode Map uses prefix bytes to overlay more instruction // But some instructions need to ignore this overlay and consume these prefixes. 
constexpr InstFlagType FLAGS_NO_OVERLAY = (1ULL << 0);
// Some instructions partially ignore overlay
// Ignore OpSize (0x66) in this case
constexpr InstFlagType FLAGS_NO_OVERLAY66 = (1ULL << 1);
constexpr InstFlagType FLAGS_DEBUG_MEM_ACCESS = (1ULL << 2);
// Only SEXT if the instruction is operating in 64bit operand size
constexpr InstFlagType FLAGS_SRC_SEXT64BIT = (1ULL << 3);
constexpr InstFlagType FLAGS_BLOCK_END = (1ULL << 4);
constexpr InstFlagType FLAGS_SETS_RIP = (1ULL << 5);

constexpr InstFlagType FLAGS_DISPLACE_SIZE_MUL_2 = (1ULL << 6);
constexpr InstFlagType FLAGS_DISPLACE_SIZE_DIV_2 = (1ULL << 7);
constexpr InstFlagType FLAGS_SRC_SEXT = (1ULL << 8);
constexpr InstFlagType FLAGS_MEM_OFFSET = (1ULL << 9);

// Enables XMM based subflags
// Current reserved range for this SF is [10, 15]
constexpr InstFlagType FLAGS_XMM_FLAGS = (1ULL << 10);

// X87 flags aliased to XMM flags selection
// Allows X87 instruction table that is abusing the flag for 64BIT selection to work
constexpr InstFlagType FLAGS_X87_FLAGS = (1ULL << 10);

// Non-XMM subflags
// (bits 11-15 carry these meanings only while FLAGS_XMM_FLAGS is clear; see HAS_NON_XMM_SUBFLAG)
constexpr InstFlagType FLAGS_SF_DST_RAX = (1ULL << 11);
constexpr InstFlagType FLAGS_SF_DST_RDX = (1ULL << 12);
constexpr InstFlagType FLAGS_SF_SRC_RAX = (1ULL << 13);
constexpr InstFlagType FLAGS_SF_SRC_RCX = (1ULL << 14);
constexpr InstFlagType FLAGS_SF_REX_IN_BYTE = (1ULL << 15);

// XMM subflags
// (the same bits 11-15, reinterpreted while FLAGS_XMM_FLAGS is set; see HAS_XMM_SUBFLAG)
constexpr InstFlagType FLAGS_SF_UNUSED = (1ULL << 11); // No assigned behavior yet
constexpr InstFlagType FLAGS_SF_DST_GPR = (1ULL << 12);
constexpr InstFlagType FLAGS_SF_SRC_GPR = (1ULL << 13);
constexpr InstFlagType FLAGS_SF_MMX_DST = (1ULL << 14);
constexpr InstFlagType FLAGS_SF_MMX_SRC = (1ULL << 15);
constexpr InstFlagType FLAGS_SF_MMX = FLAGS_SF_MMX_DST | FLAGS_SF_MMX_SRC;

// Enables MODRM specific subflags
// Current reserved range for this SF is [16, 20]
constexpr InstFlagType FLAGS_MODRM = (1ULL << 16);

// With ModRM SF flag enabled
// Direction of ModRM. Dst ^ Src
// Set means destination is rm bits
// Unset means src is rm bits
constexpr InstFlagType FLAGS_SF_MOD_DST = (1ULL << 17);

// If the instruction is restricted to mem or reg only
// 0b00 = Regular ModRM support
// 0b01 = Memory accesses only
// 0b10 = Register accesses only
// 0b11 = (no meaning documented)
constexpr InstFlagType FLAGS_SF_MOD_MEM_ONLY = (1ULL << 18);
constexpr InstFlagType FLAGS_SF_MOD_REG_ONLY = (1ULL << 19);
constexpr InstFlagType FLAGS_SF_MOD_ZERO_REG = (1ULL << 20);

// x87
constexpr InstFlagType FLAGS_POP = (1ULL << 21);

// Whether or not the instruction has a VEX prefix for the dest, first, or second source.
constexpr InstFlagType FLAGS_VEX_SRC_MASK = (0b11ULL << 22);
constexpr InstFlagType FLAGS_VEX_NO_OPERAND = (0b00ULL << 22);
constexpr InstFlagType FLAGS_VEX_DST = (0b01ULL << 22);
constexpr InstFlagType FLAGS_VEX_1ST_SRC = (0b10ULL << 22);
constexpr InstFlagType FLAGS_VEX_2ND_SRC = (0b11ULL << 22);

// Whether or not the instruction has a VSIB byte
constexpr InstFlagType FLAGS_VEX_VSIB = (1ULL << 24);

constexpr InstFlagType FLAGS_VEX_L_IGNORE = (1ULL << 25);
constexpr InstFlagType FLAGS_VEX_L_0 = (1ULL << 26);
constexpr InstFlagType FLAGS_VEX_L_1 = (1ULL << 27);

constexpr InstFlagType FLAGS_REX_W_0 = (1ULL << 28);
constexpr InstFlagType FLAGS_REX_W_1 = (1ULL << 29);

constexpr InstFlagType FLAGS_CALL = (1ULL << 30);

// Operand size codes live in the top six bits: destination at [60:58], source at [63:61].
constexpr InstFlagType FLAGS_SIZE_DST_OFF = 58;
constexpr InstFlagType FLAGS_SIZE_SRC_OFF = FLAGS_SIZE_DST_OFF + 3;

constexpr InstFlagType SIZE_MASK = 0b111;
constexpr InstFlagType SIZE_DEF = 0b000;
constexpr InstFlagType SIZE_8BIT = 0b001;
constexpr InstFlagType SIZE_16BIT = 0b010;
constexpr InstFlagType SIZE_32BIT = 0b011;
constexpr InstFlagType SIZE_64BIT = 0b100;
constexpr InstFlagType SIZE_128BIT = 0b101;
constexpr InstFlagType SIZE_256BIT = 0b110;
constexpr InstFlagType SIZE_64BITDEF = 0b111; // Default mode is 64bit instead of typical 32bit

#ifndef _WIN32
constexpr uint32_t DEFAULT_SYSCALL_FLAGS = FLAGS_NO_OVERLAY;
#else
// Syscall ends a block on WIN32 because the instruction can update the CPU's RIP.
constexpr uint32_t DEFAULT_SYSCALL_FLAGS = FLAGS_NO_OVERLAY | FLAGS_BLOCK_END;
#endif

/// Extracts the 3-bit destination size code from Flags.
constexpr InstFlagType GetSizeDstFlags(InstFlagType Flags) {
  return (Flags >> FLAGS_SIZE_DST_OFF) & SIZE_MASK;
}

/// Extracts the 3-bit source size code from Flags.
constexpr InstFlagType GetSizeSrcFlags(InstFlagType Flags) {
  return (Flags >> FLAGS_SIZE_SRC_OFF) & SIZE_MASK;
}

/// Positions a SIZE_* code in the destination size field only.
constexpr InstFlagType GenFlagsDstSize(InstFlagType Size) {
  return Size << FLAGS_SIZE_DST_OFF;
}

/// Positions a SIZE_* code in the source size field only.
constexpr InstFlagType GenFlagsSrcSize(InstFlagType Size) {
  return Size << FLAGS_SIZE_SRC_OFF;
}

/// Uses the same SIZE_* code for both the destination and the source field.
constexpr InstFlagType GenFlagsSameSize(InstFlagType Size) {
  return (Size << FLAGS_SIZE_DST_OFF) | (Size << FLAGS_SIZE_SRC_OFF);
}

/// Encodes independent destination and source SIZE_* codes.
constexpr InstFlagType GenFlagsSizes(InstFlagType Dest, InstFlagType Src) {
  return (Dest << FLAGS_SIZE_DST_OFF) | (Src << FLAGS_SIZE_SRC_OFF);
}

// If it has an xmm subflag
#define HAS_XMM_SUBFLAG(x, flag) \
  (((x) & (FEXCore::X86Tables::InstFlags::FLAGS_XMM_FLAGS | (flag))) == (FEXCore::X86Tables::InstFlags::FLAGS_XMM_FLAGS | (flag)))

// If it has non-xmm subflag
#define HAS_NON_XMM_SUBFLAG(x, flag) (((x) & (FEXCore::X86Tables::InstFlags::FLAGS_XMM_FLAGS | (flag))) == (flag))
} // namespace InstFlags

/// Maps a primary-table group opcode byte to its index within its own group.
/// Opcodes not listed below map to 0.
constexpr uint8_t OpToIndex(uint8_t Op) {
  switch (Op) {
  // Group 1
  case 0x80: return 0;
  case 0x81: return 1;
  case 0x82: return 2;
  case 0x83: return 3;
  // Group 2
  case 0xC0: return 0;
  case 0xC1: return 1;
  case 0xD0: return 2;
  case 0xD1: return 3;
  case 0xD2: return 4;
  case 0xD3: return 5;
  // Group 3
  case 0xF6: return 0;
  case 0xF7: return 1;
  // Group 4
  case 0xFE: return 0;
  // Group 5
  case 0xFF: return 0;
  // Group 11
  case 0xC6: return 0;
  case 0xC7: return 1;
  }
  return 0;
}

using DecodedOp = const DecodedInst*;
using OpDispatchPtr = void (IR::OpDispatchBuilder::*)(DecodedOp);

// Either a direct handler, or (see InstType::TYPE_ARCH_DISPATCHER) a pointer to further X86InstInfo entries.
union OpDispatchPtrWrapper {
  OpDispatchPtr OpDispatch;
  const struct X86InstInfo* Indirect;
};

/// Static description of one opcode table entry.
struct X86InstInfo {
  const char* Name;
  InstType Type;
  InstFlags::InstFlagType Flags; ///< Must be larger than InstFlags enum
  uint8_t MoreBytes;
  OpDispatchPtrWrapper OpcodeDispatcher;

  /// Compares name (by content), type, flags and MoreBytes.
  bool operator==(const X86InstInfo& b) const {
    if (strcmp(Name, b.Name) != 0 || Type != b.Type || Flags != b.Flags || MoreBytes != b.MoreBytes) {
      return false;
    }

    // We don't care if the opcode dispatcher differs
    return true;
  }

  bool operator!=(const X86InstInfo& b) const {
    return !operator==(b);
  }
};

// NOTE(review): the template argument was stripped by the text extraction; X86InstInfo is the type
// defined immediately above and the one GenerateTable copies by assignment.
static_assert(std::is_trivially_copyable_v<X86InstInfo>);

constexpr size_t MAX_PRIMARY_TABLE_SIZE = 256;
constexpr size_t MAX_SECOND_TABLE_SIZE = 256;
constexpr size_t MAX_REP_MOD_TABLE_SIZE = 256;
constexpr size_t MAX_REPNE_MOD_TABLE_SIZE = 256;
constexpr size_t MAX_OPSIZE_MOD_TABLE_SIZE = 256;
// 6 (groups) | 6 (max indexes) | 8 ops = 0b111'111'111 = 9 bits
constexpr size_t MAX_INST_GROUP_TABLE_SIZE = 512;
// 12 (groups) | 3(max indexes) | 8 ops = 0b1111'11'111 = 9 bits
constexpr size_t MAX_INST_SECOND_GROUP_TABLE_SIZE = 512;
constexpr size_t MAX_X87_TABLE_SIZE = 1 << 11;
constexpr size_t MAX_SECOND_MODRM_TABLE_SIZE = 32;
// (3 bit prefixes) | 8 bit opcode
constexpr size_t MAX_0F_38_TABLE_SIZE = (1 << 11);
// 1 REX | 1 prefixes | 8 bit opcode
constexpr size_t MAX_0F_3A_TABLE_SIZE = (1 << 11);
constexpr size_t MAX_3DNOW_TABLE_SIZE = 256;
// VEX
// map_select(2 bits for now) | vex.pp (2 bits) | opcode (8bit)
constexpr size_t MAX_VEX_TABLE_SIZE = (1 << 13);
// VEX group ops
// group select (3 bits for now) | pp (1 bit) | ModRM opcode (3 bits)
constexpr size_t MAX_VEX_GROUP_TABLE_SIZE = (1 << 7);

// NOTE(review): the std::array template arguments below were stripped by the text extraction.
// They are reconstructed by pairing each table with the matching MAX_*_TABLE_SIZE constant above;
// confirm the pairing against upstream.
extern const std::array<X86InstInfo, MAX_PRIMARY_TABLE_SIZE> BaseOps;
extern const std::array<X86InstInfo, MAX_SECOND_TABLE_SIZE> SecondBaseOps;
extern const std::array<X86InstInfo, MAX_REP_MOD_TABLE_SIZE> RepModOps;
extern const std::array<X86InstInfo, MAX_REPNE_MOD_TABLE_SIZE> RepNEModOps;
extern const std::array<X86InstInfo, MAX_OPSIZE_MOD_TABLE_SIZE> OpSizeModOps;
extern const std::array<X86InstInfo, MAX_INST_GROUP_TABLE_SIZE> PrimaryInstGroupOps;
extern const std::array<X86InstInfo, MAX_INST_SECOND_GROUP_TABLE_SIZE> SecondInstGroupOps;
extern const std::array<X86InstInfo, MAX_SECOND_MODRM_TABLE_SIZE> SecondModRMTableOps;
extern const std::array<X86InstInfo, MAX_X87_TABLE_SIZE> X87F80Ops;
extern const std::array<X86InstInfo, MAX_X87_TABLE_SIZE> X87F64Ops;
extern const std::array<X86InstInfo, MAX_3DNOW_TABLE_SIZE> DDDNowOps;
extern const std::array<X86InstInfo, MAX_0F_38_TABLE_SIZE> H0F38TableOps;
extern const std::array<X86InstInfo, MAX_0F_3A_TABLE_SIZE> H0F3ATableOps;

// VEX
extern const std::array<X86InstInfo, MAX_VEX_TABLE_SIZE> VEXTableOps;
extern
const std::array VEXTableGroupOps; extern const std::array VEXTableOps_AVX128; extern const std::array VEXTableGroupOps_AVX128; template struct X86TablesInfoStruct { OpcodeType first; uint8_t second; X86InstInfo Info; }; using U8U8InfoStruct = X86TablesInfoStruct; using U16U8InfoStruct = X86TablesInfoStruct; template constexpr static inline void GenerateTable(X86InstInfo* FinalTable, const X86TablesInfoStruct* LocalTable, size_t TableSize) { for (size_t j = 0; j < TableSize; ++j) { const X86TablesInfoStruct& Op = LocalTable[j]; auto OpNum = Op.first; const X86InstInfo& Info = Op.Info; for (uint32_t i = 0; i < Op.second; ++i) { if (FinalTable[OpNum + i].Type != TYPE_UNKNOWN) { LOGMAN_MSG_A_FMT("Duplicate Entry {}->{}", FinalTable[OpNum + i].Name, Info.Name); } if (FinalTable[OpNum + i].OpcodeDispatcher.OpDispatch) { LOGMAN_MSG_A_FMT("Already installed an OpcodeDispatcher for 0x{:x}", OpNum + i); } FinalTable[OpNum + i] = Info; } } }; template constexpr static inline void GenerateTableWithCopy(X86InstInfo* FinalTable, const X86TablesInfoStruct* LocalTable, size_t TableSize, const X86InstInfo* OtherLocal) { for (size_t j = 0; j < TableSize; ++j) { const X86TablesInfoStruct& Op = LocalTable[j]; auto OpNum = Op.first; const X86InstInfo& Info = Op.Info; for (uint32_t i = 0; i < Op.second; ++i) { if (FinalTable[OpNum + i].Type != TYPE_UNKNOWN) { LOGMAN_MSG_A_FMT("Duplicate Entry {}->{}", FinalTable[OpNum + i].Name, Info.Name); } if (Info.Type == TYPE_COPY_OTHER) { FinalTable[OpNum + i] = OtherLocal[OpNum + i]; } else { FinalTable[OpNum + i] = Info; } } } }; template constexpr static inline void GenerateX87Table(X86InstInfo* FinalTable, const X86TablesInfoStruct* LocalTable, size_t TableSize) { for (size_t j = 0; j < TableSize; ++j) { const X86TablesInfoStruct& Op = LocalTable[j]; auto OpNum = Op.first; const X86InstInfo& Info = Op.Info; for (uint32_t i = 0; i < Op.second; ++i) { if (FinalTable[OpNum + i].Type != TYPE_UNKNOWN) { LOGMAN_MSG_A_FMT("Duplicate Entry {}->{}", 
FinalTable[OpNum + i].Name, Info.Name); } if ((OpNum & 0b11'000'000) == 0b11'000'000) { // If the mod field is 0b11 then it is a regular op FinalTable[OpNum + i] = Info; } else { // If the mod field is !0b11 then this instruction is duplicated through the whole mod [0b00, 0b10] range // and the modrm.rm space because that is used part of the instruction encoding if ((OpNum & 0b11'000'000) != 0) { ERROR_AND_DIE_FMT("Only support mod field of zero in this path"); } for (uint16_t mod = 0b00'000'000; mod < 0b11'000'000; mod += 0b01'000'000) { for (uint16_t rm = 0b000; rm < 0b1'000; ++rm) { FinalTable[(OpNum | mod | rm) + i] = Info; } } } } } }; FEX_DEFINE_ENUM_FMT_PASSTHROUGH(FEXCore::X86Tables::DecodedOperand::OpType); } // namespace FEXCore::X86Tables ================================================ FILE: FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: frontend|x86-tables $end_info$ */ #include "Interface/Core/X86Tables/X86Tables.h" #include "Interface/Core/OpcodeDispatcher.h" #include namespace FEXCore::X86Tables { using namespace InstFlags; using namespace IR; // Top bit indicating if it needs to be repeated with {0x40, 0x80} or'd in // All OPDReg versions need it #define OPDReg(op, reg) ((1 << 15) | ((op - 0xD8) << 8) | (reg << 3)) #define OPD(op, modrmop) (((op - 0xD8) << 8) | modrmop) constexpr std::array X87F64OpTable = {{ {OPDReg(0xD8, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::i32Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::i32Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i32Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xD8, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i32Bit, 
false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xD8, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i32Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i32Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i32Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i32Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xD8, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xD8, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD9, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::i32Bit>}, // 1 = Invalid {OPDReg(0xD9, 2) | 0x00, 8, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i32Bit>}, {OPDReg(0xD9, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i32Bit>}, {OPDReg(0xD9, 4) | 0x00, 8, &OpDispatchBuilder::X87LDENVF64}, {OPDReg(0xD9, 5) | 0x00, 8, &OpDispatchBuilder::X87FLDCWF64}, {OPDReg(0xD9, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSTENV}, {OPDReg(0xD9, 7) | 0x00, 8, &OpDispatchBuilder::X87FSTCW}, {OPD(0xD9, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDFromStack>}, {OPD(0xD9, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xD9, 0xD0), 1, &OpDispatchBuilder::NOPOp}, // FNOP // D1 = Invalid // D8 = Invalid {OPD(0xD9, 0xE0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80STACKCHANGESIGN, false>}, {OPD(0xD9, 0xE1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80STACKABS, false>}, // E2 = Invalid {OPD(0xD9, 0xE4), 1, &OpDispatchBuilder::FTSTF64}, {OPD(0xD9, 0xE5), 1, &OpDispatchBuilder::X87FXAM}, // E6 = Invalid {OPD(0xD9, 0xE8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x3FF0000000000000>}, // 1.0 {OPD(0xD9, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x400A934F0979A372>}, // log2l(10) {OPD(0xD9, 0xEA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x3FF71547652B82FE>}, // log2l(e) {OPD(0xD9, 0xEB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x400921FB54442D18>}, // pi {OPD(0xD9, 0xEC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x3FD34413509F79FF>}, // log10l(2) {OPD(0xD9, 0xED), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0x3FE62E42FEFA39EF>}, // log(2) {OPD(0xD9, 0xEE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64_Const, 0>}, // 0.0 // EF = Invalid {OPD(0xD9, 0xF0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80F2XM1STACK, false>}, {OPD(0xD9, 0xF1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87FYL2X, false>}, {OPD(0xD9, 0xF2), 1, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80PTANSTACK, true>}, {OPD(0xD9, 0xF3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80ATANSTACK, false>}, {OPD(0xD9, 0xF4), 1, &OpDispatchBuilder::X87FXTRACTF64}, {OPD(0xD9, 0xF5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80FPREM1STACK, true>}, {OPD(0xD9, 0xF6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, false>}, {OPD(0xD9, 0xF7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, true>}, {OPD(0xD9, 0xF8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80FPREMSTACK, true>}, {OPD(0xD9, 0xF9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87FYL2X, true>}, {OPD(0xD9, 0xFA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SQRTSTACK, false>}, {OPD(0xD9, 0xFB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SINCOSSTACK, true>}, {OPD(0xD9, 0xFC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80ROUNDSTACK, false>}, {OPD(0xD9, 0xFD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SCALESTACK, false>}, {OPD(0xD9, 0xFE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SINSTACK, true>}, {OPD(0xD9, 0xFF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80COSSTACK, true>}, {OPDReg(0xDA, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::i32Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::i32Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i32Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDA, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i32Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDA, 4) | 0x00, 8, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i32Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i32Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i32Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i32Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDA, 0xC0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xC8), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xD0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xD8), 8, &OpDispatchBuilder::X87FCMOV}, // E0 = Invalid // E8 = Invalid {OPD(0xDA, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, true>}, // EA = Invalid // F0 = Invalid // F8 = Invalid {OPDReg(0xDB, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64}, {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>}, {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>}, {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>}, // 4 = Invalid {OPDReg(0xDB, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::f80Bit>}, // 6 = Invalid {OPDReg(0xDB, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::f80Bit>}, {OPD(0xDB, 0xC0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xC8), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xD0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xD8), 8, &OpDispatchBuilder::X87FCMOV}, // E0 = Invalid {OPD(0xDB, 0xE2), 1, &OpDispatchBuilder::FNCLEX}, {OPD(0xDB, 0xE3), 1, &OpDispatchBuilder::FNINIT}, // E4 = Invalid {OPD(0xDB, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, 
false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, {OPD(0xDB, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, // F8 = Invalid {OPDReg(0xDC, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::i64Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::i64Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i64Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDC, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i64Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDC, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i64Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i64Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i64Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i64Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDC, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDC, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, 
OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDC, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPDReg(0xDD, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::i64Bit>}, {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>}, {OPDReg(0xDD, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>}, {OPDReg(0xDD, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>}, {OPDReg(0xDD, 4) | 0x00, 8, &OpDispatchBuilder::X87FRSTOR}, // 5 = Invalid {OPDReg(0xDD, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSAVE}, {OPDReg(0xDD, 7) | 0x00, 8, &OpDispatchBuilder::X87FNSTSW}, {OPD(0xDD, 0xC0), 8, &OpDispatchBuilder::X87FFREE}, {OPD(0xDD, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xDD, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, // register-register from regular X87 {OPD(0xDD, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, //^ {OPD(0xDD, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDD, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::i16Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 1) | 0x00, 8, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::i16Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i16Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::i16Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i16Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::i16Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i16Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::i16Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDE, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADDF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMULF64, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDE, 0xD9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, true>}, {OPD(0xDE, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUBF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, true, 
OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIVF64, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPDReg(0xDF, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64}, {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>}, {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>}, {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>}, {OPDReg(0xDF, 4) | 0x00, 8, &OpDispatchBuilder::FBLDF64}, {OPDReg(0xDF, 5) | 0x00, 8, &OpDispatchBuilder::FILDF64}, {OPDReg(0xDF, 6) | 0x00, 8, &OpDispatchBuilder::FBSTPF64}, {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>}, // XXX: This should also set the x87 tag bits to empty // We don't support this currently, so just pop the stack {OPD(0xDF, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, true>}, {OPD(0xDF, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xDF, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDF, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDF, 0xE0), 8, &OpDispatchBuilder::X87FNSTSW}, {OPD(0xDF, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, {OPD(0xDF, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, }}; constexpr std::array X87F80OpTable = {{ {OPDReg(0xD8, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::i32Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::i32Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i32Bit, false, 
OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xD8, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i32Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xD8, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i32Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i32Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i32Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD8, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i32Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xD8, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xD8, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xD8, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xD9, 0) | 0x00, 8, 
&OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD, OpSize::i32Bit>}, // 1 = Invalid {OPDReg(0xD9, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i32Bit>}, {OPDReg(0xD9, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i32Bit>}, {OPDReg(0xD9, 4) | 0x00, 8, &OpDispatchBuilder::X87LDENV}, {OPDReg(0xD9, 5) | 0x00, 8, &OpDispatchBuilder::X87FLDCW}, // XXX: stubbed FLDCW {OPDReg(0xD9, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSTENV}, {OPDReg(0xD9, 7) | 0x00, 8, &OpDispatchBuilder::X87FSTCW}, {OPD(0xD9, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDFromStack>}, {OPD(0xD9, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xD9, 0xD0), 1, &OpDispatchBuilder::NOPOp}, // FNOP // D1 = Invalid // D8 = Invalid {OPD(0xD9, 0xE0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80STACKCHANGESIGN, false>}, {OPD(0xD9, 0xE1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80STACKABS, false>}, // E2 = Invalid {OPD(0xD9, 0xE4), 1, &OpDispatchBuilder::FTST}, {OPD(0xD9, 0xE5), 1, &OpDispatchBuilder::X87FXAM}, // E6 = Invalid {OPD(0xD9, 0xE8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_ONE>}, // 1.0 {OPD(0xD9, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_LOG2_10>}, // log2l(10) {OPD(0xD9, 0xEA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_LOG2_E>}, // log2l(e) {OPD(0xD9, 0xEB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_PI>}, // pi {OPD(0xD9, 0xEC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_LOG10_2>}, // log10l(2) {OPD(0xD9, 0xED), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, NamedVectorConstant::NAMED_VECTOR_X87_LOG_2>}, // log(2) {OPD(0xD9, 0xEE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD_Const, 
NamedVectorConstant::NAMED_VECTOR_ZERO>}, // 0.0 // EF = Invalid {OPD(0xD9, 0xF0), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80F2XM1STACK, false>}, {OPD(0xD9, 0xF1), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87FYL2X, false>}, {OPD(0xD9, 0xF2), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80PTANSTACK, true>}, {OPD(0xD9, 0xF3), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80ATANSTACK, false>}, {OPD(0xD9, 0xF4), 1, &OpDispatchBuilder::X87FXTRACT}, {OPD(0xD9, 0xF5), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80FPREM1STACK, true>}, {OPD(0xD9, 0xF6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, false>}, {OPD(0xD9, 0xF7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, true>}, {OPD(0xD9, 0xF8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80FPREMSTACK, true>}, {OPD(0xD9, 0xF9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87FYL2X, true>}, {OPD(0xD9, 0xFA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SQRTSTACK, false>}, {OPD(0xD9, 0xFB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SINCOSSTACK, true>}, {OPD(0xD9, 0xFC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80ROUNDSTACK, false>}, {OPD(0xD9, 0xFD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SCALESTACK, false>}, {OPD(0xD9, 0xFE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80SINSTACK, true>}, {OPD(0xD9, 0xFF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87OpHelper, OP_F80COSSTACK, true>}, {OPDReg(0xDA, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::i32Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::i32Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i32Bit, 
true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDA, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i32Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDA, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i32Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i32Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i32Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDA, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i32Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDA, 0xC0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xC8), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xD0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDA, 0xD8), 8, &OpDispatchBuilder::X87FCMOV}, // E0 = Invalid // E8 = Invalid {OPD(0xDA, 0xE9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, true>}, // EA = Invalid // F0 = Invalid // F8 = Invalid {OPDReg(0xDB, 0) | 0x00, 8, &OpDispatchBuilder::FILD}, {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>}, {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>}, {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>}, // 4 = Invalid {OPDReg(0xDB, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD, OpSize::f80Bit>}, // 6 = Invalid {OPDReg(0xDB, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::f80Bit>}, {OPD(0xDB, 0xC0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xC8), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xD0), 8, &OpDispatchBuilder::X87FCMOV}, {OPD(0xDB, 0xD8), 8, &OpDispatchBuilder::X87FCMOV}, // E0 = Invalid 
{OPD(0xDB, 0xE2), 1, &OpDispatchBuilder::FNCLEX}, {OPD(0xDB, 0xE3), 1, &OpDispatchBuilder::FNINIT}, // E4 = Invalid {OPD(0xDB, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, {OPD(0xDB, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, // F8 = Invalid {OPDReg(0xDC, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::i64Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::i64Bit, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i64Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDC, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i64Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDC, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i64Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i64Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i64Bit, false, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDC, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i64Bit, false, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDC, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, 
OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDC, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDC, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDC, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPDReg(0xDD, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLD, OpSize::i64Bit>}, {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>}, {OPDReg(0xDD, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>}, {OPDReg(0xDD, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>}, {OPDReg(0xDD, 4) | 0x00, 8, &OpDispatchBuilder::X87FRSTOR}, // 5 = Invalid {OPDReg(0xDD, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSAVE}, {OPDReg(0xDD, 7) | 0x00, 8, &OpDispatchBuilder::X87FNSTSW}, {OPD(0xDD, 0xC0), 8, &OpDispatchBuilder::X87FFREE}, {OPD(0xDD, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xDD, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDD, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDD, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDD, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::i16Bit, true, 
OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::i16Bit, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i16Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::i16Bit, true, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPDReg(0xDE, 4) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i16Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 5) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::i16Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 6) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i16Bit, true, false, OpDispatchBuilder::OpResult::RES_ST0>}, {OPDReg(0xDE, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::i16Bit, true, true, OpDispatchBuilder::OpResult::RES_ST0>}, {OPD(0xDE, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FADD, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xC8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FMUL, OpSize::f80Bit, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, false>}, {OPD(0xDE, 0xD9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_X87, true>}, {OPD(0xDE, 0xE0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSUB, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, 
OpSize::f80Bit, false, true, OpDispatchBuilder::OpResult::RES_STI>}, {OPD(0xDE, 0xF8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FDIV, OpSize::f80Bit, false, false, OpDispatchBuilder::OpResult::RES_STI>}, {OPDReg(0xDF, 0) | 0x00, 8, &OpDispatchBuilder::FILD}, {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>}, {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>}, {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>}, {OPDReg(0xDF, 4) | 0x00, 8, &OpDispatchBuilder::FBLD}, {OPDReg(0xDF, 5) | 0x00, 8, &OpDispatchBuilder::FILD}, {OPDReg(0xDF, 6) | 0x00, 8, &OpDispatchBuilder::FBSTP}, {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>}, // XXX: This should also set the x87 tag bits to empty // We don't support this currently, so just pop the stack {OPD(0xDF, 0xC0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::X87ModifySTP, true>}, {OPD(0xDF, 0xC8), 8, &OpDispatchBuilder::FXCH}, {OPD(0xDF, 0xD0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDF, 0xD8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTToStack>}, {OPD(0xDF, 0xE0), 8, &OpDispatchBuilder::X87FNSTSW}, {OPD(0xDF, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, {OPD(0xDF, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMI, OpSize::f80Bit, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>}, }}; #undef OPD #undef OPDReg auto GenerateX87TableLambda = [](const auto DispatchTable) consteval { #define OPD(op, modrmop) (((op - 0xD8) << 8) | modrmop) #define OPDReg(op, reg) (((op - 0xD8) << 8) | (reg << 3)) std::array Table{}; constexpr U16U8InfoStruct X87OpTable[] = { // 0xD8 {OPDReg(0xD8, 0), 1, X86InstInfo{"FADD", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 1), 1, X86InstInfo{"FMUL", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 2), 1, 
X86InstInfo{"FCOM", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 3), 1, X86InstInfo{"FCOMP", TYPE_X87, FLAGS_MODRM | FLAGS_POP, 0}}, {OPDReg(0xD8, 4), 1, X86InstInfo{"FSUB", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 5), 1, X86InstInfo{"FSUBR", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 6), 1, X86InstInfo{"FDIV", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD8, 7), 1, X86InstInfo{"FDIVR", TYPE_X87, FLAGS_MODRM, 0}}, // / 0 {OPD(0xD8, 0xC0), 8, X86InstInfo{"FADD", TYPE_X87, FLAGS_NONE, 0}}, // / 1 {OPD(0xD8, 0xC8), 8, X86InstInfo{"FMUL", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xD8, 0xD0), 8, X86InstInfo{"FCOM", TYPE_X87, FLAGS_NONE, 0}}, // / 3 {OPD(0xD8, 0xD8), 8, X86InstInfo{"FCOMP", TYPE_X87, FLAGS_POP, 0}}, // / 4 {OPD(0xD8, 0xE0), 8, X86InstInfo{"FSUB", TYPE_X87, FLAGS_NONE, 0}}, // / 5 {OPD(0xD8, 0xE8), 8, X86InstInfo{"FSUBR", TYPE_X87, FLAGS_NONE, 0}}, // / 6 {OPD(0xD8, 0xF0), 8, X86InstInfo{"FDIV", TYPE_X87, FLAGS_NONE, 0}}, // / 7 {OPD(0xD8, 0xF8), 8, X86InstInfo{"FDIVR", TYPE_X87, FLAGS_NONE, 0}}, // 0xD9 {OPDReg(0xD9, 0), 1, X86InstInfo{"FLD", TYPE_INST, FLAGS_MODRM, 0}}, {OPDReg(0xD9, 1), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPDReg(0xD9, 2), 1, X86InstInfo{"FST", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xD9, 3), 1, X86InstInfo{"FSTP", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xD9, 4), 1, X86InstInfo{"FLDENV", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xD9, 5), 1, X86InstInfo{"FLDCW", TYPE_X87, GenFlagsSameSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xD9, 6), 1, X86InstInfo{"FNSTENV", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xD9, 7), 1, X86InstInfo{"FNSTCW", TYPE_INST, GenFlagsSameSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, // / 0 {OPD(0xD9, 0xC0), 8, X86InstInfo{"FLD", TYPE_INST, FLAGS_NONE, 0}}, // / 1 {OPD(0xD9, 0xC8), 8, X86InstInfo{"FXCH", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xD9, 0xD0), 1, X86InstInfo{"FNOP", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xD1), 7, X86InstInfo{"", 
TYPE_INVALID, FLAGS_NONE, 0}}, // / 3 {OPD(0xD9, 0xD8), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 4 {OPD(0xD9, 0xE0), 1, X86InstInfo{"FCHS", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE1), 1, X86InstInfo{"FABS", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE2), 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE4), 1, X86InstInfo{"FTST", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE5), 1, X86InstInfo{"FXAM", TYPE_INST, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE6), 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 5 {OPD(0xD9, 0xE8), 1, X86InstInfo{"FLD1", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xE9), 1, X86InstInfo{"FLDL2T", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xEA), 1, X86InstInfo{"FLDL2E", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xEB), 1, X86InstInfo{"FLDPI", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xEC), 1, X86InstInfo{"FLDLG2", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xED), 1, X86InstInfo{"FLDLN2", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xEE), 1, X86InstInfo{"FLDZ", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xEF), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 6 {OPD(0xD9, 0xF0), 1, X86InstInfo{"F2XM1", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF1), 1, X86InstInfo{"FYL2X", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF2), 1, X86InstInfo{"FPTAN", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF3), 1, X86InstInfo{"FPATAN", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF4), 1, X86InstInfo{"FXTRACT", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF5), 1, X86InstInfo{"FPREM1", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF6), 1, X86InstInfo{"FDECSTP", TYPE_X87, FLAGS_POP, 0}}, {OPD(0xD9, 0xF7), 1, X86InstInfo{"FINCSTP", TYPE_X87, FLAGS_POP, 0}}, // / 7 {OPD(0xD9, 0xF8), 1, X86InstInfo{"FPREM", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xF9), 1, X86InstInfo{"FYL2XP1", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xFA), 1, X86InstInfo{"FSQRT", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xFB), 1, X86InstInfo{"FSINCOS", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xFC), 1, X86InstInfo{"FRNDINT", TYPE_X87, FLAGS_NONE, 
0}}, {OPD(0xD9, 0xFD), 1, X86InstInfo{"FSCALE", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xFE), 1, X86InstInfo{"FSIN", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xD9, 0xFF), 1, X86InstInfo{"FCOS", TYPE_X87, FLAGS_NONE, 0}}, // 0xDA {OPDReg(0xDA, 0), 1, X86InstInfo{"FIADD", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 1), 1, X86InstInfo{"FIMUL", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 2), 1, X86InstInfo{"FICOM", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 3), 1, X86InstInfo{"FICOMP", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_POP, 0}}, {OPDReg(0xDA, 4), 1, X86InstInfo{"FISUB", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 5), 1, X86InstInfo{"FISUBR", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 6), 1, X86InstInfo{"FIDIV", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDA, 7), 1, X86InstInfo{"FIDIVR", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, // / 0 {OPD(0xDA, 0xC0), 8, X86InstInfo{"FCMOVB", TYPE_X87, FLAGS_NONE, 0}}, // / 1 {OPD(0xDA, 0xC8), 8, X86InstInfo{"FCMOVE", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xDA, 0xD0), 8, X86InstInfo{"FCMOVBE", TYPE_X87, FLAGS_NONE, 0}}, // / 3 {OPD(0xDA, 0xD8), 8, X86InstInfo{"FCMOVU", TYPE_X87, FLAGS_NONE, 0}}, // / 4 {OPD(0xDA, 0xE0), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 5 {OPD(0xDA, 0xE8), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(0xDA, 0xE9), 1, X86InstInfo{"FUCOMPP", TYPE_X87, FLAGS_POP, 0}}, {OPD(0xDA, 0xEA), 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 6 {OPD(0xDA, 0xF0), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 7 {OPD(0xDA, 0xF8), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // 0xDB {OPDReg(0xDB, 0), 1, X86InstInfo{"FILD", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDB, 1), 1, X86InstInfo{"FISTTP", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM | 
FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDB, 2), 1, X86InstInfo{"FIST", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xDB, 3), 1, X86InstInfo{"FISTP", TYPE_X87, GenFlagsSrcSize(SIZE_32BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDB, 4), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPDReg(0xDB, 5), 1, X86InstInfo{"FLD", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xDB, 6), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPDReg(0xDB, 7), 1, X86InstInfo{"FSTP", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, // / 0 {OPD(0xDB, 0xC0), 8, X86InstInfo{"FCMOVNB", TYPE_X87, FLAGS_NONE, 0}}, // / 1 {OPD(0xDB, 0xC8), 8, X86InstInfo{"FCMOVNE", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xDB, 0xD0), 8, X86InstInfo{"FCMOVNBE", TYPE_X87, FLAGS_NONE, 0}}, // / 3 {OPD(0xDB, 0xD8), 8, X86InstInfo{"FCMOVNU", TYPE_X87, FLAGS_NONE, 0}}, // / 4 {OPD(0xDB, 0xE0), 2, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(0xDB, 0xE2), 1, X86InstInfo{"FNCLEX", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xDB, 0xE3), 1, X86InstInfo{"FNINIT", TYPE_X87, FLAGS_NONE, 0}}, {OPD(0xDB, 0xE4), 4, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 5 {OPD(0xDB, 0xE8), 8, X86InstInfo{"FUCOMI", TYPE_INST, FLAGS_NONE, 0}}, // / 6 {OPD(0xDB, 0xF0), 8, X86InstInfo{"FCOMI", TYPE_X87, FLAGS_NONE, 0}}, // / 7 {OPD(0xDB, 0xF8), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // 0xDC {OPDReg(0xDC, 0), 1, X86InstInfo{"FADD", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 1), 1, X86InstInfo{"FMUL", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 2), 1, X86InstInfo{"FCOM", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 3), 1, X86InstInfo{"FCOMP", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS | FLAGS_POP, 0}}, {OPDReg(0xDC, 4), 1, X86InstInfo{"FSUB", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | 
FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 5), 1, X86InstInfo{"FSUBR", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 6), 1, X86InstInfo{"FDIV", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDC, 7), 1, X86InstInfo{"FDIVR", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, // / 0 {OPD(0xDC, 0xC0), 8, X86InstInfo{"FADD", TYPE_X87, FLAGS_NONE, 0}}, // / 1 {OPD(0xDC, 0xC8), 8, X86InstInfo{"FMUL", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xDC, 0xD0), 8, X86InstInfo{"FCOM", TYPE_X87, FLAGS_X87_FLAGS, 0}}, // / 3 {OPD(0xDC, 0xD8), 8, X86InstInfo{"FCOMP", TYPE_X87, FLAGS_X87_FLAGS | FLAGS_POP, 0}}, // / 4 {OPD(0xDC, 0xE0), 8, X86InstInfo{"FSUBR", TYPE_X87, FLAGS_NONE, 0}}, // / 5 {OPD(0xDC, 0xE8), 8, X86InstInfo{"FSUB", TYPE_X87, FLAGS_NONE, 0}}, // / 6 {OPD(0xDC, 0xF0), 8, X86InstInfo{"FDIVR", TYPE_X87, FLAGS_NONE, 0}}, // / 7 {OPD(0xDC, 0xF8), 8, X86InstInfo{"FDIV", TYPE_X87, FLAGS_NONE, 0}}, // 0xDD {OPDReg(0xDD, 0), 1, X86InstInfo{"FLD", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xDD, 1), 1, X86InstInfo{"FISTTP", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDD, 2), 1, X86InstInfo{"FST", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xDD, 3), 1, X86InstInfo{"FSTP", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDD, 4), 1, X86InstInfo{"FRSTOR", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xDD, 5), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPDReg(0xDD, 6), 1, X86InstInfo{"FNSAVE", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xDD, 7), 1, X86InstInfo{"FNSTSW", TYPE_X87, GenFlagsSameSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, // / 0 {OPD(0xDD, 0xC0), 8, X86InstInfo{"FFREE", TYPE_X87, FLAGS_NONE, 0}}, // / 1 {OPD(0xDD, 0xC8), 8, X86InstInfo{"FXCH", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xDD, 0xD0), 8, X86InstInfo{"FST", TYPE_INST, FLAGS_SF_MOD_DST, 0}}, // / 
3 {OPD(0xDD, 0xD8), 8, X86InstInfo{"FSTP", TYPE_X87, FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, // / 4 {OPD(0xDD, 0xE0), 8, X86InstInfo{"FUCOM", TYPE_X87, FLAGS_NONE, 0}}, // / 5 {OPD(0xDD, 0xE8), 8, X86InstInfo{"FUCOMP", TYPE_X87, FLAGS_POP, 0}}, // / 6 {OPD(0xDD, 0xF0), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 7 {OPD(0xDD, 0xF8), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // 0xDE {OPDReg(0xDE, 0), 1, X86InstInfo{"FIADD", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 1), 1, X86InstInfo{"FIMUL", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 2), 1, X86InstInfo{"FICOM", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 3), 1, X86InstInfo{"FICOMP", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_POP, 0}}, {OPDReg(0xDE, 4), 1, X86InstInfo{"FISUB", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 5), 1, X86InstInfo{"FISUBR", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 6), 1, X86InstInfo{"FIDIV", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDE, 7), 1, X86InstInfo{"FIDIVR", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, // / 0 {OPD(0xDE, 0xC0), 8, X86InstInfo{"FADDP", TYPE_X87, FLAGS_POP, 0}}, // / 1 {OPD(0xDE, 0xC8), 8, X86InstInfo{"FMULP", TYPE_X87, FLAGS_POP, 0}}, // / 2 {OPD(0xDE, 0xD0), 8, X86InstInfo{"FCOMP", TYPE_X87, FLAGS_X87_FLAGS | FLAGS_POP, 0}}, // / 3 {OPD(0xDE, 0xD8), 1, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, {OPD(0xDE, 0xD9), 1, X86InstInfo{"FCOMPP", TYPE_X87, FLAGS_POP, 0}}, {OPD(0xDE, 0xDA), 6, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 4 {OPD(0xDE, 0xE0), 8, X86InstInfo{"FSUBRP", TYPE_X87, FLAGS_POP, 0}}, // / 5 {OPD(0xDE, 0xE8), 8, X86InstInfo{"FSUBP", TYPE_X87, FLAGS_POP, 0}}, // / 6 {OPD(0xDE, 0xF0), 8, X86InstInfo{"FDIVRP", TYPE_X87, FLAGS_POP, 0}}, // / 7 {OPD(0xDE, 0xF8), 8, X86InstInfo{"FDIVP", TYPE_X87, FLAGS_POP, 0}}, // 0xDF {OPDReg(0xDF, 0), 1, 
X86InstInfo{"FILD", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM, 0}}, {OPDReg(0xDF, 1), 1, X86InstInfo{"FISTTP", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDF, 2), 1, X86InstInfo{"FIST", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST, 0}}, {OPDReg(0xDF, 3), 1, X86InstInfo{"FISTP", TYPE_X87, GenFlagsSrcSize(SIZE_16BIT) | FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDF, 4), 1, X86InstInfo{"FBLD", TYPE_X87, FLAGS_MODRM, 0}}, {OPDReg(0xDF, 5), 1, X86InstInfo{"FILD", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS, 0}}, {OPDReg(0xDF, 6), 1, X86InstInfo{"FBSTP", TYPE_X87, FLAGS_MODRM | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, {OPDReg(0xDF, 7), 1, X86InstInfo{"FISTP", TYPE_X87, GenFlagsSrcSize(SIZE_64BIT) | FLAGS_MODRM | FLAGS_X87_FLAGS | FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, // / 0 // This instruction is a bit special. This is an undocumented(Almost) x87 instruction. // https://en.wikipedia.org/wiki/X86_instruction_listings#Undocumented_x87_instructions // https://www.pagetable.com/?p=16 // AMD Athlon Processor x86 Code Optimization Guide - `Use FFREEP Macro to Pop One Register from the FPU Stack` // ISA architecture manuals don't talk about this instruction at all // At some point the Nvidia OpenGL binary driver uses this instruction. // GCC may also end up emitting this instruction in some rare edge case! 
// Almost all x86 CPUs implement this, and it is expected to be around {OPD(0xDF, 0xC0), 8, X86InstInfo{"FFREEP", TYPE_X87, FLAGS_POP, 0}}, // / 1 {OPD(0xDF, 0xC8), 8, X86InstInfo{"FXCH", TYPE_X87, FLAGS_NONE, 0}}, // / 2 {OPD(0xDF, 0xD0), 8, X86InstInfo{"FSTP", TYPE_X87, FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, // / 3 {OPD(0xDF, 0xD8), 8, X86InstInfo{"FSTP", TYPE_X87, FLAGS_SF_MOD_DST | FLAGS_POP, 0}}, // / 4 {OPD(0xDF, 0xE0), 1, X86InstInfo{"FNSTSW", TYPE_INST, GenFlagsSameSize(SIZE_16BIT) | FLAGS_SF_DST_RAX, 0}}, {OPD(0xDF, 0xE1), 7, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, // / 5 {OPD(0xDF, 0xE8), 8, X86InstInfo{"FUCOMIP", TYPE_INST, FLAGS_POP, 0}}, // / 6 {OPD(0xDF, 0xF0), 8, X86InstInfo{"FCOMIP", TYPE_X87, FLAGS_POP, 0}}, // / 7 {OPD(0xDF, 0xF8), 8, X86InstInfo{"", TYPE_INVALID, FLAGS_NONE, 0}}, }; #undef OPD #undef OPDReg auto InstallToX87Table = [](auto& FinalTable, auto& LocalTable) { for (auto Op : LocalTable) { auto OpNum = Op.Op; bool Repeat = (OpNum & 0x8000) != 0; OpNum = OpNum & 0x7FF; auto Dispatcher = Op.Ptr; for (uint8_t i = 0; i < Op.Count; ++i) { LOGMAN_THROW_A_FMT(FinalTable[OpNum + i].OpcodeDispatcher.OpDispatch == nullptr, "Duplicate Entry"); FinalTable[OpNum + i].OpcodeDispatcher.OpDispatch = Dispatcher; // Flag to indicate if we need to repeat this op in {0x40, 0x80} ranges if (Repeat) { FinalTable[(OpNum | 0x40) + i].OpcodeDispatcher.OpDispatch = Dispatcher; FinalTable[(OpNum | 0x80) + i].OpcodeDispatcher.OpDispatch = Dispatcher; } } } }; GenerateX87Table(Table.data(), X87OpTable, std::size(X87OpTable)); InstallToX87Table(Table, DispatchTable); return Table; }; constexpr std::array X87F80Ops = GenerateX87TableLambda(X87F80OpTable); constexpr std::array X87F64Ops = GenerateX87TableLambda(X87F64OpTable); } ================================================ FILE: FEXCore/Source/Interface/GDBJIT/GDBJIT.cpp ================================================ // SPDX-License-Identifier: MIT #include "GDBJIT.h" #include #include #include #include #if 
defined(GDB_SYMBOLS_ENABLED) #include extern "C" { enum jit_actions_t { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN }; struct jit_code_entry { jit_code_entry* next_entry; jit_code_entry* prev_entry; const char* symfile_addr; uint64_t symfile_size; }; struct jit_descriptor { uint32_t version; /* This type should be jit_actions_t, but we use uint32_t to be explicit about the bitwidth. */ uint32_t action_flag; jit_code_entry* relevant_entry; jit_code_entry* first_entry; }; /* Make sure to specify the version statically, because the debugger may check the version before we can set it. */ constinit jit_descriptor __jit_debug_descriptor = {.version = 1}; /* GDB puts a breakpoint in this function. */ void __attribute__((noinline)) __jit_debug_register_code() { asm volatile("" ::"r"(&__jit_debug_descriptor)); }; } namespace FEXCore { void GDBJITRegister(const FEXCore::ExecutableFileInfo& Entry, uintptr_t VAFileStart, uint64_t GuestRIP, uintptr_t HostEntry, FEXCore::Core::DebugData& DebugData) { auto map = Entry.SourcecodeMap.get(); if (map) { auto FileOffset = GuestRIP - VAFileStart; auto Sym = map->FindSymbolMapping(FileOffset); auto SymName = HLE::SourcecodeSymbolMapping::SymName(Sym, Entry.Filename, HostEntry, FileOffset); fextl::vector Lines; for (const auto& GuestOpcode : DebugData.GuestOpcodes) { auto Line = map->FindLineMapping(GuestRIP + GuestOpcode.GuestEntryOffset - VAFileStart); if (Line) { Lines.push_back({Line->LineNumber, HostEntry + GuestOpcode.HostEntryOffset}); } } size_t size = sizeof(info_t) + 1 * sizeof(blocks_t) + Lines.size() * sizeof(gdb_line_mapping); auto mem = (uint8_t*)malloc(size); auto base = mem; info_t* info = (info_t*)mem; mem += sizeof(info_t); strncpy(info->filename, map->SourceFile.c_str(), 511); info->nblocks = 1; auto blocks = (blocks_t*)mem; info->blocks_ofs = mem - base; mem += info->nblocks * sizeof(blocks_t); for (int i = 0; i < info->nblocks; i++) { strncpy(blocks[i].name, SymName.c_str(), 511); blocks[i].start = 
HostEntry; blocks[i].end = HostEntry + DebugData.HostCodeSize; } info->nlines = Lines.size(); auto lines = (gdb_line_mapping*)mem; info->lines_ofs = mem - base; mem += info->nlines * sizeof(gdb_line_mapping); if (info->nlines) { memcpy(lines, Lines.data(), info->nlines * sizeof(gdb_line_mapping)); } auto entry = new jit_code_entry {0, 0, 0, 0}; entry->symfile_addr = (const char*)info; entry->symfile_size = size; if (__jit_debug_descriptor.first_entry) { __jit_debug_descriptor.relevant_entry->next_entry = entry; entry->prev_entry = __jit_debug_descriptor.relevant_entry; } else { __jit_debug_descriptor.first_entry = entry; } __jit_debug_descriptor.relevant_entry = entry; __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; __jit_debug_register_code(); } } } // namespace FEXCore #else namespace FEXCore { void GDBJITRegister(const FEXCore::ExecutableFileInfo&, uintptr_t, uint64_t, uintptr_t, FEXCore::Core::DebugData&) { ERROR_AND_DIE_FMT("GDBSymbols support not compiled in"); } } // namespace FEXCore #endif ================================================ FILE: FEXCore/Source/Interface/GDBJIT/GDBJIT.h ================================================ // SPDX-License-Identifier: MIT #include #include namespace FEXCore { void GDBJITRegister(const FEXCore::ExecutableFileInfo&, uintptr_t VAFileStart, uint64_t GuestRIP, uintptr_t HostEntry, FEXCore::Core::DebugData&); } ================================================ FILE: FEXCore/Source/Interface/IR/IR.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore::IR { class OrderedNode; /** * @brief The IROp_Header is an dynamically sized array * At the end it contains a uint8_t for the number of arguments that Op has * Then there is an unsized array of NodeWrapper arguments for the number of arguments this op has * The op structures that are including 
the header must ensure that they pad themselves correctly to the number of arguments used
 */
struct IROp_Header;

/**
 * @brief Represents the ID of a given IR node.
 *
 * Intended to provide strong typing from other integer values
 * to prevent passing incorrect values to certain API functions.
 *
 * The value zero is reserved as the invalid ID; a default-constructed NodeID holds zero.
 */
struct NodeID final {
  using value_type = uint32_t;

  constexpr NodeID() noexcept = default;
  // Explicit so that a plain integer never converts to an ID silently.
  constexpr explicit NodeID(value_type Value_) noexcept
    : Value {Value_} {}

  constexpr NodeID(const NodeID&) noexcept = default;
  constexpr NodeID& operator=(const NodeID&) noexcept = default;

  constexpr NodeID(NodeID&&) noexcept = default;
  constexpr NodeID& operator=(NodeID&&) noexcept = default;

  // True for every value except zero.
  [[nodiscard]]
  constexpr bool IsValid() const noexcept {
    return Value != 0;
  }
  [[nodiscard]]
  constexpr bool IsInvalid() const noexcept {
    return !IsValid();
  }
  // Resets the ID to the reserved invalid value.
  constexpr void Invalidate() noexcept {
    Value = 0;
  }

  // Member-wise comparison, i.e. IDs order and compare by their raw value.
  [[nodiscard]]
  constexpr auto operator<=>(const NodeID&) const noexcept = default;

  // Stream helpers read and write the raw integer value.
  friend std::ostream& operator<<(std::ostream& out, NodeID ID) {
    out << ID.Value;
    return out;
  }
  friend std::istream& operator>>(std::istream& in, NodeID& ID) {
    in >> ID.Value;
    return in;
  }

  value_type Value {};
};

/**
 * @brief This is a very simple wrapper for our node pointers
 * You probably don't want to use this directly
 * Use OpNodeWrapper and OrderedNodeWrapper types below instead
 *
 * This is necessary to allow a few things
 *  - Reduce memory usage by having the pointer be a 32bit offset rather than the whole 64bit pointer
 *  - Actually use an offset from a base so we aren't storing pointers for everything
 *  - Makes IR list copying be as cheap as a memcpy
 * Downsides
 *  - The IR nodes have to be allocated out of a linear array of memory
 *  - The offset is 32bit and its top two bits double as the immediate/kill flags (see IsImmediate() and
 *    HasKill()), so a node has to sit less than 1GiB away from the base
 *  - We have to have the base offset live somewhere else
 *  - Has to be POD and trivially copyable
 *  - Makes every real node access turn in to a [Base + Offset] access
 *  - Can be confusing if you're mixing
OpNodeWrapper and OrderedNodeWrapper usage
 */
template
struct FEX_PACKED NodeWrapperBase final {
  // 32bit or 64bit offset doesn't matter for addressing.
  // We use uint32_t to be more memory efficient (Cuts our node list size in half)
  using NodeOffsetType = uint32_t;

  // Raw encoded word. Zero means "invalid", bit 31 marks an immediate, bit 30 is the kill flag;
  // with both flag bits clear the value is a byte offset from the list base.
  NodeOffsetType NodeOffset;

  explicit NodeWrapperBase() = default;

  // Wraps an already encoded word verbatim; nothing about the value is checked.
  [[nodiscard]]
  static NodeWrapperBase WrapOffset(NodeOffsetType Offset) {
    NodeWrapperBase Wrapped;
    Wrapped.NodeOffset = Offset;
    return Wrapped;
  }

  // Wraps the address Value as an offset from Base (see SetOffset for the range check).
  [[nodiscard]]
  static NodeWrapperBase WrapPtr(uintptr_t Base, uintptr_t Value) {
    NodeWrapperBase Wrapped;
    Wrapped.SetOffset(Base, Value);
    return Wrapped;
  }

  // Type-erased variant of GetNode().
  [[nodiscard]]
  static void* UnwrapNode(uintptr_t Base, NodeWrapperBase Node) {
    return Node.GetNode(Base);
  }

  // Declared here only; the definition is not part of this section of the file.
  [[nodiscard]]
  NodeID ID() const;

  [[nodiscard]]
  bool IsInvalid() const {
    return NodeOffset == 0;
  }

  [[nodiscard]]
  bool IsImmediate() const {
    return NodeOffset & (1u << 31);
  }

  [[nodiscard]]
  bool HasKill() const {
    return NodeOffset & (1u << 30);
  }

  void ClearKill() {
    NodeOffset &= ~(1u << 30);
  }

  void SetKill() {
    NodeOffset |= (1u << 30);
  }

  // A wrapper can only be dereferenced while neither flag bit is set, so a killed reference has to be
  // cleared with ClearKill() before GetNode() may be used on it.
  [[nodiscard]]
  bool IsPointer() const {
    return !IsImmediate() && !HasKill();
  }

  // Resolves the offset against Base. The precondition is only asserted, not handled.
  [[nodiscard]]
  Type* GetNode(uintptr_t Base) {
    LOGMAN_THROW_A_FMT(IsPointer(), "Precondition");
    return reinterpret_cast(Base + NodeOffset);
  }

  [[nodiscard]]
  const Type* GetNode(uintptr_t Base) const {
    LOGMAN_THROW_A_FMT(IsPointer(), "Precondition");
    return reinterpret_cast(Base + NodeOffset);
  }

  // Stores Value - Base truncated to 32 bits. The assertion requires both flag bits to be clear, which
  // limits usable offsets to below 1GiB (the message text says 2GiB).
  void SetOffset(uintptr_t Base, uintptr_t Value) {
    NodeOffset = Value - Base;
    LOGMAN_THROW_A_FMT(IsPointer(), "Offsets are within 2GiB range");
  }

  void SetInvalid() {
    NodeOffset = 0;
    LOGMAN_THROW_A_FMT(IsInvalid(), "Zero state");
  }

  // Encodes a 31-bit immediate by setting bit 31.
  // NOTE(review): an immediate with bit 30 set also makes HasKill() report true - confirm callers never
  // query the kill flag on immediates.
  void SetImmediate(uint32_t Immediate) {
    LOGMAN_THROW_A_FMT(Immediate < (1u << 31), "Bounded");
    NodeOffset = Immediate | (1u << 31);
    LOGMAN_THROW_A_FMT(IsImmediate(), "Encoded above");
  }

  // Returns the immediate with the marker bit stripped again.
  [[nodiscard]]
  uint32_t GetImmediate() const {
    LOGMAN_THROW_A_FMT(IsImmediate(), "Precondition: must be an immediate");
    return NodeOffset & ~(1u << 31);
  }

  // Compares the raw encoded words, flag bits included.
  [[nodiscard]]
  friend constexpr bool operator==(const NodeWrapperBase&, const NodeWrapperBase&) = default;

  [[nodiscard]]
  static NodeWrapperBase FromImmediate(uint32_t Immediate) {
    NodeWrapperBase A;
    A.SetImmediate(Immediate);
    return A;
  }
};

static_assert(std::is_trivially_copyable_v>);

static_assert(sizeof(NodeWrapperBase) == sizeof(uint32_t));

using OpNodeWrapper = NodeWrapperBase;
using OrderedNodeWrapper = NodeWrapperBase;

// The three link words every OrderedNode starts with: the op it represents plus its list neighbours.
struct OrderedNodeHeader {
  OpNodeWrapper Value;
  OrderedNodeWrapper Next;
  OrderedNodeWrapper Previous;
};

static_assert(sizeof(OrderedNodeHeader) == sizeof(uint32_t) * 3);

/**
 * @brief This is a node in our IR representation
 * Is a doubly linked list node that lives in a representation of a linearly allocated node list
 * The links in the nodes can live in a list independent of the IR data
 *
 * ex.
 *  Region1 : ... <-> OrderedNode <-> OrderedNode <-> ...
 *                         |               |
 *                         | Header.Value  | Header.Value
 *                         v               v
 *  Region2 :  ...       IROp    ...     IROp    ...
 *
 * In this example the OrderedNodes are allocated in one linear memory region (Not necessarily contiguous with one another linking)
 * The second region is contiguous but they don't have any relationship with one another directly
 */
class OrderedNode final {
public:
  // These three values are laid out very specifically to make it fast to access the NodeWrappers specifically
  OrderedNodeHeader Header;
  // Use counter, maintained through AddUse()/RemoveUse().
  uint32_t NumUses;

  // After RA, the register allocated for the node. This is the register for the
  // node at the time it is written, even if it is shuffled into other registers
  // later. In other words, it is the register destination of the instruction
  // represented by this OrderedNode.
  //
  // This is the raw value of a PhysicalRegister data structure.
  uint8_t Reg;

  // Keeps the node at header + two 32-bit words, as the static_assert after the class requires.
  uint8_t Pad[3];

  using value_type = OrderedNodeWrapper;

  OrderedNode() = default;

  /**
   * @brief Appends a node to this current node
   *
   * Before. Prev <-> Current <-> Next
   * After.  Prev <-> Current <-> Node <-> Next
   *
   * The successor's back link is rewritten first, while Header.Next still names the old successor.
   *
   * @return Pointer to the node being added
   */
  value_type append(uintptr_t Base, value_type Node) {
    // Set Next Node's Previous to incoming node
    SetPrevious(Base, Header.Next, Node);

    // Set Incoming node's links to this node's links
    SetPrevious(Base, Node, Wrapped(Base));
    SetNext(Base, Node, Header.Next);

    // Set this node's next to the incoming node
    SetNext(Base, Wrapped(Base), Node);

    // Return the node we are appending
    return Node;
  }

  // Same splice as above for callers that hold a raw pointer instead of a wrapper.
  OrderedNode* append(uintptr_t Base, OrderedNode* Node) {
    value_type WNode = Node->Wrapped(Base);
    // Set Next Node's Previous to incoming node
    SetPrevious(Base, Header.Next, WNode);

    // Set Incoming node's links to this node's links
    SetPrevious(Base, WNode, Wrapped(Base));
    SetNext(Base, WNode, Header.Next);

    // Set this node's next to the incoming node
    SetNext(Base, Wrapped(Base), WNode);

    // Return the node we are appending
    return Node;
  }

  /**
   * @brief Prepends a node to the current node
   *
   * Before. Prev <-> Current <-> Next
   * After.  Prev <-> Node <-> Current <-> Next
   *
   * The predecessor's forward link is rewritten first, while Header.Previous still names the old predecessor.
   *
   * @return Pointer to the node being added
   */
  value_type prepend(uintptr_t Base, value_type Node) {
    // Set the previous node's next to the incoming node
    SetNext(Base, Header.Previous, Node);

    // Set the incoming node's links
    SetPrevious(Base, Node, Header.Previous);
    SetNext(Base, Node, Wrapped(Base));

    // Set the current node's link
    SetPrevious(Base, Wrapped(Base), Node);

    // Return the node we are prepending
    return Node;
  }

  // Same splice as above for callers that hold a raw pointer instead of a wrapper.
  OrderedNode* prepend(uintptr_t Base, OrderedNode* Node) {
    value_type WNode = Node->Wrapped(Base);
    // Set the previous node's next to the incoming node
    SetNext(Base, Header.Previous, WNode);

    // Set the incoming node's links
    SetPrevious(Base, WNode, Header.Previous);
    SetNext(Base, WNode, Wrapped(Base));

    // Set the current node's link
    SetPrevious(Base, Wrapped(Base), WNode);

    // Return the node we are prepending
    return Node;
  }

  /**
   * @brief Gets the remaining size of the blocks from this point onward
   *
   * Counts this node plus every node reachable through Next until a zero offset is reached.
   * Doesn't find the head of the list
   *
   */
  [[nodiscard]]
  size_t size(uintptr_t Base) const {
    size_t Size = 1;
    // Walk the list forward until we hit a sentinel
    value_type Current = Header.Next;
    while (Current.NodeOffset != 0) {
      ++Size;
      OrderedNode* RealNode = Current.GetNode(Base);
      Current = RealNode->Header.Next;
    }
    return Size;
  }

  void Unlink(uintptr_t Base) {
    // This removes the node from the list. Orphaning it
    // Before: Previous <-> Current <-> Next
    // After:  Previous <-> Next
    // This node's own Next/Previous words are left as they were.
    SetNext(Base, Header.Previous, Header.Next);
    SetPrevious(Base, Header.Next, Header.Previous);
  }

  // Resolves Header.Value against the IR data base (not the node list base).
  [[nodiscard]]
  const IROp_Header* Op(uintptr_t Base) const {
    return Header.Value.GetNode(Base);
  }
  [[nodiscard]]
  IROp_Header* Op(uintptr_t Base) {
    return Header.Value.GetNode(Base);
  }

  [[nodiscard]]
  uint32_t GetUses() const {
    return NumUses;
  }

  void AddUse() {
    ++NumUses;
  }
  // Plain decrement; nothing guards against going below zero.
  void RemoveUse() {
    --NumUses;
  }

  // Encodes this node's own address as an offset from Base.
  [[nodiscard]]
  value_type Wrapped(uintptr_t Base) const {
    value_type Tmp;
    Tmp.SetOffset(Base, reinterpret_cast(this));
    return Tmp;
  }

private:
  // Builds a wrapper from a raw encoded word.
  [[nodiscard]]
  value_type WrappedOffset(uint32_t Offset) const {
    value_type Tmp;
    Tmp.NodeOffset = Offset;
    return Tmp;
  }

  // Link helpers: resolve Node against Base and overwrite one of its neighbour words.
  static void SetPrevious(uintptr_t Base, value_type Node, value_type New) {
    OrderedNode* RealNode = Node.GetNode(Base);
    RealNode->Header.Previous = New;
  }

  static void SetNext(uintptr_t Base, value_type Node, value_type New) {
    OrderedNode* RealNode = Node.GetNode(Base);
    RealNode->Header.Next = New;
  }

  void SetUses(uint32_t Uses) {
    NumUses = Uses;
  }
};

static_assert(std::is_trivially_constructible_v);
static_assert(std::is_trivially_copyable_v);
static_assert(offsetof(OrderedNode, Header) == 0);
static_assert(sizeof(OrderedNode) == (sizeof(OrderedNodeHeader) + 2 * sizeof(uint32_t)));

// This is temporary. We are transitioning away from OrderedNode's in favour of
// flat Ref words. To ease porting, we have this typedef. Eventually OrderedNode
// will be removed and this typedef will be replaced by something like:
//
// struct Ref {
//   uint Flags : 1;
//   uint ID : 23;
//   uint Reg : 8;
// };
using Ref = OrderedNode*;

/* This iterator can be used to step through nodes.
* Due to how our IR is laid out, this can be used to either step
 * through the CodeBlocks or through the code within a single block.
 *
 * The iterator holds two bases: BaseList resolves node offsets, IRList resolves the op each node points at.
 */
class NodeIterator {
public:
  // What dereferencing yields: the list node together with the op header it refers to.
  struct value_type final {
    OrderedNode* Node;
    IROp_Header* Header;
  };

  using size_type = std::size_t;
  using difference_type = std::ptrdiff_t;
  using reference = value_type&;
  using const_reference = const value_type&;
  using pointer = value_type*;
  using const_pointer = const value_type*;
  using iterator = NodeIterator;
  using const_iterator = const NodeIterator;
  using reverse_iterator = iterator;
  using const_reverse_iterator = const_iterator;
  using iterator_category = std::bidirectional_iterator_tag;

  // Leaves Node default-initialised; only the two bases are set.
  NodeIterator(uintptr_t Base, uintptr_t IRBase)
    : BaseList {Base}
    , IRList {IRBase} {}
  explicit NodeIterator(uintptr_t Base, uintptr_t IRBase, OrderedNodeWrapper Ptr)
    : BaseList {Base}
    , IRList {IRBase}
    , Node {Ptr} {}

  // Iterators compare by node offset only; the bases are not taken into account.
  [[nodiscard]]
  bool operator==(const NodeIterator& rhs) const {
    return Node.NodeOffset == rhs.Node.NodeOffset;
  }

  [[nodiscard]]
  bool operator!=(const NodeIterator& rhs) const {
    return !operator==(rhs);
  }

  // Advances in place along the Next link. Note that a copy of the iterator is returned, not a reference.
  NodeIterator operator++() {
    OrderedNodeHeader* RealNode = reinterpret_cast(Node.GetNode(BaseList));
    Node = RealNode->Next;
    return *this;
  }

  // Steps back in place along the Previous link; also returns a copy.
  NodeIterator operator--() {
    OrderedNodeHeader* RealNode = reinterpret_cast(Node.GetNode(BaseList));
    Node = RealNode->Previous;
    return *this;
  }

  // Resolves the current node and its op header.
  [[nodiscard]]
  value_type operator*() {
    OrderedNode* RealNode = Node.GetNode(BaseList);
    return {RealNode, RealNode->Op(IRList)};
  }

  // Call syntax yields exactly what operator* does.
  [[nodiscard]]
  value_type operator()() {
    OrderedNode* RealNode = Node.GetNode(BaseList);
    return {RealNode, RealNode->Op(IRList)};
  }

  [[nodiscard]]
  NodeID ID() const {
    return Node.ID();
  }

  // An iterator with both bases zero and a default-initialised node reference.
  [[nodiscard]]
  static NodeIterator Invalid() {
    return NodeIterator(0, 0);
  }

protected:
  uintptr_t BaseList {};
  uintptr_t IRList {};
  OrderedNodeWrapper Node {};
};

// This must directly match bytes to the named opsize.
// Implicitly sized IR operations do math to get between sizes.
enum class OpSize : uint8_t { iUnsized = 0, i8Bit = 1, i16Bit = 2, i32Bit = 4, i64Bit = 8, f80Bit = 10, i128Bit = 16, i256Bit = 32, iInvalid = 0xFF, }; enum class FloatCompareOp : uint8_t { EQ = 0, LT, LE, UNO, NEQ, ORD, }; enum class ShiftType : uint8_t { LSL = 0, LSR, ASR, ROR, }; enum class BranchHint : uint8_t { None = 0, Call, Return, CheckTF }; // Converts a size stored as an integer in to an OpSize enum. // This is a nop operation and will be eliminated by the compiler. static inline OpSize SizeToOpSize(uint8_t Size) { switch (Size) { case 0: return OpSize::iUnsized; case 1: return OpSize::i8Bit; case 2: return OpSize::i16Bit; case 4: return OpSize::i32Bit; case 8: return OpSize::i64Bit; case 10: return OpSize::f80Bit; case 16: return OpSize::i128Bit; case 32: return OpSize::i256Bit; case 0xFF: return OpSize::iInvalid; default: FEX_UNREACHABLE; } } // This is a nop operation and will be eliminated by the compiler. static inline uint8_t OpSizeToSize(IR::OpSize Size) { switch (Size) { case OpSize::iUnsized: return 0; case OpSize::i8Bit: return 1; case OpSize::i16Bit: return 2; case OpSize::i32Bit: return 4; case OpSize::i64Bit: return 8; case OpSize::f80Bit: return 10; case OpSize::i128Bit: return 16; case OpSize::i256Bit: return 32; case OpSize::iInvalid: return 0xFF; default: FEX_UNREACHABLE; } } static inline uint16_t OpSizeAsBits(IR::OpSize Size) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::OpSizeToSize(Size) * 8u; } template requires (std::is_integral_v) static inline OpSize operator<<(IR::OpSize Size, T Shift) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::SizeToOpSize(IR::OpSizeToSize(Size) << Shift); } template requires (std::is_integral_v) static inline OpSize operator>>(IR::OpSize Size, T Shift) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::SizeToOpSize(IR::OpSizeToSize(Size) >> Shift); } static inline OpSize operator/(IR::OpSize Size, IR::OpSize 
Divisor) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::SizeToOpSize(IR::OpSizeToSize(Size) / IR::OpSizeToSize(Divisor)); } template requires (std::is_integral_v) static inline OpSize operator/(IR::OpSize Size, T Divisor) { LOGMAN_THROW_A_FMT(Size != IR::OpSize::iInvalid, "Invalid Size"); return IR::SizeToOpSize(IR::OpSizeToSize(Size) / Divisor); } static inline uint8_t NumElements(IR::OpSize RegisterSize, IR::OpSize ElementSize) { LOGMAN_THROW_A_FMT(RegisterSize != IR::OpSize::iInvalid && ElementSize != IR::OpSize::iInvalid && RegisterSize != IR::OpSize::iUnsized && ElementSize != IR::OpSize::iUnsized, "Invalid Size"); return IR::OpSizeToSize(RegisterSize) / IR::OpSizeToSize(ElementSize); } #define IROP_ENUM #define IROP_STRUCTS #define IROP_SIZES #define IROP_REG_CLASSES #include /* This iterator can be used to step though every single node in a multi-block in SSA order. * * Iterates in the order of: * * end <-- CodeBlockA <--> BlockAInst1 <--> BlockAInst2 <--> CodeBlockB <--> BlockBInst1 <--> BlockBInst2 --> end */ class AllNodesIterator : public NodeIterator { public: AllNodesIterator(uintptr_t Base, uintptr_t IRBase) : NodeIterator(Base, IRBase) {} explicit AllNodesIterator(uintptr_t Base, uintptr_t IRBase, OrderedNodeWrapper Ptr) : NodeIterator(Base, IRBase, Ptr) {} AllNodesIterator(NodeIterator other) : NodeIterator(other) {} // Allow NodeIterator to be upgraded AllNodesIterator operator++() { OrderedNodeHeader* RealNode = reinterpret_cast(Node.GetNode(BaseList)); auto IROp = Node.GetNode(BaseList)->Op(IRList); // If this is the last node of a codeblock, we need to continue to the next block if (IROp->Op == OP_ENDBLOCK) { auto EndBlock = IROp->C(); auto CurrentBlock = EndBlock->BlockHeader.GetNode(BaseList); Node = CurrentBlock->Header.Next; } else if (IROp->Op == OP_CODEBLOCK) { auto CodeBlock = IROp->C(); Node = CodeBlock->Begin; } else { Node = RealNode->Next; } return *this; } AllNodesIterator operator--() { auto IROp = 
Node.GetNode(BaseList)->Op(IRList); if (IROp->Op == OP_BEGINBLOCK) { auto BeginBlock = IROp->C(); Node = BeginBlock->BlockHeader; } else if (IROp->Op == OP_CODEBLOCK) { auto PrevBlockWrapper = Node.GetNode(BaseList)->Header.Previous; auto PrevCodeBlock = PrevBlockWrapper.GetNode(BaseList)->Op(IRList)->C(); Node = PrevCodeBlock->Last; } else { Node = Node.GetNode(BaseList)->Header.Previous; } return *this; } [[nodiscard]] static AllNodesIterator Invalid() { return AllNodesIterator(0, 0); } }; class IRListView; class IREmitter; template inline NodeID NodeWrapperBase::ID() const { return NodeID(NodeOffset / sizeof(IR::OrderedNode)); } [[nodiscard]] bool IsBlockExit(FEXCore::IR::IROps Op); void Dump(fextl::stringstream* out, const IRListView* IR); constexpr auto format_as(FEXCore::IR::NodeID ID) { return ID.Value; } FEX_DEFINE_ENUM_FMT_PASSTHROUGH(FEXCore::IR::FenceType) FEX_DEFINE_ENUM_FMT_PASSTHROUGH(FEXCore::IR::MemOffsetType) FEX_DEFINE_ENUM_FMT_PASSTHROUGH(FEXCore::IR::OpSize) FEX_DEFINE_ENUM_FMT_PASSTHROUGH(FEXCore::IR::RegClass) } // namespace FEXCore::IR template<> struct std::hash { size_t operator()(const FEXCore::IR::NodeID& ID) const noexcept { return std::hash {}(ID.Value); } }; ================================================ FILE: FEXCore/Source/Interface/IR/IR.json ================================================ { "Docs": [ "IRTypes define types that can be used directly in the IR.", "These will translate to the underlying C types when stored in the op data", "", "SSA types are special cased", " SSA = untyped", " GPR = GPR class type", " FPR = FPR class type", "Declaring the SSA types correctly will allow validation passes to ensure the op is getting passed correct arguments", "", "Arguments must always follow a particular order. 
:", "Type must always be an IRType", "Prefix currently can be one of the following: #, $", " #: This is a temporary argument that is in the IR Emitter arguments", " - This will not be stored in the resulting IR op data structure", " $: This is a value that will be stored inside of the IR op data structure", " - If it is type SSA, GPR, or FPR then it is an SSA type", " - These will get added to the SSA argument union to ensure RA happens", "", "IR op definition follows the structure of = ", "", "Eg:", "IR op with no result and no arguments", " CallbackReturn", "", "IR op with result and no arguments", " GPR = ProcessorID", "", "IR op with no result and non-SSA argument", " Fence FenceType:$Type", "", "IR op with no result and SSA arguments", " SetRoundingMode GPR:$Mode", "", "IR op with result and SSA arguments", " GPR = Add GPR:$Src1, GPR:$Src2", "", "## Op members ##", "* Desc", " * List of text for documenting this IR op.", "* OpClass", " * Textual class to group IR ops by type", "* DestClass", " * SSA class of the return when the return type is `SSA`", " * Not used if the destination type is one of {GPR, FPR}", "* DestSize", " * The size of the destination type", "* EmitValidation", " * List of validations to emit for the IR emitter", " * These are validations that can't be automatically inferred and need to be hand-written", "" ], "Enums": { "class CondClass : uint8_t": [ "EQ = 0,", "NEQ = 1,", "UGE = 2,", "ULT = 3,", "MI = 4,", "PL = 5,", "VS = 6,", "VC = 7,", "UGT = 8,", "ULE = 9,", "SGE = 10,", "SLT = 11,", "SGT = 12,", "SLE = 13,", "TSTZ = 14, /* bit test zero */", "TSTNZ = 15, /* bit test nonzero */", "", "FLU = 16, /* float less or unordered */", "FGE = 17, /* float greater or equal */", "FLEU = 18, /* float less or equal or unordered */", "FGT = 19, /* float greater */", "FU = 20, /* float unordered */", "FNU = 21, /* float not unordered */", "", "AL = 32, /* always */" ], "class FenceType : uint8_t": [ "Load = 0,", "Store = 1,", "LoadStore = 2,", "Inst 
= 3," ], "class MemOffsetType : uint8_t": [ "SXTX = 0,", "UXTW = 1,", "SXTW = 2," ], "class RegClass : uint32_t": [ "Invalid = 0,", "GPR = 1,", "GPRFixed = 2,", "FPR = 3,", "FPRFixed = 4,", "Complex = 5," ], "class RoundMode : uint8_t": [ "Nearest = 0,", "NegInfinity = 1,", "PosInfinity = 2,", "TowardsZero = 3, /* Truncate */", "Host = 4," ], "class ConstPad : uint8_t": [ "NoPad = 0,", "DoPad = 1,", "AutoPad = 2," ] }, "Defines": [ "constexpr uint8_t NumClasses {6}", "constexpr uint8_t FCMP_FLAG_EQ = 0", "constexpr uint8_t FCMP_FLAG_LT = 1", "constexpr uint8_t FCMP_FLAG_UNORDERED = 2", "struct BreakDefinition {", " uint16_t ErrorRegister;", " uint8_t Signal;", " uint8_t TrapNumber;", " uint8_t si_code;", "};" ], "IRTypes" : { "i1": "bool", "i8": "int8_t", "i16": "int16_t", "i32": "int32_t", "i64": "int64_t", "u8": "uint8_t", "u16": "uint16_t", "u32": "uint32_t", "u64": "uint64_t", "OpSize": "FEXCore::IR::OpSize", "SSA": "OrderedNode*", "GPR": "OrderedNode*", "FPR": "OrderedNode*", "FenceType": "FenceType", "RegisterClass": "RegClass", "CondClass": "CondClass", "SHA256Sum": "SHA256Sum", "MemOffsetType": "MemOffsetType", "BreakDefinition": "BreakDefinition", "RoundType": "RoundMode", "ConstPad": "ConstPad", "FloatCompareOp": "FloatCompareOp", "NamedVectorConstant": "FEXCore::IR::NamedVectorConstant", "IndexNamedVectorConstant": "FEXCore::IR::IndexNamedVectorConstant", "ShiftType": "FEXCore::IR::ShiftType", "BranchHint": "FEXCore::IR::BranchHint", "Array16": "std::array" }, "Ops": { "Misc": { "Dummy": { "HasSideEffects": true, "SwitchGen": false, "JITDispatchOverride": "NoOp" }, "IRHeader SSA:$Blocks, u64:$OriginalRIP, u32:$BlockCount, u32:$NumHostInstructions, u32:$SpillSlots, i1:$PostRA{false}, i1:$HasX87{false}, i1:$ReadsParity{false}": { "SwitchGen": false, "JITDispatchOverride": "NoOp" }, "CodeBlock SSA:$Begin, SSA:$Last, u32:$ID, i1:$EntryPoint{false}, u32:$GuestEntryOffset{0}": { "SwitchGen": false, "RAOverride": "0", "JITDispatchOverride": "NoOp" }, 
"BeginBlock SSA:$BlockHeader": { "HasSideEffects": true, "SwitchGen": false, "RAOverride": "0", "JITDispatchOverride": "NoOp" }, "InvalidateFlags u64:$Flags": { "HasSideEffects": true, "JITDispatchOverride": "NoOp" }, "EndBlock SSA:$BlockHeader": { "HasSideEffects": true, "SwitchGen": false, "RAOverride": "0", "JITDispatchOverride": "NoOp" }, "GuestOpcode u32:$GuestEntryOffset": { "Desc": ["Marks the beginning of a guest opcode"], "HasSideEffects": true }, "GPR = ValidateCode Array16:$CodeOriginal, GPR:$Address, u8:$CodeLength": { "HasSideEffects": true, "HasDest": true, "DestSize": "OpSize::i64Bit" }, "ThreadRemoveCodeEntry": { "HasSideEffects": true }, "GPR = ProcessorID": { "Desc": ["Returns the processor ID correlating to the current running CPU", "This may be out of date by time this instruction is executed so care must be taken", "This same information can be gotten from syscall getcpu(&cpu, &node)", "uint32_t Res = (node << 12) | cpu;", "This means it has a limitation of 4096 CPU cores. 
Which is fine and matches x86 behaviour" ], "DestSize": "OpSize::i64Bit" }, "GPR = GetRoundingMode": { "Desc": ["Gets the current rounding mode options" ], "DestSize": "OpSize::i32Bit" }, "SetRoundingMode GPR:$RoundMode, i1:$SetDAZ, GPR:$MXCSR": { "Desc": ["Sets the current rounding mode options for the thread" ], "HasSideEffects": true }, "GPR = PushRoundingMode u8:$RoundMode": { "Desc": ["Override the current rounding mode options for the thread, returning old FPCR" ], "DestSize": "OpSize::i64Bit", "HasSideEffects": true }, "PopRoundingMode GPR:$FPCR": { "Desc": ["Resets rounding mode after PushRoundingMode operation" ], "HasSideEffects": true }, "Print SSA:$Value": { "HasSideEffects": true, "Desc": ["Debug operation that prints an SSA value to the console", "May only print 64bits of the value"] }, "GPR = AllocateGPR i1:$ForPair": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", "Note: if an instruction uses allocated destinations-as-sources,", "it cannot use a regular destination too. 
This ensures RA correctness.", "This is a kludge to deal with the IR's lack of multiple destinations", "If ForPair is set, RA will try to allocate the base of a register pair"], "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = AllocateFPR OpSize:#RegisterSize, OpSize:#ElementSize": { "Desc": ["Like AllocateGPR, but for FPR"], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "JITDispatch": false }, "GPR = AllocateGPRAfter GPR:$After": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", "This is a kludge to deal with the IR's lack of multiple destinations", "RA will attempt to allocate to the register after $After.", "It may not succeed."], "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "GPR = RDRAND i1:$GetReseeded": { "Desc": ["Uses the hardware random number generator to generate a 64bit number", "The boolean argument asks if we should be reading the reseeded number or not", "Reseeded RNG calculation is more expensive and will be heavier to use", "Returns the 64-bit number", "Sets the Z flag if the number is valid.", "RNG hardware is allowed to fail early and return. Software must always check this" ], "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "Yield": { "HasSideEffects": true, "Desc": ["This is a hint instruction that the CPU is likely to do a spin so it might want to pause to help out SMP", "Can be implemented as a NOP if necessary"] }, "WFET GPR:$Upper, GPR:$Lower": { "HasSideEffects": true, "Desc": [ "Implement a low power wait attempting to sleep until RDTSC >= Upper:Lower.", "Will spuriously wake up." ] }, "MonoBackpatcherWrite OpSize:$Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": [ "Writes and invalidates the target address with the invalidation mutex locked. This is a fault-avoiding", "replacement for the atomic SMC writes used in the mono callsite backpatcher." 
], "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] } }, "Branch": { "Jump SSA:$TargetBlock": { "HasSideEffects": true, "RAOverride": "0" }, "CondJump SSA:$Cmp1, SSA:$Cmp2, SSA:$TrueBlock, SSA:$FalseBlock, CondClass:$Cond{CondClass::NEQ}, OpSize:$CompareSize{OpSize::iInvalid}, i1:$FromNZCV{false}": { "Inline": ["", "AddSub"], "HasSideEffects": true, "RAOverride": "2" }, "ExitFunction OpSize:#Size, GPR:$NewRIP, BranchHint:$Hint, GPR:$CallReturnAddress, SSA:$CallReturnBlock": { "Desc": ["Exits the current JIT function with a target RIP" ], "Inline": ["Any"], "HasSideEffects": true, "DestSize": "Size", "RAOverride": "2" }, "Break BreakDefinition:$Reason": { "HasSideEffects": true }, "CallbackReturn": { "HasSideEffects": true }, "GPR = Syscall GPR:$SyscallID, GPR:$Arg0, GPR:$Arg1, GPR:$Arg2, GPR:$Arg3, GPR:$Arg4, GPR:$Arg5": { "HasSideEffects": true, "Desc": ["Dispatches a guest syscall through to the SyscallHandler class" ], "DestSize": "OpSize::i64Bit" }, "Thunk GPR:$ArgPtr, SHA256Sum:$ThunkNameHash": { "HasSideEffects": true }, "GPR:$EAX, GPR:$EBX, GPR:$ECX, GPR:$EDX = CPUID GPR:$Function, GPR:$Leaf": { "Desc": ["Calls in to the CPUID handler function to return emulated CPUID"], "DestSize": "OpSize::i32Bit", "HasSideEffects": true }, "GPR:$EAX, GPR:$EDX = XGetBV GPR:$Function": { "Desc": ["Calls in to the XCR handler function to return emulated XCR"], "DestSize": "OpSize::i32Bit", "HasSideEffects": true } }, "Moves": { "GPR = Copy GPR:$Source": { "Desc": ["GPR copy, generated by RA to split live ranges"], "DestSize": "OpSize::i64Bit" } }, "StaticRA": { "SSA = LoadRegister u32:$Reg, RegisterClass:$Class, OpSize:#Size": { "Desc": ["Loads a value from the given register", "Size must match the execution mode."], "DestSize": "Size" }, "GPR = LoadPF OpSize:#Size": { "Desc": ["Loads raw PF"], "DestSize": "Size" }, "GPR = LoadAF OpSize:#Size": { "Desc": ["Loads raw AF"], "DestSize": "Size" }, "SSA = StoreRegister SSA:$Value,
OpSize:#Size": { "HasSideEffects": true, "Desc": ["Stores a value to a given register.", "Size must match the execution mode."], "DestSize": "Size" }, "StorePF GPR:$Value, OpSize:#Size": { "HasSideEffects": true, "Desc": ["Stores raw PF"], "DestSize": "Size" }, "StoreAF GPR:$Value, OpSize:#Size": { "HasSideEffects": true, "Desc": ["Stores raw AF"], "DestSize": "Size" } }, "Memory": { "SSA = LoadContext OpSize:#ByteSize, RegisterClass:$Class, u32:$Offset": { "Desc": ["Loads a value from the context with offset", "Dest = Ctx[Offset]" ], "DestSize": "ByteSize", "EmitValidation": [ "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || #ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't LoadContext to GPR\"", "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't LoadContext to XMM\"" ] }, "SSA:$Value1, SSA:$Value2 = LoadContextPair OpSize:#ByteSize, RegisterClass:$Class, u32:$Offset": { "Desc": ["Loads a pair of values from the context with offset", "Value0 = Ctx[Offset], Value1 = Ctx[Offset + ByteSize]" ], "HasSideEffects": true, "DestSize": "ByteSize", "EmitValidation": [ "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || 
#ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't LoadContext to GPR\"", "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't LoadContext to XMM\"" ] }, "StoreContext OpSize:#ByteSize, RegisterClass:$Class, SSA:$Value, u32:$Offset": { "Desc": ["Stores a value to the context with offset", "Ctx[Offset] = Value", "Zero Extends if value's type is too small", "Truncates if value's type is too large" ], "Inline": ["Zero", ""], "HasSideEffects": true, "DestSize": "ByteSize", "EmitValidation": [ "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || #ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't StoreContext to GPR\"", "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't StoreContext to XMM\"" ] }, "StoreContextPair OpSize:#ByteSize, RegisterClass:$Class, SSA:$Value1, SSA:$Value2, u32:$Offset": { "Desc": ["Stores a pair of values to the context with offset", "Ctx[Offset] = Value1, Ctx[Offset + ByteSize] = Value2", "Zero Extends if value's type is too small", "Truncates if value's type is too large" ], "HasSideEffects": true, "DestSize": "ByteSize", "EmitValidation": [ "WalkFindRegClass($Value1) == $Class", "WalkFindRegClass($Value2) == $Class", "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || 
#ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || #ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($Offset >= offsetof(Core::CPUState, gregs[0]) && $Offset < offsetof(Core::CPUState, gregs[16])) && \"Can't StoreContext to GPR\"", "!($Offset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $Offset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't StoreContext to XMM\"" ] }, "SSA = LoadContextIndexed GPR:$Index, OpSize:#ByteSize, u32:$BaseOffset, u32:$Stride, RegisterClass:$Class": { "Desc": ["Loads a value from the context with offset and indexed by SSA value", "Dest = Ctx[BaseOffset + Index * Stride]" ], "DestSize": "ByteSize", "EmitValidation": [ "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || #ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($BaseOffset >= offsetof(Core::CPUState, gregs[0]) && $BaseOffset < offsetof(Core::CPUState, gregs[16])) && \"Can't LoadContextIndexed to GPR\"", "!($BaseOffset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $BaseOffset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't LoadContextIndexed to XMM\"" ] }, "StoreContextIndexed SSA:$Value, GPR:$Index, OpSize:#ByteSize, u32:$BaseOffset, u32:$Stride, RegisterClass:$Class": { "HasSideEffects": true, "Desc": ["Stores a value to the context with offset and indexed by SSA value", "Ctx[BaseOffset + Index * 
Stride] = Value" ], "DestSize": "ByteSize", "EmitValidation": [ "WalkFindRegClass($Value) == $Class", "($Class == RegClass::GPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit)) || $Class == RegClass::FPR", "($Class == RegClass::FPR && (#ByteSize == IR::OpSize::i8Bit || #ByteSize == IR::OpSize::i16Bit || #ByteSize == IR::OpSize::i32Bit || #ByteSize == IR::OpSize::i64Bit || #ByteSize == IR::OpSize::i128Bit || #ByteSize == IR::OpSize::i256Bit)) || $Class == RegClass::GPR", "!($BaseOffset >= offsetof(Core::CPUState, gregs[0]) && $BaseOffset < offsetof(Core::CPUState, gregs[16])) && \"Can't StoreContextIndexed to GPR\"", "!($BaseOffset >= offsetof(Core::CPUState, xmm.avx.data[0]) && $BaseOffset < offsetof(Core::CPUState, xmm.avx.data[16])) && \"Can't StoreContextIndexed to XMM\"" ] }, "GPR = FormContextAddress OpSize:#Size, GPR:$Index, u32:$Stride": { "Desc": ["Forms an address into the context structure indexed by SSA value", "Dest = Ctx + Index * Stride", "This allows backends to compute the address once and reuse it for multiple memory operations", "Stride must be a power of 2" ], "DestSize": "Size", "EmitValidation": [ "#Size == IR::OpSize::i64Bit" ] }, "SpillRegister SSA:$Value, u32:$Slot, RegisterClass:$Class": { "HasSideEffects": true, "Desc": ["Spills an SSA value to memory", "Spill slots are register allocated and has live ranges calculated to handle slot calculation", "!Don't use this op. It is for RA to handle spilling and filling!" ], "EmitValidation": [ "WalkFindRegClass($Value) == $Class" ] }, "SSA = FillRegister OpSize:#Size, OpSize:#ElementSize, u32:$Slot, RegisterClass:$Class": { "Desc": ["Fills a register from a spill slot", "Spill slots are register allocated and has live ranges calculated to handle slot calculation", "!Don't use this op. It is for RA to handle spilling and filling!" 
], "DestSize": "Size", "ElementSize": "ElementSize" }, "GPR = LoadNZCV": { "Desc": ["Loads value of NZCV register"], "DestSize": "OpSize::i32Bit" }, "StoreNZCV GPR:$Value": { "HasSideEffects": true, "Desc": ["Stores value to NZCV register"], "DestSize": "OpSize::i32Bit" }, "GPR = LoadDF": { "Desc": ["Loads the direction flag from the context object in -1/1", "representation for easy consumption" ], "DestSize": "OpSize::i64Bit" }, "SSA = LoadMem RegisterClass:$Class, OpSize:#Size, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Inline": ["", "Mem"], "DestSize": "Size" }, "SSA:$Value1, SSA:$Value2 = LoadMemPair RegisterClass:$Class, OpSize:#Size, GPR:$Addr, u32:$Offset": { "Desc": ["Load a pair of values from memory."], "DestSize": "Size", "HasSideEffects": true }, "StoreMem RegisterClass:$Class, OpSize:#Size, SSA:$Value, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": [ "Stores a value to memory.", "Zero Extends if value's type is too small", "Truncates if value's type is too large" ], "Inline": ["Zero", "", "Mem"], "HasSideEffects": true, "DestSize": "Size" }, "StoreMemPair RegisterClass:$Class, OpSize:#Size, SSA:$Value1, SSA:$Value2, GPR:$Addr, u32:$Offset": { "Desc": [ "Stores a pair of values to memory.", "Zero Extends if value's type is too small", "Truncates if value's type is too large" ], "Inline": ["Zero", "Zero"], "HasSideEffects": true, "DestSize": "Size" }, "StoreMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, GPR:$Addr": { "Desc": [ "Stores a value to memory using SVE predicate mask that's designed", "specifically for use in the X87 SVE Ldst optimization."
], "DestSize": "RegisterSize", "HasSideEffects": true, "ElementSize": "ElementSize" }, "FPR = LoadMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Addr": { "Desc": [ "Loads a value from memory using SVE predicate mask that's designed", "specifically for use in the X87 SVE Ldst optimization." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "SSA = LoadMemTSO RegisterClass:$Class, OpSize:#Size, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a x86 TSO compatible load from memory. Offset must be Invalid()." ], "Inline": ["", "Memtso"], "DestSize": "Size" }, "StoreMemTSO RegisterClass:$Class, OpSize:#Size, SSA:$Value, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a x86 TSO compatible store to memory. Offset must be Invalid()." ], "Inline": ["Zero", "", "Memtso"], "HasSideEffects": true, "DestSize": "Size" }, "FPR = VLoadVectorMasked OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Mask, GPR:$Addr, GPR:$Offset, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a masked load similar to VPMASKMOV/VMASKMOV where the upper bit of each element", "determines whether or not that element will be loaded from memory"], "ImplicitFlagClobber": true, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "VStoreVectorMasked OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Mask, FPR:$Data, GPR:$Addr, GPR:$Offset, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a masked store similar to VPMASKMOV/VMASKMOV where the upper bit of each element", "determines whether or not that element will be stored to memory"], "HasSideEffects": true, "ImplicitFlagClobber": true, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VLoadVectorGatherMasked OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Incoming, FPR:$Mask, GPR:$AddrBase, FPR:$VectorIndexLow, FPR:$VectorIndexHigh, OpSize:$VectorIndexElementSize,
u8:$OffsetScale, u8:$DataElementOffsetStart, u8:$IndexElementOffsetStart, OpSize:$AddrSize": { "Desc": [ "Does a masked load similar to VPGATHERD* where the upper bit of each element", "determines whether or not that element will be loaded from memory.", "Most of VSIB encoding is passed directly through to the IR operation." ], "TiedSource": 0, "ImplicitFlagClobber": true, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "$VectorIndexElementSize == OpSize::i32Bit || $VectorIndexElementSize == OpSize::i64Bit" ] }, "FPR = VLoadVectorGatherMaskedQPS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Incoming, FPR:$MaskReg, GPR:$AddrBase, FPR:$VectorIndexLow, FPR:$VectorIndexHigh, u8:$OffsetScale, OpSize:$AddrSize": { "Desc": [ "Does a masked load similar to VPGATHERQPS where the upper bit of each element", "determines whether or not that element will be loaded from memory.", "Most of VSIB encoding is passed directly through to the IR operation.", "Only supports the case of 32-bit data element sizes from 64-bit addresses" ], "TiedSource": 0, "ImplicitFlagClobber": true, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "ElementSize == OpSize::i32Bit", "RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\"" ] }, "FPR = VLoadVectorElement OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$DstSrc, u8:$Index, GPR:$Addr": { "Desc": ["Does a memory load to a single element of a vector.", "Leaves the rest of the vector's data intact.", "Matches arm64 ld1 semantics"], "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "VStoreVectorElement OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, u8:$Index, GPR:$Addr": { "Desc": ["Does a memory store of a single element of a vector.", "Matches arm64 st1 semantics"], "HasSideEffects": true, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VBroadcastFromMem OpSize:#RegisterSize, OpSize:#ElementSize, 
GPR:$Address": { "Desc": ["Broadcasts an ElementSize value from memory into each element of a vector."], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "GPR = Push OpSize:#Size, OpSize:$ValueSize, GPR:$Value, GPR:$Addr": { "Desc": [ "Pushes a value to the address, returning the new pointer after decrementing.", "The address is decremented by the value size.", "The return value size is the size of the current operating mode" ], "TiedSource": 1, "HasSideEffects": true, "DestSize": "Size" }, "PushTwo OpSize:#Size, OpSize:$ValueSize, GPR:$Value1, GPR:$Value2, GPR:$Addr": { "Desc": [ "Push two values to the address, updating the pointer in place.", "Fused post-RA so doesn't have a destination." ], "HasSideEffects": true }, "GPR = RMWHandle GPR:$Value": { "Desc": [ "This is a special move that indicates the result will be poisoned by a non-SSA instruction writing to its result.", "In effect, it serves to prevent invalid optimizations with non-SSA instructions." ], "DestSize": "OpSize::i64Bit", "HasSideEffects": true, "TiedSource": 0 }, "GPR:$Addr, GPR:$Value = Pop OpSize:$Size, GPR:$Addr": { "Desc": [ "Pops a value from the address, updating the new pointer after incrementing.", "The address is incremented by the size via an RMW source/destination." ], "HasSideEffects": true, "DestSize": "Size" }, "GPR:$Addr, GPR:$Value1, GPR:$Value2 = PopTwo OpSize:$Size, GPR:$Addr": { "Desc": ["Pop two values from the address. Fused post-RA."], "HasSideEffects": true, "DestSize": "Size" }, "GPR = MemSet i1:$IsAtomic, OpSize:$Size, GPR:$Prefix, GPR:$Addr, GPR:$Value, GPR:$Length, GPR:$Direction": { "Desc": ["Duplicates behaviour of x86 STOS repeat", "Returns the final address that gets generated without the prefix appended."
], "Inline": ["", "", "Zero", "", "Any"], "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "GPR:$DstAddress, GPR:$SrcAddress = MemCpy i1:$IsAtomic, OpSize:$Size, GPR:$Dest, GPR:$Src, GPR:$Length, GPR:$Direction": { "Desc": ["Duplicates behaviour of x86 MOVS repeat", "Returns the final addresses after they have been incremented or decremented" ], "Inline": ["", "", "", "Any"], "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "CacheLineClear GPR:$Addr, i1:$Serialize": { "Desc": ["Does a 64 byte cacheline clear at the address specified", "Only clears the data cachelines. Doesn't do any zeroing", "Can skip serialization if requested." ], "HasSideEffects": true }, "CacheLineClean GPR:$Addr": { "Desc": ["Does a 64 byte cacheline clean at the address specified", "Only cleans the data cachelines. Doesn't do any zeroing", "Skips the invalidation step of the CacheLineClear operation" ], "HasSideEffects": true }, "CacheLineZero GPR:$Addr": { "Desc": ["Does a 64 byte zero at the address specified", "Writing zeroes to memory", "It is specifically non-temporal and weakly ordered", "This matches CLZero behaviour" ], "HasSideEffects": true }, "Fence FenceType:$Fence": { "Desc": ["Does a memory fence operation of the desired type", "FenceType::Load: Ensures load memory operations are serialized", "FenceType::Store: Ensures store memory operations are serialized", "FenceType::LoadStore: Ensures loads and store memory operations are serialized", "FenceType::Inst: Instruction barrier.
Ensures all instructions after this point will be explicitly fetched", "Ensures the memory operations are globally visible" ], "HasSideEffects": true }, "Prefetch i1:$ForStore, i1:$Stream, i8:$CacheLevel, GPR:$Addr, GPR:$Offset, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": ["Does a cacheline prefetch operation" ], "Inline": ["", "Mem"], "EmitValidation": ["CacheLevel > 0 && CacheLevel < 4"], "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "VStoreNonTemporal OpSize:#RegisterSize, FPR:$Value, GPR:$Addr, i8:$Offset": { "Desc": ["Does a non-temporal memory store of a vector.", "Matches arm64 SVE stnt1b semantics.", "Specifically weak-memory model ordered to match x86 non-temporal stores." ], "HasSideEffects": true, "DestSize": "RegisterSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit", "Offset % IR::OpSizeToSize(RegisterSize) == 0" ] }, "VStoreNonTemporalPair OpSize:#RegisterSize, FPR:$ValueLow, FPR:$ValueHigh, GPR:$Addr, i8:$Offset": { "Desc": ["Does a non-temporal memory store of two vector registers.", "Matches arm64 stnp semantics.", "Specifically weak-memory model ordered to match x86 non-temporal stores." ], "HasSideEffects": true, "DestSize": "RegisterSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i128Bit", "Offset % IR::OpSizeToSize(RegisterSize) == 0" ] }, "FPR = VLoadNonTemporal OpSize:#RegisterSize, GPR:$Addr, i8:$Offset": { "Desc": ["Does a non-temporal memory load of a vector.", "Matches arm64 SVE ldnt1b semantics.", "Specifically weak-memory model ordered to match x86 non-temporal stores." 
], "HasSideEffects": true, "DestSize": "RegisterSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i256Bit", "Offset % IR::OpSizeToSize(RegisterSize) == 0" ] }, "ContextClear u32:$Offset, u32:$Size": { "Desc": [ "Clears a region of the context by CLZero size", "Both the offset and size alignment need to be by CLZero size" ], "HasSideEffects": true, "EmitValidation": [ "Offset % 64 == 0", "Size % 64 == 0" ] } }, "Atomic": { "GPR = CAS OpSize:#Size, GPR:$Expected, GPR:$Desired, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Does a compare and swap of values to a memory location", "This mostly matches the C++ atomic_compare_exchange_strong function", "Dest = atomic_compare_exchange_strong(%Addr, %Expected, %Desired)", "Depending on if the value in %Addr is Expected the results destination will be different", "Behaves like the following but atomically", "Dest = %Expected", "if (deref(%Addr) != %Expected) Dest = deref(%Addr)" ], "TiedSource": 0, "DestSize": "Size", "ImplicitFlagClobber": true, "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR:$Lo, GPR:$Hi = CASPair OpSize:#Size, GPR:$ExpectedLo, GPR:$ExpectedHi, GPR:$DesiredLo, GPR:$DesiredHi, GPR:$Addr": { "Desc": ["Does a compare and exchange with two pairs of values", "ssa0 is the comparison value", "ssa1 is the new value", "ssa2 is the memory location", "Returns the lower & upper halves of the value in memory." 
], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicSwap OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer swap" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchAdd OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and add", "Atomically fetches %Addr and adds %value to the memory location", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchSub OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and sub", "Atomically fetches %Addr and subtracts %value to the memory location", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchAnd OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and binary and", "Atomically fetches %Addr and binary ands %value to the memory location", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" 
], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchCLR OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and binary clear", "Atomically fetches %Addr and binary clears %value to the memory location", "Dest is the value prior to operating on the value in memory", "Matches ARM ldclral semantics", "eg: Dest[Addr] &= ~Value", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchOr OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and binary or", "Atomically fetches %Addr and binary ors %value to the memory location", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchXor OpSize:#Size, GPR:$Value, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and binary exclusive or", "Atomically fetches %Addr and binary exclusive ors %value to the memory location", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" 
], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AtomicFetchNeg OpSize:#Size, GPR:$Addr": { "HasSideEffects": true, "Desc": ["Atomic integer fetch and two's complement negate", "Dest is the value prior to operating on the value in memory", "IR layout must match NonFetch-variant, otherwise DCE IR optimization breaks!" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "TelemetrySetValue GPR:$Value, u8:$TelemetryValueIndex": { "HasSideEffects": true, "Desc": ["Set Telemetry value if the passed in 32-bit value isn't zero.", "Only useful for 32-bit applications." ], "ImplicitFlagClobber": true, "DestSize": "OpSize::i64Bit" } }, "ALU": { "GPR = EntrypointOffset OpSize:#Size, i64:$Offset": { "Desc": ["Returns the entrypoint + Offset address", "When the size is 4 bytes then 32-bit overflow and underflow needs to work" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "InlineEntrypointOffset OpSize:#Size, i64:$Offset": { "Desc": ["Returns the entrypoint + Offset address", "When the size is 4 bytes then 32-bit overflow and underflow needs to work" ], "HasSideEffects": true, "RAOverride": "0", "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Constant i64:$Constant, ConstPad:$Pad{IR::ConstPad::NoPad}, i32:$MaxBytes{0}": { "Desc": ["Generates a 64bit constant inside of a GPR", "Unsupported to create a constant in FPR" ], "DestSize": "OpSize::i64Bit", "EmitValidation": [ "MaxBytes >= 0 && MaxBytes <= 8 && (MaxBytes & 1) == 0", "MaxBytes == 0 || (Constant >> (MaxBytes * 8)) == 0" ] }, "InlineConstant i64:$Constant": { "Desc": ["Generates a
64bit constant to be used directly, non-FPR"], "HasSideEffects": true, "RAOverride": "0", "DestSize": "OpSize::i64Bit" }, "GPR = CycleCounter i1:$SelfSynchronizingLoads": { "Desc": ["Returns the host 64bit cycle counter", "Useful when emulating rdtsc", "Be careful, the frequency of this counter changes based on host", "On AArch64 make sure to query the CNTFRQ_EL0 system register to get the frequency", "On x86-64 make sure to query CPUID fn8000_0008[EDX_8] for constant TSC", "x86-64 constant frequency lives in MSR_PLATFORM_INFO. Which is only available to kernel", "Part of the ART frequency equation can be pulled from CPUID fn0000_0015[EBX & EAX]", "But it's missing the ART multiplier still?", "If the self-synchronizing flag is toggled then all instructions and loads must be completed before the cycle counter read" ], "DestSize": "OpSize::i64Bit" }, "GPR = Neg OpSize:#Size, GPR:$Src, CondClass:$Cond{CondClass::AL}": { "Desc": ["Integer negation, with optional predication", "Dest = Cond ? -Src : Src", "Will truncate to 64 or 32bits" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Not OpSize:#Size, GPR:$Src": { "Desc": ["Integer binary not", "op:", "Dest = ~Src" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Popcount OpSize:#Size, GPR:$Src": { "Desc": ["Population count of source register", "Returns the number of bits set" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = FindLSB OpSize:#Size, GPR:$Src": { "Desc": ["Find least-significant-bit set", "Returns the index of the least significant bit set", "Undefined result if Src is zero." 
], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = FindMSB OpSize:#Size, GPR:$Src": { "Desc": ["Find most-significant-bit set", "Returns the index of the most significant bit set", "Undefined result if Src is zero." ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = FindTrailingZeroes OpSize:#Size, GPR:$Src": { "Desc": ["Counts the number of trailing zero bits in a GPR", "Returns the number of bits that are zero trailing", "In the case of zero returns the size in bits of the input" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = CountLeadingZeroes OpSize:#Size, GPR:$Src": { "Desc": ["Counts the number of leading zero bits in a GPR", "Returns the number of bits that are zero leading", "In the case of zero returns the size in bits of the input" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Rev OpSize:#Size, GPR:$Src": { "Desc": ["Reverses the byte order of the register", "Specifically 8bit byte swap size. 
(Not 16bit or 32bit word swapping)" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Rbit OpSize:#Size, GPR:$Src": { "Desc": ["Reverses the bit order of the register"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Add OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer Add", "Will truncate to 64 or 32bits" ], "Inline": ["", "LargeAddSub"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Adc OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer Add with carry", "Will truncate to 64 or 32bits" ], "Inline": ["Zero", ""], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Sbb OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer Subtract with carry/borrow", "Will truncate to 64 or 32bits" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": { "Desc": [ "Integer Add with shifted register", "Will truncate to 64 or 32bits", "Dest = Src1 + (Src2 << ShiftAmount)" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "Shift != ShiftType::ROR" ] }, "GPR = AddWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer add. 
Truncates and sets NZCV per AddNZCV"], "Inline": ["", "LargeAddSub"], "DestSize": "Size", "HasSideEffects": true, "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "AddNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the sum of two GPRs"], "Inline": ["", "LargeAddSub"], "HasSideEffects": true, "DestSize": "Size" }, "SetSmallNZV OpSize:#Size, GPR:$Src": { "Desc": ["Set NZV with a SETF instruction. Preserves CF."], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i8Bit || Size == FEXCore::IR::OpSize::i16Bit" ] }, "CarryInvert": { "Desc": ["Invert carry flag in NZCV"], "HasSideEffects": true }, "AXFlag GPR:$V_inv": { "Desc": ["After an FCmp, converts NZCV flags from the Arm format to a mysterious eXternal format", "On FlagM2-less platforms, takes the inverted 1/0 overflow flag"], "HasSideEffects": true }, "GPR = Parity GPR:$Raw, i1:$Mask, i1:$Invert": { "Desc": ["Calculates PF"], "DestSize": "OpSize::i32Bit" }, "RmifNZCV GPR:$Src, u8:$Rotate, u8:$Mask": { "Desc": ["Rotate, mask, and insert into NZCV on FlagM platforms"], "Inline": ["Zero", ""], "HasSideEffects": true }, "CondAddNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2, CondClass:$Cond, u8:$FalseNZCV": { "Desc": ["If condition is true, set NZCV per sum of GPRs, else force NZCV to a constant."], "Inline": ["Zero", "AddSub"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "CondSubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2, CondClass:$Cond, u8:$FalseNZCV": { "Desc": ["If condition is true, set NZCV per difference of GPRs, else force NZCV to a constant."], "Inline": ["Zero", "AddSub"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AdcWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Adds and set NZCV 
for the sum of two GPRs and carry-in given as NZCV"], "Inline": ["Zero", ""], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AdcZero OpSize:#Size, GPR:$Src1": { "Desc": ["Adds GPR with inverted carry-in"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AdcZeroWithFlags OpSize:#Size, GPR:$Src1": { "Desc": ["Adds and set NZCV for the sum of GPR and inverted carry-in given as NZCV"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = SbbWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Subtracts and set NZCV for the difference of two GPRs and carry-in given as NZCV"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "AdcNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the sum of two GPRs and carry-in given as NZCV"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "SbbNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the difference of two GPRs and carry-in given as NZCV"], "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Sub OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer Sub", "Will truncate to 64 or 32bits" ], "Inline": ["SubtractZero", "LargeAddSub"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = SubShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": { "Desc": [ "Integer Sub with shifted register", "Will 
truncate to 64 or 32bits" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "Shift != ShiftType::ROR" ] }, "GPR = SubWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": [ "Integer Sub. Truncates and sets NZCV per SubNZCV"], "Inline": ["SubtractZero", "LargeAddSub"], "DestSize": "Size", "HasSideEffects": true, "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "CmpPairZ OpSize:#Size, GPR:$Src1Lo, GPR:$Src1Hi, GPR:$Src2Lo, GPR:$Src2Hi": { "Desc": ["Compares register pairs and sets Z accordingly, preserving N/C/V.", "This accelerates cmpxchg."], "HasSideEffects": true, "DestSize": "Size" }, "SubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the difference of two GPRs. ", "Carry flag uses arm64 definition, inverted x86.", ""], "Inline": ["Zero", "LargeAddSub"], "DestSize": "Size", "HasSideEffects": true }, "GPR = Or OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer binary or" ], "DestSize": "Size", "Inline": ["", "Logical"], "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Orlshl OpSize:#Size, GPR:$Src1, GPR:$Src2, u8:$BitShift": { "Desc": ["Integer binary or with logical shift left" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Orlshr OpSize:#Size, GPR:$Src1, GPR:$Src2, u8:$BitShift": { "Desc": ["Integer binary or with logical shift right" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Ornror OpSize:#Size, GPR:$Src1, GPR:$Src2, u8:$BitShift": { "Desc": ["Integer binary or with NOT on second source and rotation right" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Xor OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Integer binary exclusive or"], "Inline": ["", "Logical"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = XorShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": { "Desc": [ "Integer binary exclusive or with shifted register"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = XornShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": { "Desc": [ "Integer binary exclusive or not with shifted register"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = And OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer binary and"], "Inline": ["", "Logical"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AndShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": { "Desc": [ "Integer binary and with shifted register"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = AndWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer binary and" ], "Inline": ["", "Logical"], "DestSize": "Size", "TiedSource": 0, "HasSideEffects": true }, "GPR = Andn OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer binary AND NOT. 
Performs the equivalent of Src1 & ~Src2"], "DestSize": "Size", "Inline": ["", "Logical"], "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "TestNZ OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the binary AND of two GPRs, setting N and Z accordingly and zeroing C and V"], "Inline": ["", "Logical"], "DestSize": "Size", "HasSideEffects": true }, "TestZ OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the binary AND of two GPRs, setting Z accordingly and zeroing C and V. N is undefined."], "DestSize": "Size", "HasSideEffects": true }, "GPR = Lshl OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer logical shift left"], "Inline": ["", "Any"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Lshr OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer logical shift right"], "Inline": ["", "Any"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Ashr OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer arithmetic shift right"], "Inline": ["", "Any"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = ShiftFlags OpSize:$Size, GPR:$Result, GPR:$Src1, ShiftType:$Shift, GPR:$Src2, GPR:$PFInput, i1:$InvertCF": { "Desc": ["Set NZCV flags for specified variable integer shift with given result.", "Returns updated raw PF."], "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "RotateFlags OpSize:$Size, GPR:$Result, GPR:$Shift, i1:$Left": { "Desc": ["Set NZCV flags for specified variable integer rotate with given result."], "HasSideEffects": true }, "GPR = Ror OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer rotate right"], "Inline": ["", "Any"], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == 
FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Mul OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer signed multiplication" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = UMul OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer unsigned multiplication" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = UMull GPR:$Src1, GPR:$Src2": { "Desc": ["Integer unsigned multiplication long", "Multiplies two 32-bit numbers, returning a 64-bit destination register." ], "DestSize": "FEXCore::IR::OpSize::i64Bit" }, "GPR = SMull GPR:$Src1, GPR:$Src2": { "Desc": ["Integer signed multiplication long", "Multiplies two 32-bit numbers, returning a 64-bit destination register." ], "DestSize": "FEXCore::IR::OpSize::i64Bit" }, "GPR = MulH OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer signed multiply returning high results", "op:", "Tmp = Src1 * Src2;", "Dest = Tmp >> (size * 8);" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = UMulH OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer unsigned multiply returning high results", "op:", "Tmp = Src1 * Src2;", "Dest = Tmp >> (size * 8);" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Bfi OpSize:#Size, u8:$Width, u8:$lsb, GPR:$Dest, GPR:$Src": { "Desc": ["Copies a bitfield from one GPR to another", "The source bitfield is from Src[Width:0]", "The bitfield is copied in to Dest[(Width + lsb):lsb]" ], "DestSize": "Size", "TiedSource": 0, "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "(Width + lsb) <= IR::OpSizeAsBits(Size)" ] }, "GPR = Bfxil OpSize:#Size, u8:$Width, u8:$lsb, GPR:$Dest, GPR:$Src": { "Desc": ["Copies a bitfield from one GPR to 
another", "Inserting in to the low bits of the destination", "The source bitfield is from Src[(Width + lsb):lsb]", "The bitfield is copied in to Dest[Width:0]" ], "DestSize": "Size", "TiedSource": 0, "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "(Width + lsb) <= IR::OpSizeAsBits(Size)" ] }, "GPR = Bfe OpSize:#Size, u8:$Width, u8:$lsb, GPR:$Src": { "Desc": ["Extracts a bitfield from one GPR with zext", "The source bitfield is from Src[Width:0]", "The bitfield is then zero extended" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "(Width + lsb) <= IR::OpSizeAsBits(Size)" ] }, "GPR = Sbfe OpSize:#Size, u8:$Width, u8:$lsb, GPR:$Src": { "Desc": ["Extracts a bitfield from one GPR with sext", "The source bitfield is from Src[Width:0]", "The bitfield is then sign extended" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit", "(Width + lsb) <= IR::OpSizeAsBits(Size)" ] }, "GPR = NZCVSelect OpSize:#ResultSize, CondClass:$Cond, GPR:$TrueVal, GPR:$FalseVal": { "Desc": ["Select based on value in NZCV flags", "op:", "Dest = Cond ? TrueVal : FalseVal" ], "DestSize": "ResultSize", "EmitValidation": [ "ResultSize == FEXCore::IR::OpSize::i32Bit || ResultSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = NZCVSelectV OpSize:#ResultSize, CondClass:$Cond, FPR:$TrueVal, FPR:$FalseVal": { "Desc": [ "Select based on value in NZCV flags, where TrueVal and FalseVal are both FPRs.", "op:", "Dest = Cond ? TrueVal : FalseVal" ], "DestSize": "ResultSize" }, "GPR = NZCVSelectIncrement OpSize:#ResultSize, CondClass:$Cond, GPR:$TrueVal, GPR:$FalseVal": { "Desc": ["Select and increment based on value in NZCV flags", "op:", "Dest = Cond ? 
TrueVal : (FalseVal + 1)" ], "DestSize": "ResultSize", "EmitValidation": [ "ResultSize == FEXCore::IR::OpSize::i32Bit || ResultSize == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = Select OpSize:#ResultSize, OpSize:$CompareSize, CondClass:$Cond, SSA:$Cmp1, SSA:$Cmp2, GPR:$TrueVal, GPR:$FalseVal": { "Desc": ["Ternary selection of GPRs", "op:", "Dest = (Cmp1 Cond Cmp2) ? TrueVal : FalseVal" ], "Inline": ["", "AddSub", "", ""], "DestSize": "ResultSize", "ImplicitFlagClobber": true, "EmitValidation": [ "CompareSize == FEXCore::IR::OpSize::i32Bit || CompareSize == FEXCore::IR::OpSize::i64Bit || CompareSize == FEXCore::IR::OpSize::i128Bit", "ResultSize == FEXCore::IR::OpSize::i32Bit || ResultSize == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = MaskGenerateFromBitWidth GPR:$BitWidth": { "Desc": ["Generates a bit mask from a bit width value in the range [0, 63]", "0 is special cased to full-mask", "Special operation for SSE4a bitmask generation." ], "DestSize": "FEXCore::IR::OpSize::i64Bit", "ImplicitFlagClobber": true }, "GPR = Extr OpSize:#Size, GPR:$Upper, GPR:$Lower, u8:$LSB": { "Desc": ["Concats the two GPRs to create a value that is the size of the full two GPRs", "It then extracts a bitfield the width of a GPR starting at the LSB", "Valid LSB range is 0-31 for 32bit and 0-63 for 64bit", " ConcatValue = $Upper:$Lower", "Result = ConcatValue[(LSB + GPR size in bits - 1):LSB]" ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = PDep OpSize:#Size, GPR:$Input, GPR:$Mask": { "Desc": ["Performs a parallel bit deposit.", "Takes the contiguous low-order bits and deposits them into", "the destination at the locations specified by the Mask."
], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR = PExt OpSize:#Size, GPR:$Input, GPR:$Mask": { "Desc": ["Performs a parallel bit extract.", "Each bit set in the mask will select the corresponding bit in the Input", "and transfers them to the lower contiguous bits in the destination." ], "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, "GPR:$Quotient, GPR:$Remainder = Div OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long signed division returning lower bits", "The Lower and Upper registers will be concated together to generate a dividend twice the size", "Then the divisor divides the temporary dividend and returns the results in the original sized register", "If Upper is invalid, this is a non-long division." ], "DestSize": "Size", "HasSideEffects": true }, "GPR:$Quotient, GPR:$Remainder = UDiv OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned division returning lower bits", "The Lower and Upper registers will be concated together to generate a dividend twice the size", "Then the divisor divides the temporary dividend and returns the results in the original sized register", "If Upper is invalid, this is a non-long division." 
], "DestSize": "Size", "HasSideEffects": true }, "Float to GPR": {"Ignore": 1}, "GPR = VExtractToGPR OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$Index": { "Desc": ["Extracts an element from a vector and places it in a GPR", "The element that is extracted from the vector is zero extended to the GPR size" ], "DestSize": "ElementSize" }, "GPR = Float_ToGPR_S OpSize:#DestElementSize, OpSize:$SrcElementSize, FPR:$Scalar": { "Desc": ["Moves the scalar element to a GPR with conversion", "Converts the 32bit or 64bit float to a signed integer", "Rounding mode determined by host flag's rounding mode" ], "DestSize": "DestElementSize" }, "GPR = Float_ToGPR_ZS OpSize:#DestElementSize, OpSize:$SrcElementSize, FPR:$Scalar": { "Desc": ["Moves the scalar element to a GPR with conversion", "Converts the 32bit or 64bit float to a signed integer rounding towards zero (Truncating)" ], "DestSize": "DestElementSize" }, "FCmp OpSize:$ElementSize, FPR:$Scalar1, FPR:$Scalar2": { "Desc": ["Does a scalar unordered compare and sets NZCV accordingly.", "NZCV follows Arm conventions, a separate AXFLAG instruction is required for x86", "Ordering flag result is true if either float input is NaN" ], "HasSideEffects": true } }, "VectorScalar": { "FPR = VFAddScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'add' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics."
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFSubScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'sub' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMulScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'mul' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFDivScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'div' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMinScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'min' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics.", "Additionally matches x86 zero and NaN semantics", "If both source operands are zero, return the second operand (in the case of negative and positive zero)", "If either source operand is NaN then return the second operand." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "ImplicitFlagClobber": true }, "FPR = VFMaxScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'max' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics.", "Additionally matches x86 zero and NaN semantics", "If both source operands are zero, return the second operand (in the case of negative and positive zero)", "If either source operand is NaN then return the second operand." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "ImplicitFlagClobber": true }, "FPR = VFSqrtScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'sqrt' on Vector2, inserting in to Vector1 and storing in to the destination.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRSqrtScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'rsqrt' on Vector2, inserting in to Vector1 and storing in to the destination.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRecpScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'recip' on Vector2, inserting in to Vector1 and storing in to the destination.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFToFScalarInsert OpSize:#RegisterSize, OpSize:#DstElementSize, OpSize:$SrcElementSize, FPR:$Vector1, FPR:$Vector2, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'cvt' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "DstElementSize" }, "FPR = VSToFVectorInsert OpSize:#RegisterSize, OpSize:#DstElementSize, OpSize:$SrcElementSize, FPR:$Vector1, FPR:$Vector2, i8:$HasTwoElements, i1:$ZeroUpperBits": { "Desc": ["Does a Vector 'scvt' between Vector1 and Vector2.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics.", "HasTwoElements is slightly different than most of these scalar operations.", "Handles the edge case of cvtpi2ps xmm0, mm0 which is two elements in the lower 64-bits" ], "DestSize": "RegisterSize", "ElementSize": "DstElementSize" }, "FPR = VSToFGPRInsert OpSize:#RegisterSize, OpSize:#DstElementSize, OpSize:$SrcElementSize, FPR:$Vector, GPR:$Src, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'cvt' between Vector1 and GPR.", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert 
semantics." ], "DestSize": "RegisterSize", "ElementSize": "DstElementSize" }, "FPR = VFToIScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, RoundType:$Round, i1:$ZeroUpperBits": { "Desc": ["Does a scalar round float to integral on Vector2, inserting in to Vector1 and storing in to the destination.", "Rounding mode determined by argument", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, FloatCompareOp:$Op, i1:$ZeroUpperBits": { "Desc": ["Does a scalar 'cmp' between Vector1 and Vector2, inserting in to Vector1 and storing in to the destination.", "Compare op determined by argument", "Inserting the result in to the lower element of Vector1 and returning the results.", "If ZeroUpperBits is set then in a 256-bit wide operation it will zero the upper 128-bits of the destination.", "For 128-bit operation this matches SSE insert semantics.", "For 256-bit operation with ZeroUpperBits, this matches AVX insert semantics."
], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMLAScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Upper, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (Vector1 * Vector2) + Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending.", "Upper elements copied from Upper" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VFMLSScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Upper, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (Vector1 * Vector2) - Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending.", "Upper elements copied from Upper" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VFNMLAScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Upper, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (-Vector1 * Vector2) + Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending.", "Upper elements copied from Upper" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VFNMLSScalarInsert OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Upper, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (-Vector1 * Vector2) - Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending.", "Upper elements copied from Upper" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VFCopySign OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Returns a vector where each element has the magnitude of the corresponding element in Vector1 and the sign of the corresponding element in Vector2."], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 } }, "Vector": { "FPR = VMov OpSize:#RegisterSize, FPR:$Source": { "Desc" : ["Copy vector register", "When Register size is smaller than
Source register size,", "this op is defined to truncate and zero extend" ], "DestSize": "RegisterSize" }, "FPR = VectorImm OpSize:#RegisterSize, OpSize:#ElementSize, u8:$Immediate, u8:$ShiftAmount{0}": { "Desc": ["Generates a vector with each element containing the immediate zero extended" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = LoadNamedVectorConstant OpSize:#RegisterSize, NamedVectorConstant:$Constant": { "Desc": ["Load a named vector constant.", "The list of vector constants can be found in " ], "DestSize": "RegisterSize" }, "FPR = LoadNamedVectorIndexedConstant OpSize:#RegisterSize, IndexNamedVectorConstant:$Constant, u32:$Index": { "Desc": ["Load a named vector constant from Indexable table.", "Index needs to be aligned register size.", "The list of indexable vector constants can be found in " ], "DestSize": "RegisterSize" }, "FPR = VNeg OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VNot OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VAbs OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Does a signed integer absolute" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VPopcount OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Does a popcount for each element of the register" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VAddV OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Does a horizontal vector add of elements across the source vector", "Result is a zero extended scalar" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUMinV OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Does a horizontal vector unsigned minimum of elements across the source vector", "Result is a zero extended scalar" ], "DestSize": "RegisterSize", "ElementSize":
"ElementSize" }, "FPR = VUMaxV OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Does a horizontal vector unsigned maximum of elements across the source vector", "Result is a zero extended scalar" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFAbs OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFNeg OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRecp OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": [ "Reciprocal value - matches the precision required by the x86 spec.", "It has a relative error of at most 1.5 * 2^-12" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRecpPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": [ "Similar to VFRecp but carrying more precision for 3DNow!", "It provides at least 14 bits precision, with a relative error of at most 2^-14" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit", "ElementSize == FEXCore::IR::OpSize::i32Bit" ] }, "FPR = VFSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRSqrt OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": [ "Reciprocal Square Root - matches the precision required by the x86 spec.", "It has a relative error of at most 1.5 * 2^-12" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFRSqrtPrecision OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": [ "Similar to VFRSqrt but carrying more precision for 3DNow!", "It provides at least 15 bits precision, with a relative error of at most 2^-15" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ 
"RegisterSize == FEXCore::IR::OpSize::i64Bit || RegisterSize == FEXCore::IR::OpSize::i32Bit", "ElementSize == FEXCore::IR::OpSize::i32Bit" ] }, "FPR = VCMPEQZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VCMPGTZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Vector compare signed greater than", "Each element is compared, if the result is true then the resulting element is ~0, else zero", "Compares the vector against zero" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VCMPLTZ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Vector compare signed less than", "Each element is compared, if the result is true then the resulting element is ~0, else zero", "Compares the vector against zero" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VDupElement OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$Index": { "Desc": ["Duplicates one element from the source register across the whole register"], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VShlI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i8Bit && ElementSize <= FEXCore::IR::OpSize::i64Bit", "BitShift > 0" ] }, "FPR = VUShrI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i8Bit && ElementSize <= FEXCore::IR::OpSize::i64Bit", "BitShift > 0" ] }, "FPR = VUShraI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$DestVector, FPR:$Vector, u8:$BitShift": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i8Bit && ElementSize <= 
FEXCore::IR::OpSize::i64Bit", "BitShift > 0 && BitShift <= IR::OpSizeAsBits(ElementSize)" ] }, "FPR = VSShrI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i8Bit && ElementSize <= FEXCore::IR::OpSize::i64Bit", "BitShift > 0" ] }, "FPR = VUShrNI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "TiedSource": 0, "Desc": "Unsigned shifts right each element and then narrows to the next lower element size", "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i16Bit && ElementSize <= FEXCore::IR::OpSize::i64Bit", "BitShift <= IR::OpSizeAsBits(ElementSize)" ] }, "FPR = VUShrNI2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper, u8:$BitShift": { "TiedSource": 0, "Desc": ["Unsigned shifts right each element and then narrows to the next lower element size", "Inserts results in to the high elements of the first argument" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1", "EmitValidation": [ "ElementSize >= FEXCore::IR::OpSize::i16Bit && ElementSize <= FEXCore::IR::OpSize::i64Bit", "BitShift > 0 && BitShift <= IR::OpSizeAsBits(ElementSize)" ] }, "FPR = VSXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Sign extends elements from the source element size to the next size up", "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Sign extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSSHLL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": { "Desc": "Sign extends elements from the source element size to 
the next size up", "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSSHLL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift{0}": { "Desc": ["Sign extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUXTL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Zero extends elements from the source element size to the next size up", "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUXTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Zero extends elements from the source element size to the next size up", "Source elements come from the upper half of the register" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSQXTN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSQXTN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSQXTNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "Desc": ["Does both VSQXTN and VSQXTN2 in a combined operation." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSQXTUN OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSQXTUN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSQXTUNPair OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "Desc": ["Does both VSQXTUN and VSQXTUN2 in a combined operation." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1" }, "FPR = VSRSHR OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "Desc": ["Signed rounding shift right by immediate", "Exactly matching Arm64 srshr semantics" ], "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSQSHL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, u8:$BitShift": { "Desc": ["Signed saturating shift left by immediate", "Exactly matching Arm64 sqshl semantics" ], "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VRev32 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc" : ["Reverses elements in 32-bit halfwords", "Available element size: 1byte, 2 byte" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VRev64 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc" : ["Reverses elements in 64-bit halfwords", "Available element size: 1byte, 2 byte, 4 byte" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VAdd OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSub OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VAnd OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i256Bit || RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = VAndn OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i256Bit || RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = VOrn
OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i256Bit || RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = VOr OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i256Bit || RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = VXor OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "EmitValidation": [ "RegisterSize == FEXCore::IR::OpSize::i256Bit || RegisterSize == FEXCore::IR::OpSize::i128Bit || RegisterSize == FEXCore::IR::OpSize::i64Bit" ] }, "FPR = VUQAdd OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUQSub OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSQAdd OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSQSub OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VAddP OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "Desc": "Does a horizontal pairwise add of elements across the two source vectors", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VURAvg OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Does an unsigned rounded average", "dst_elem = (src1_elem + src2_elem + 1) >> 1"], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = 
VUMin OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUMax OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSMin OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSMax OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VZip OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VZip2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUnZip OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUnZip2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VTrn OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VTrn2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFAdd OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFAddP OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "Desc": "Does a horizontal pairwise add of elements across the two source vectors with float element types", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFAddV OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { 
"Desc": ["Does a horizontal float vector add of elements across the source vector", "Result is a zero extended scalar" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFSub OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMul OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFDiv OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMin OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VFMax OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 0 }, "FPR = VMul OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSMull OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": [ "Does a signed integer multiply with extend.", "ElementSize is the source size" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Multiplies the high elements with size extension", "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VSMull2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Multiplies the high elements with size extension", "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUMulH OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, 
FPR:$Vector2": { "Desc": "Wide unsigned multiply returning the high results", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSMulH OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": "Wide signed multiply returning the high results", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUABDL OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Unsigned Absolute Difference Long" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUABDL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Unsigned Absolute Difference Long", "Using the high elements of the source vectors" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1" }, "FPR = VUShl OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUShr OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSShr OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftVector, i1:$RangeCheck": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUShlS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUShrS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VSShrS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUShrSWide OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", 
"ElementSize": "ElementSize" }, "FPR = VSShrSWide OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VUShlSWide OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, FPR:$ShiftScalar": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VInsElement OpSize:#RegisterSize, OpSize:#ElementSize, u8:$DestIdx, u8:$SrcIdx, FPR:$DestVector, FPR:$SrcVector": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VInsGPR OpSize:#RegisterSize, OpSize:#ElementSize, u8:$DestIdx, FPR:$DestVector, GPR:$Src": { "TiedSource": 0, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VExtr OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper, u8:$Index": { "Desc": ["Concats two vector registers together and extracts a full width register from the element index", "Index is an element index. So it is offset by ElementSize argument", "op:", "TmpVector = concat(Upper:Lower)", "Dest = TmpVector >> (ElementSize * Index * 8); // Or can be thought of `concat(&TmpVector[Index], i128)`" ], "TiedSource": 1, "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VCMPEQ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VCMPGT OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "Desc": ["Vector compare signed greater than", "Each element is compared, if the result is true then the resulting element is ~0, else zero" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPEQ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPNEQ OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": 
"ElementSize" }, "FPR = VFCMPLT OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPGT OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPLE OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPORD OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFCMPUNO OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VTBL1 OpSize:#RegisterSize, FPR:$VectorTable, FPR:$VectorIndices": { "Desc": ["Does a vector table lookup from one register in to the destination", "Lookup is byte sized per byte element.", "Any index larger than what the registers provide will result in zero for that element", "Table is always treated as a 128bit register", "Indices matches destination size. Either 64bit or 128bit" ], "DestSize": "RegisterSize" }, "FPR = VTBL2 OpSize:#RegisterSize, FPR:$VectorTable1, FPR:$VectorTable2, FPR:$VectorIndices": { "Desc": ["Does a vector table lookup from two registers in to the destination", "Lookup is byte sized per byte element.", "Any index larger than what the registers provide will result in zero for that element", "Table is always treated as a two 128bit registers", "Indices matches destination size. Either 64bit or 128bit", "Careful about not using sequential table registers, will result in some moves if they aren't sequential." 
], "DestSize": "RegisterSize" }, "FPR = VTBX1 OpSize:#RegisterSize, FPR:$VectorSrcDst, FPR:$VectorTable, FPR:$VectorIndices": { "Desc": ["Does a vector table lookup from one register in to the destination", "Lookup is byte sized per byte element.", "Any index larger than what the registers provide will result in not modifying that element", "Table is always treated as a 128bit register", "Indices matches destination size. Either 64bit or 128bit" ], "TiedSource": 0, "DestSize": "RegisterSize" }, "FPR = VBSL OpSize:#RegisterSize, FPR:$VectorMask, FPR:$VectorTrue, FPR:$VectorFalse": { "Desc": ["Does a vector bitwise select.", "If the bit in the field is 1 then the corresponding bit is pulled from VectorTrue", "If the bit in the field is 0 then the corresponding bit is pulled from VectorFalse" ], "TiedSource": 0, "DestSize": "RegisterSize" }, "GPR = VPCMPESTRX FPR:$LHS, FPR:$RHS, GPR:$RAX, GPR:$RDX, u16:$Control": { "Desc": ["Performs intermediate behavior analogous to the x86 PCMPESTRI/PCMPESTRM instruction", "This will return the intermediate result of a PCMPESTR-type operation, but NOT the final", "result. This must be derived from the intermediate result", "NOTE: On top of returning the intermediate result, the returned value also combines the status", "flags into the upper 16-bits of the 32-bit result, as these can also be derived over the", "course of creating the intermediate result" ], "DestSize": "OpSize::i32Bit", "JITDispatch": false }, "GPR = VPCMPISTRX FPR:$LHS, FPR:$RHS, u8:$Control": { "Desc": ["Performs intermediate behavior analogous to the x86 PCMPISTRI/PCMPISTRM instruction", "This will return the intermediate result of a PCMPISTR-type operation, but NOT the final", "result. 
This must be derived from the intermediate result", "NOTE: On top of returning the intermediate result, the returned value also combines the status", "flags into the upper 16-bits of the 32-bit result, as these can also be derived over the", "course of creating the intermediate result" ], "DestSize": "OpSize::i32Bit", "JITDispatch": false }, "FPR = VFCADD OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, u16:$Rotate": { "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VFMLA OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (Vector1 * Vector2) + Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 2 }, "FPR = VFMLS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (Vector1 * Vector2) - Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 2 }, "FPR = VFNMLA OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (-Vector1 * Vector2) + Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 2 }, "FPR = VFNMLS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": { "Desc": [ "Dest = (-Vector1 * Vector2) - Addend", "This explicitly matches x86 FMA semantics because ARM semantics are mind-bending." 
], "DestSize": "RegisterSize", "ElementSize": "ElementSize", "TiedSource": 2 } }, "Conv": { "FPR = VCastFromGPR OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Src": { "Desc": ["Moves a GPR to a Vector register with zero extension to full length of the register.", "No conversion is done on the data as it moves register files" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VDupFromGPR OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Src": { "Desc": ["Broadcasts a value in a GPR into each ElementSize-sized element in a vector"], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = VLoadTwoGPRs GPR:$Lower, GPR:$Upper": { "Desc": ["Moves two 64-bit registers to a vector register optimally"], "DestSize": "OpSize::i128Bit", "ElementSize": "OpSize::i64Bit" }, "FPR = Float_FromGPR_S OpSize:#DstElementSize, OpSize:$SrcElementSize, GPR:$Src": { "Desc": ["Scalar op: Converts signed GPR to Scalar float", "Zeroes the upper bits of the vector register" ], "DestSize": "DstElementSize" }, "FPR = Float_FToF OpSize:#DstElementSize, OpSize:$SrcElementSize, FPR:$Scalar": { "Desc": ["Scalar op: Converts float from one size to another", "Zeroes the upper bits of the vector register" ], "DestSize": "DstElementSize" }, "FPR = Vector_SToF OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Vector op: Converts signed integer to same size float", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = Vector_FToS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": ["Vector op: Converts float to signed integer", "Rounding mode determined by host rounding mode" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = Vector_FToZS OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": "Vector op: Converts float to signed integer, rounding towards zero", "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = Vector_FToF OpSize:#RegisterSize,
OpSize:#DestElementSize, FPR:$Vector, OpSize:$SrcElementSize": { "Desc": "Vector op: Converts float from source element size to destination size (fp32<->fp64)", "DestSize": "RegisterSize", "ElementSize": "DestElementSize" }, "FPR = VFCVTL2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector": { "Desc": [ "Vector op: Converts float from source element size to destination size (fp32->fp64)", "Selecting from the high half of the register." ], "DestSize": "RegisterSize", "ElementSize": "ElementSize << 1", "EmitValidation": [ "RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\"" ] }, "FPR = VFCVTN2 OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$VectorLower, FPR:$VectorUpper": { "TiedSource": 0, "Desc": [ "Vector op: Converts float from source element size and inserting in to the high bits.", "Bottom half is untouched", "Narrowing to the element size below what is passed in.", "F64->F32, F32->F16" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize >> 1", "EmitValidation": [ "RegisterSize != FEXCore::IR::OpSize::i256Bit && \"What does 256-bit mean in this context?\"" ] }, "FPR = Vector_FToI OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, RoundType:$Round": { "Desc": ["Vector op: Rounds float to integral", "Rounding mode determined by argument" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = Vector_FToISized OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Vector, i1:$HostRound, OpSize:$IntSize": { "Desc": ["Vector op: Rounds float to sized integral", "Either host rounding or round-to-zero", "Rounding mode determined by argument" ], "DestSize": "RegisterSize", "ElementSize": "ElementSize" }, "FPR = Vector_F64ToI32 OpSize:#RegisterSize, FPR:$Vector, RoundType:$Round, i1:$EnsureZeroUpperHalf": { "Desc": ["Vector op: Rounds 64-bit float to 32-bit integral with round mode", "Matches CVTPD2DQ/CVTTPD2DQ behaviour" ], "DestSize": "RegisterSize", "ElementSize": "FEXCore::IR::OpSize::i32Bit" } }, 
"Crypto": { "FPR = VAESImc FPR:$Vector": { "Desc": "Does a stage of the inverse mix column transformation", "DestSize": "OpSize::i128Bit" }, "FPR = VAESEnc OpSize:#RegisterSize, FPR:$State, FPR:$Key, FPR:$ZeroReg": { "Desc": "Does a step of AES encryption", "DestSize": "RegisterSize" }, "FPR = VAESEncLast OpSize:#RegisterSize, FPR:$State, FPR:$Key, FPR:$ZeroReg": { "Desc": "Does the last step of AES encryption", "DestSize": "RegisterSize" }, "FPR = VAESDec OpSize:#RegisterSize, FPR:$State, FPR:$Key, FPR:$ZeroReg": { "Desc": "Does a step of AES decryption", "DestSize": "RegisterSize" }, "FPR = VAESDecLast OpSize:#RegisterSize, FPR:$State, FPR:$Key, FPR:$ZeroReg": { "Desc": "Does the last step of AES decryption", "DestSize": "RegisterSize" }, "FPR = VAESKeyGenAssist FPR:$Src, FPR:$KeyGenTBLSwizzle, FPR:$ZeroReg, u8:$RCON": { "Desc": "Assists in key generation", "DestSize": "OpSize::i128Bit" }, "FPR = VSha1H FPR:$Src": { "Desc": "Does vector scalar SHA1H instruction", "DestSize": "FEXCore::IR::OpSize::i32Bit" }, "FPR = VSha1C FPR:$Src1, FPR:$Src2, FPR:$Src3": { "Desc": "Does vector SHA1C instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha1M FPR:$Src1, FPR:$Src2, FPR:$Src3": { "Desc": "Does vector SHA1M instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha1P FPR:$Src1, FPR:$Src2, FPR:$Src3": { "Desc": "Does vector SHA1P instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha1SU1 FPR:$Src1, FPR:$Src2": { "Desc": "Does vector scalar SHA1H instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha256U0 FPR:$Src1, FPR:$Src2": { "Desc": "Does vector scalar VSha256U0 instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha256U1 FPR:$Src1, FPR:$Src2": { "Desc": "Does vector scalar VSha256U1 instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit" }, "FPR = VSha256H FPR:$Src1, FPR:$Src2, FPR:$Src3": { "Desc": 
"Does vector scalar VSha256H instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "FPR = VSha256H2 FPR:$Src1, FPR:$Src2, FPR:$Src3": { "Desc": "Does vector scalar VSha256H2 instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", "TiedSource": 0 }, "GPR = CRC32 GPR:$Src1, GPR:$Src2, OpSize:$SrcSize": { "Desc": ["CRC32 using polynomial 0x1EDC6F41" ], "DestSize": "OpSize::i32Bit" }, "FPR = PCLMUL OpSize:#RegisterSize, FPR:$Src1, FPR:$Src2, u8:$Selector": { "Desc": [ "Performs carryless multiplication of 64-bit elements depending on the selector.", "Selector = 0b00000000: Uses low 64-bit elements from both input vectors", "Selector = 0b00000001: Uses high 64-bit element from Src1 and low 64-bit element from Src2", "Selector = 0b00010000: Uses low 64-bit element from Src1 and high 64-bit element from Src2", "Selector = 0b00010001: Uses high 64-bit elements from both input vectors" ], "DestSize": "RegisterSize" } }, "F64": { "FPR = F64ATAN FPR:$Src1, FPR:$Src2": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64FPREM FPR:$Src1, FPR:$Src2": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64FPREM1 FPR:$Src1, FPR:$Src2": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64SCALE FPR:$Src1, FPR:$Src2": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64F2XM1 FPR:$Src": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64FYL2X FPR:$Src, FPR:$Src2": { "DestSize": "OpSize::i64Bit", "JITDispatch": false }, "FPR = F64TAN FPR:$Src": { "DestSize": "OpSize::i64Bit", "JITDispatch": true }, "FPR = F64SIN FPR:$Src": { "DestSize": "OpSize::i64Bit", "JITDispatch": true }, "FPR = F64COS FPR:$Src": { "DestSize": "OpSize::i64Bit", "JITDispatch": true }, "FPR:$Sin, FPR:$Cos = F64SINCOS FPR:$Src": { "DestSize": "OpSize::i64Bit", "HasSideEffects": true, "JITDispatch": false } }, "F80": { "GPR = SyncStackToSlow": { "Desc": [ "Synchronizes the virtual stack environment to the physical registers.", 
"Returns the current stack top." ], "X87": true, "HasSideEffects": true, "DestSize": "OpSize::i64Bit" }, "StackForceSlow": { "Desc": [ "Forces the slow path." ], "X87": true, "HasSideEffects": true }, "InitStack": { "Desc": [ "Initializes the stack by marking all tags as invalid and setting top to zero." ], "X87": true, "HasSideEffects": true }, "IncStackTop": { "Desc": [ "Increase stack top-pointer." ], "X87": true, "HasSideEffects": true }, "DecStackTop": { "Desc": [ "Decrease stack top-pointer." ], "X87": true, "HasSideEffects": true }, "InvalidateStack u8:$StackLocation": { "Desc": [ "Marks the value in TOP+$StackLocation as empty / invalid 0b11.", "If the StackLocation is 0xff, we invalidate all locations." ], "X87": true, "HasSideEffects": true }, "PushStack FPR:$X80Src, FPR:$OriginalValue, OpSize:$LoadSize": { "Desc": [ "Pushes the provided X80Src source on to the x87 stack.", "Tracks OriginalValue as the original value of X80Src. OriginalValue can be Invalid() in which case no tracking is done.", "Opsize is 128bit for F80 values, 64-bit for low precision.", "LoadSize the original load size, i.e. of size of OriginalValue.", "Float: 80-bit, 64-bit, 32-bit" ], "HasSideEffects": true, "X87": true }, "CopyPushStack u8:$StackLocation": { "Desc": [ "Pushes an element already on the stack onto the top." 
], "HasSideEffects": true, "X87": true }, "StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit" ], "HasSideEffects": true, "X87": true }, "StoreStackToStack u8:$StackLocation": { "Desc": [ "Takes the top value off the x87 stack and stores it to stack location TOP+StackLocation", "Float: 80-bit, 64-bit, or 32-bit", "Int: 64-bit, 32-bit, 16-bit" ], "HasSideEffects": true, "X87": true }, "PopStackDestroy": { "Desc": [ "Pops the top value off the stack but doesn't save it anywhere." ], "HasSideEffects": true, "X87": true }, "FPR = ReadStackValue u8:$StackLocation": { "Desc": [ "Reads a value off the stack at the offset" ], "DestSize": "OpSize::i128Bit", "X87": true }, "GPR = StackValidTag u8:$StackLocation": { "Desc": [ "Returns 1 if the value in location TOP+$StackLocation is valid, 0 otherwise." ], "DestSize": "OpSize::i32Bit", "X87": true }, "F80AddStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, "X87": true }, "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, "X87": true }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Subtracts the value in stack location TOP+$SrcStack2 from the value in stack location TOP+$SrcStack1.", "The result is stored in stack location TOP+$DstStack." 
], "HasSideEffects": true, "X87": true }, "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Subtracts the value $X80Src from the value in stack location TOP+$SrcStack.", "The result is stored in stack location TOP." ], "HasSideEffects": true, "X87": true }, "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ "Subtracts the value in stack location TOP+$SrcStack from the value $X80Src.", "The result is stored in stack location TOP." ], "HasSideEffects": true, "X87": true }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ "Subtracts the value in $X80Src1 from the value in $X80Src2.", "The result is returned.", "`FPR = X80Src2 - X80Src1`" ], "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, "X87": true }, "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." 
], "HasSideEffects": true, "X87": true }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Divides the value in stack location TOP+$SrcStack1 by the value in stack location TOP+$SrcStack2.", "The result is stored in stack location TOP+$DstStack.", "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, "X87": true }, "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Divides the value in stack location TOP+$SrcStack by the value $X80Src.", "The result is stored in stack location TOP and returned.", "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, "X87": true }, "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ "Divides the value X80Src by the value in stack location TOP+$SrcStack.", "The result is stored in stack location TOP.", "`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, "X87": true }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ "Divides the value in $X80Src1 by the value in $X80Src2.", "The result is returned.", "`FPR = X80Src1 / X80Src2`" ], "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80StackXchange u8:$SrcStack": { "Desc": [ "Exchanges the value at the top of the stack with the value at TOP+$SrcStack." ], "X87": true, "HasSideEffects": true }, "FPR = F80StackChangeSign": { "Desc": [ "Complements the sign bit of the value at the top of the stack.", "Returns the new value at the top of the stack." ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", "X87": true }, "FPR = F80StackAbs": { "Desc": [ "Clears the sign bit of the value at the top of the stack.", "Returns the new value at the top of the stack." 
], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", "X87": true }, "F80PTANStack": { "Desc": [ "Computes the approximate tangent of the source operand in register ST(0), stores the result in ST(0), and pushes a 1.0 onto the FPU register stack." ], "X87": true, "HasSideEffects": true }, "FPR = F80ATANStack": { "Desc": [ "Computes arctan(st1/st0) and stores it in st0. Then pops the stack." ], "DestSize": "OpSize::i128Bit", "X87": true, "HasSideEffects": true }, "FPR = F80ATAN FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80FPREMStack": { "X87": true, "HasSideEffects": true }, "FPR = F80FPREM FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80FPREM1Stack": { "X87": true, "HasSideEffects": true }, "FPR = F80FPREM1 FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80SCALEStack": { "X87": true, "HasSideEffects": true }, "FPR = F80SCALE FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80CVT OpSize:#Size, FPR:$X80Src": { "DestSize": "Size", "JITDispatch": false }, "GPR = F80CVTInt OpSize:#Size, FPR:$X80Src, i1:$Truncate": { "DestSize": "Size", "JITDispatch": false }, "FPR = F80CVTTo FPR:$X80Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80CVTToInt GPR:$Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80RoundStack": { "Desc": [ "Replaces the value at the top of the stack with its nearest integral value." 
], "X87": true, "HasSideEffects": true }, "FPR = F80Round FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80F2XM1Stack": { "X87": true, "HasSideEffects": true }, "FPR = F80F2XM1 FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80TAN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80SINStack": { "X87": true, "HasSideEffects": true }, "FPR = F80SIN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80COSStack": { "X87": true, "HasSideEffects": true }, "FPR = F80COS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR:$Sin, FPR:$Cos = F80SINCOS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "HasSideEffects": true, "JITDispatch": false }, "F80SINCOSStack": { "X87": true, "HasSideEffects": true }, "F80SQRTStack": { "X87": true, "HasSideEffects": true }, "FPR = F80SQRT FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80XTRACT_EXP FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80XTRACT_SIG FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "GPR = F80StackTest u8:$SrcStack": { "Desc": [ "Does comparison between value in stack at TOP + SrcStack" ], "DestSize": "OpSize::i32Bit", "X87": true }, "GPR = F80CmpStack u8:$SrcStack": { "Desc": [ "Does a scalar unordered compare between the value at the top of the stack and the value in stack position TOP+$SrcStack and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", "X87": true }, "GPR = F80CmpValue FPR:$X80Src": { "Desc": [ "Does a scalar unordered compare between the value at the top of the stack and $X80Src and stores the asked for flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", "HasSideEffects": true, "X87": true }, "GPR = F80Cmp FPR:$X80Src1, FPR:$X80Src2": { "Desc": ["Does a scalar 
unordered compare and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", "JITDispatch": false }, "FPR = F80BCDLoad FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80BCDStore FPR:$X80Src": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "FPR = F80FYL2XStack": { "Desc": [ "Computes ST1 * log2(ST0)", "Stores the result in ST1, and pops the top of the stack.", "Returns the new value at the top of the stack, i.e. the result of the operation." ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", "X87": true }, "FPR = F80FYL2X FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", "JITDispatch": false }, "F80VBSLStack OpSize:#RegisterSize, FPR:$VectorMask, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Does a vector bitwise select.", "If the bit in the field is 1 then the corresponding bit is pulled from VectorTrue", "If the bit in the field is 0 then the corresponding bit is pulled from VectorFalse", "Writes the result to the top of the stack." 
], "X87": true, "HasSideEffects": true } }, "Backend": { "Last": { "HasSideEffects": true } } } } ================================================ FILE: FEXCore/Source/Interface/IR/IRDumper.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ meta: ir|dumper ~ IR -> Text tags: ir|dumper $end_info$ */ #include "Interface/IR/IntrusiveIRList.h" #include "Interface/IR/RegisterAllocationData.h" #include #include #include #include #include #include #include #include #include namespace FEXCore::IR { #define IROP_GETNAME_IMPL #define IROP_GETRAARGS_IMPL #define IROP_REG_CLASSES_IMPL #define IROP_HASSIDEEFFECTS_IMPL #define IROP_SIZES_IMPL #define IROP_GETHASDEST_IMPL #include static void PrintArg(fextl::stringstream* out, const IRListView*, const SHA256Sum& Arg) { *out << fextl::fmt::format("sha256:{:02x}", fmt::join(Arg.data, "")); } static void PrintArg(fextl::stringstream* out, const IRListView*, uint64_t Arg) { *out << fextl::fmt::format("#{:#x}", Arg); } static void PrintArg(fextl::stringstream* out, const IRListView*, CondClass Arg) { if (Arg == CondClass::AL) { *out << "ALWAYS"; return; } static constexpr std::array CondNames = {"EQ", "NEQ", "UGE", "ULT", "MI", "PL", "VS", "VC", "UGT", "ULE", "SGE", "SLT", "SGT", "SLE", "TSTZ", "TSTNZ", "FLU", "FGE", "FLEU", "FGT", "FU", "FNU"}; *out << CondNames[FEXCore::ToUnderlying(Arg)]; } static void PrintArg(fextl::stringstream* out, const IRListView*, MemOffsetType Arg) { static constexpr std::array Names = { "SXTX", "UXTW", "SXTW", }; *out << Names[FEXCore::ToUnderlying(Arg)]; } static void PrintArg(fextl::stringstream* out, const IRListView*, RegClass Arg) { *out << [Arg] { switch (Arg) { case RegClass::Invalid: return "Invalid"; case RegClass::GPR: return "GPR"; case RegClass::GPRFixed: return "GPRFixed"; case RegClass::FPR: return "FPR"; case RegClass::FPRFixed: return "FPRFixed"; case RegClass::Complex: return "Complex"; } return ""; }(); } static void 
PrintArg(fextl::stringstream* out, const IRListView* IR, OrderedNodeWrapper Arg) { if (Arg.IsImmediate()) { auto PhyReg = PhysicalRegister(Arg); switch (PhyReg.AsRegClass()) { case RegClass::GPR: *out << "r"; break; case RegClass::GPRFixed: *out << "R"; break; case RegClass::FPR: *out << "v"; break; case RegClass::FPRFixed: *out << "V"; break; case RegClass::Complex: *out << "c"; break; case RegClass::Invalid: *out << "invalid"; break; default: *out << "unknown"; break; } if (PhyReg.AsRegClass() != RegClass::Invalid) { *out << std::dec << uint32_t(PhyReg.Reg); } return; } auto [CodeNode, IROp] = IR->at(Arg)(); const auto ArgID = Arg.ID(); if (ArgID.IsInvalid()) { *out << "%Invalid"; } else { *out << "%" << std::dec << ArgID; } if (GetHasDest(IROp->Op)) { auto ElementSize = IROp->ElementSize; uint32_t NumElements = 0; if (IROp->ElementSize == OpSize::iUnsized) { ElementSize = IROp->Size; } if (ElementSize != OpSize::iUnsized) { NumElements = IR::NumElements(IROp->Size, ElementSize); } *out << " i" << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << "v" << std::dec << NumElements; } } } static void PrintArg(fextl::stringstream* out, const IRListView*, FenceType Arg) { *out << [Arg] { switch (Arg) { case FenceType::Load: return "Loads"; case FenceType::Store: return "Stores"; case FenceType::LoadStore: return "LoadStores"; case FenceType::Inst: return "Instruction"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, RoundMode Arg) { *out << [Arg] { switch (Arg) { case RoundMode::Nearest: return "Nearest"; case RoundMode::NegInfinity: return "-Inf"; case RoundMode::PosInfinity: return "+Inf"; case RoundMode::TowardsZero: return "Towards Zero"; case RoundMode::Host: return "Host"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, ConstPad Arg) { *out << [Arg] { switch (Arg) { case ConstPad::NoPad: return "NoPad"; case ConstPad::DoPad: return "DoPad"; case ConstPad::AutoPad: 
return "AutoPad"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, NamedVectorConstant Arg) { *out << [Arg] { // clang-format off switch (Arg) { case NamedVectorConstant::NAMED_VECTOR_INCREMENTAL_U16_INDEX: return "u16_incremental_index"; case NamedVectorConstant::NAMED_VECTOR_INCREMENTAL_U16_INDEX_UPPER: return "u16_incremental_index_upper"; case NamedVectorConstant::NAMED_VECTOR_PADDSUBPS_INVERT: return "addsubps_invert"; case NamedVectorConstant::NAMED_VECTOR_PADDSUBPS_INVERT_UPPER: return "addsubps_invert_upper"; case NamedVectorConstant::NAMED_VECTOR_PADDSUBPD_INVERT: return "addsubpd_invert"; case NamedVectorConstant::NAMED_VECTOR_PADDSUBPD_INVERT_UPPER: return "addsubpd_invert_upper"; case NamedVectorConstant::NAMED_VECTOR_PSUBADDPS_INVERT: return "subaddps_invert"; case NamedVectorConstant::NAMED_VECTOR_PSUBADDPS_INVERT_UPPER: return "subaddps_invert_upper"; case NamedVectorConstant::NAMED_VECTOR_PSUBADDPD_INVERT: return "subaddpd_invert"; case NamedVectorConstant::NAMED_VECTOR_PSUBADDPD_INVERT_UPPER: return "subaddpd_invert_upper"; case NamedVectorConstant::NAMED_VECTOR_MOVMSKPS_SHIFT: return "movmskps_shift"; case NamedVectorConstant::NAMED_VECTOR_AESKEYGENASSIST_SWIZZLE: return "aeskeygenassist_swizzle"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_0110B: return "blendps_0110b"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_0111B: return "blendps_0111b"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_1001B: return "blendps_1001b"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_1011B: return "blendps_1011b"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_1101B: return "blendps_1101b"; case NamedVectorConstant::NAMED_VECTOR_BLENDPS_1110B: return "blendps_1110b"; case NamedVectorConstant::NAMED_VECTOR_MOVMASKB: return "movmaskb"; case NamedVectorConstant::NAMED_VECTOR_MOVMASKB_UPPER: return "movmaskb_upper"; case NamedVectorConstant::NAMED_VECTOR_ZERO: return "vectorzero"; case 
NamedVectorConstant::NAMED_VECTOR_X87_ONE: return "x87_1_0"; case NamedVectorConstant::NAMED_VECTOR_X87_LOG2_10: return "x87_log2_10"; case NamedVectorConstant::NAMED_VECTOR_X87_LOG2_E: return "x87_log2_e"; case NamedVectorConstant::NAMED_VECTOR_X87_PI: return "x87_pi"; case NamedVectorConstant::NAMED_VECTOR_X87_LOG10_2: return "x87_log10_2"; case NamedVectorConstant::NAMED_VECTOR_X87_LOG_2: return "x87_log2"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F32_I32: return "cvtmax_f32_i32"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F32_I32_UPPER: return "cvtmax_f32_i32_upper"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F32_I64: return "cvtmax_f32_i64"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F64_I32: return "cvtmax_f64_i32"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F64_I32_UPPER: return "cvtmax_f64_i32_upper"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F64_I64: return "cvtmax_f64_i64"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_I32: return "cvtmax_i32"; case NamedVectorConstant::NAMED_VECTOR_CVTMAX_I64: return "cvtmax_i64"; case NamedVectorConstant::NAMED_VECTOR_F80_SIGN_MASK: return "f80_sign_mask"; case NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K0: return "sha1rnds_k0"; case NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K1: return "sha1rnds_k1"; case NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K2: return "sha1rnds_k2"; case NamedVectorConstant::NAMED_VECTOR_SHA1RNDS_K3: return "sha1rnds_k3"; case NamedVectorConstant::NAMED_VECTOR_MAX: return ""; } return ""; // clang-format on }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, IndexNamedVectorConstant Arg) { *out << [Arg] { // clang-format off switch (Arg) { case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFLW: return "pshuflw"; case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFHW: return "pshufhw"; case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD: return "pshufd"; case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_SHUFPS: return "shufps"; case 
IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPS_MASK: return "dpps_mask"; case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_DPPD_MASK: return "dppd_mask"; case IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PBLENDW: return "pblendw"; case INDEXED_NAMED_VECTOR_MAX: return ""; } return ""; // clang-format on }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, OpSize Arg) { *out << [Arg] { switch (Arg) { case OpSize::iUnsized: return "Unsized"; case OpSize::i8Bit: return "i8"; case OpSize::i16Bit: return "i16"; case OpSize::i32Bit: return "i32"; case OpSize::i64Bit: return "i64"; case OpSize::f80Bit: return "f80"; case OpSize::i128Bit: return "i128"; case OpSize::i256Bit: return "i256"; case OpSize::iInvalid: return "Invalid"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, FloatCompareOp Arg) { *out << [Arg] { switch (Arg) { case FloatCompareOp::EQ: return "FEQ"; case FloatCompareOp::LT: return "FLT"; case FloatCompareOp::LE: return "FLE"; case FloatCompareOp::UNO: return "UNO"; case FloatCompareOp::NEQ: return "NEQ"; case FloatCompareOp::ORD: return "ORD"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, FEXCore::IR::BreakDefinition Arg) { *out << "{" << Arg.ErrorRegister << "."; *out << static_cast(Arg.Signal) << "."; *out << static_cast(Arg.TrapNumber) << "."; *out << static_cast(Arg.si_code) << "}"; } static void PrintArg(fextl::stringstream* out, const IRListView*, ShiftType Arg) { *out << [Arg] { switch (Arg) { case ShiftType::LSL: return "LSL"; case ShiftType::LSR: return "LSR"; case ShiftType::ASR: return "ASR"; case ShiftType::ROR: return "ROR"; } return ""; }(); } static void PrintArg(fextl::stringstream* out, const IRListView*, BranchHint Arg) { *out << [Arg] { switch (Arg) { case BranchHint::None: return "None"; case BranchHint::Call: return "Call"; case BranchHint::Return: return "Return"; case BranchHint::CheckTF: return "CheckTF"; } return ""; }(); } 
static void PrintArg(fextl::stringstream* out, const IRListView*, const std::array& Arg) { *out << fextl::fmt::format("{:02x}", fmt::join(Arg, "")); } void Dump(fextl::stringstream* out, const IRListView* IR) { auto HeaderOp = IR->GetHeader(); int8_t CurrentIndent = 0; auto AddIndent = [&out, &CurrentIndent]() { for (uint8_t i = 0; i < CurrentIndent; ++i) { *out << "\t"; } }; ++CurrentIndent; AddIndent(); *out << fextl::fmt::format("(%0) IRHeader %{}, #{:#x}, #{}, #{}\n", HeaderOp->Blocks.ID(), +HeaderOp->OriginalRIP, +HeaderOp->BlockCount, +HeaderOp->NumHostInstructions); for (auto [BlockNode, BlockHeader] : IR->GetBlocks()) { { auto BlockIROp = BlockHeader->C(); AddIndent(); *out << "(%" << IR->GetID(BlockNode) << ") " << "CodeBlock "; *out << "%" << BlockIROp->Begin.ID() << ", "; *out << "%" << BlockIROp->Last.ID() << std::endl; } ++CurrentIndent; for (auto [CodeNode, IROp] : IR->GetCode(BlockNode)) { const auto ID = IR->GetID(CodeNode); const auto Name = FEXCore::IR::GetName(IROp->Op); { AddIndent(); if (GetHasDest(IROp->Op)) { auto ElementSize = IROp->ElementSize; uint8_t NumElements = 0; if (IROp->ElementSize != OpSize::iUnsized) { ElementSize = IROp->Size; } if (ElementSize != OpSize::iUnsized) { NumElements = IR::NumElements(IROp->Size, ElementSize); } *out << "%" << std::dec << ID; auto PhyReg = PhysicalRegister(CodeNode); if (!PhyReg.IsInvalid()) { switch (PhyReg.AsRegClass()) { case RegClass::GPR: *out << "(r"; break; case RegClass::GPRFixed: *out << "(R"; break; case RegClass::FPR: *out << "(v"; break; case RegClass::FPRFixed: *out << "(V"; break; case RegClass::Complex: *out << "(complex"; break; case RegClass::Invalid: *out << "(invalid"; break; default: *out << "(unknown"; break; } if (PhyReg.AsRegClass() != RegClass::Invalid) { *out << std::dec << uint32_t(PhyReg.Reg) << ")"; } else { *out << ")"; } } *out << " i" << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << "v" << std::dec << NumElements; } *out << " = "; } else { 
auto ElementSize = IROp->ElementSize; if (IROp->ElementSize == OpSize::iUnsized) { ElementSize = IROp->Size; } uint32_t NumElements = 0; if (ElementSize != OpSize::iUnsized) { NumElements = IR::NumElements(IROp->Size, ElementSize); } *out << "(%" << std::dec << ID << ' '; *out << 'i' << std::dec << IR::OpSizeAsBits(ElementSize); if (NumElements > 1) { *out << 'v' << std::dec << NumElements; } *out << ") "; } *out << Name; #define IROP_ARGPRINTER_HELPER #include default: *out << ""; break; } //*out << " (" << std::dec << CodeNode->GetUses() << ")"; *out << "\n"; } } CurrentIndent = std::max(0, CurrentIndent - 1); } } } ================================================ FILE: FEXCore/Source/Interface/IR/IREmitter.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ meta: ir|emitter ~ C++ Functions to generate IR. See IR.json for spec. tags: ir|emitter $end_info$ */ #include "Interface/IR/IREmitter.h" #include #include #include #include #include namespace FEXCore::IR { static bool IsFragmentExit(FEXCore::IR::IROps Op) { switch (Op) { case OP_EXITFUNCTION: case OP_BREAK: return true; default: return false; } } bool IsBlockExit(FEXCore::IR::IROps Op) { switch (Op) { case OP_JUMP: case OP_CONDJUMP: return true; default: return IsFragmentExit(Op); } } RegClass IREmitter::WalkFindRegClass(Ref Node) { auto Class = GetOpRegClass(Node); switch (Class) { case RegClass::GPR: case RegClass::FPR: case RegClass::GPRFixed: case RegClass::FPRFixed: case RegClass::Invalid: return Class; default: break; } // Complex case, needs to be handled on an op by op basis uintptr_t DataBegin = DualListData.DataBegin(); FEXCore::IR::IROp_Header* IROp = Node->Op(DataBegin); switch (IROp->Op) { case IROps::OP_LOADREGISTER: { auto Op = IROp->C(); return Op->Class; break; } case IROps::OP_LOADCONTEXT: { auto Op = IROp->C(); return Op->Class; break; } case IROps::OP_LOADCONTEXTINDEXED: { auto Op = IROp->C(); return Op->Class; break; } case 
IROps::OP_FILLREGISTER: { auto Op = IROp->C(); return Op->Class; break; } case IROps::OP_LOADMEM: { auto Op = IROp->C(); return Op->Class; break; } case IROps::OP_LOADMEMTSO: { auto Op = IROp->C(); return Op->Class; break; } default: LOGMAN_MSG_A_FMT("Unhandled op type: {} {} in argument class validation", ToUnderlying(IROp->Op), GetOpName(Node)); break; } return RegClass::Invalid; } void IREmitter::ResetWorkingList() { DualListData.Reset(); CodeBlocks.clear(); CurrentWriteCursor = nullptr; // This is necessary since we do "null" pointer checks InvalidNode = reinterpret_cast(DualListData.ListAllocate(sizeof(OrderedNode))); memset(InvalidNode, 0, sizeof(OrderedNode)); CurrentCodeBlock = nullptr; } void IREmitter::ReplaceAllUsesWithRange(Ref Node, Ref NewNode, AllNodesIterator Begin, AllNodesIterator End) { uintptr_t ListBegin = DualListData.ListBegin(); auto NodeId = Node->Wrapped(ListBegin).ID(); while (Begin != End) { auto [RealNode, IROp] = Begin(); const uint8_t NumArgs = IR::GetArgs(IROp->Op); for (uint8_t i = 0; i < NumArgs; ++i) { if (IROp->Args[i].ID() == NodeId) { Node->RemoveUse(); NewNode->AddUse(); IROp->Args[i].NodeOffset = NewNode->Wrapped(ListBegin).NodeOffset; // We can stop searching once all uses of the node are gone. 
if (Node->NumUses == 0) { return; } } } ++Begin; } } void IREmitter::ReplaceNodeArgument(Ref Node, uint8_t Arg, Ref NewArg) { uintptr_t ListBegin = DualListData.ListBegin(); uintptr_t DataBegin = DualListData.DataBegin(); FEXCore::IR::IROp_Header* IROp = Node->Op(DataBegin); OrderedNodeWrapper OldArgWrapper = IROp->Args[Arg]; Ref OldArg = OldArgWrapper.GetNode(ListBegin); OldArg->RemoveUse(); NewArg->AddUse(); IROp->Args[Arg].NodeOffset = NewArg->Wrapped(ListBegin).NodeOffset; } void IREmitter::RemoveArgUses(Ref Node) { uintptr_t ListBegin = DualListData.ListBegin(); uintptr_t DataBegin = DualListData.DataBegin(); FEXCore::IR::IROp_Header* IROp = Node->Op(DataBegin); const uint8_t NumArgs = IR::GetArgs(IROp->Op); for (uint8_t i = 0; i < NumArgs; ++i) { auto ArgNode = IROp->Args[i].GetNode(ListBegin); ArgNode->RemoveUse(); } } void IREmitter::RemovePostRA(Ref Node) { Node->Unlink(DualListData.ListBegin()); } void IREmitter::Remove(Ref Node) { RemoveArgUses(Node); Node->Unlink(DualListData.ListBegin()); } IREmitter::IRPair IREmitter::CreateNewCodeBlockAfter(Ref insertAfter) { auto OldCursor = GetWriteCursor(); auto CodeNode = CreateCodeNode(); if (insertAfter) { LinkCodeBlocks(insertAfter, CodeNode); } else { LOGMAN_THROW_A_FMT(CurrentCodeBlock != nullptr, "CurrentCodeBlock must not be null here"); // Find last block auto LastBlock = CurrentCodeBlock; while (LastBlock->Header.Next.GetNode(DualListData.ListBegin()) != InvalidNode) { LastBlock = LastBlock->Header.Next.GetNode(DualListData.ListBegin()); } // Append it after the last block LinkCodeBlocks(LastBlock, CodeNode); } SetWriteCursor(OldCursor); return CodeNode; } void IREmitter::SetCurrentCodeBlock(Ref Node) { CurrentCodeBlock = Node; LOGMAN_THROW_A_FMT(Node->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Node wasn't codeblock. 
It was '{}'", IR::GetName(Node->Op(DualListData.DataBegin())->Op)); SetWriteCursor(Node->Op(DualListData.DataBegin())->CW()->Begin.GetNode(DualListData.ListBegin())); // Constants are pooled only within a single block. NrConstants = 0; } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/IREmitter.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" #include #include #include #include #include #include #include namespace FEXCore::IR { class IREmitter { public: IREmitter(FEXCore::Utils::IntrusivePooledAllocator& ThreadAllocator, bool SupportsTSOImm9) : DualListData {ThreadAllocator, 8 * 1024 * 1024} , SupportsTSOImm9(SupportsTSOImm9) {} virtual ~IREmitter() = default; void ReownOrClaimBuffer() { DualListData.ReownOrClaimBuffer(); // Reset the working list on new buffer. ResetWorkingList(); } void DelayedDisownBuffer() { DualListData.DelayedDisownBuffer(); } IRListView ViewIR() { return IRListView(&DualListData); } /** * @name IR allocation routines * * @{ */ RegClass WalkFindRegClass(Ref Node); // These inlining helpers are used by IRDefines.inc so define first. Ref InlineMem(OpSize Size, Ref Offset, MemOffsetType OffsetType, uint8_t& OffsetScale, bool TSO = false) { uint64_t Imm {}; if (OffsetType != MemOffsetType::SXTX || !IsValueConstant(WrapNode(Offset), &Imm)) { return Offset; } // The immediate may be scaled in the IR, we need to correct for that. Imm *= OffsetScale; // Signed immediate unscaled 9-bit range for both regular and LRCPC2 ops. bool IsSIMM9 = ((int64_t)Imm >= -256) && ((int64_t)Imm <= 255); IsSIMM9 &= (SupportsTSOImm9 || !TSO); // Extended offsets for regular loadstore only. 
LOGMAN_THROW_A_FMT(Size >= IR::OpSize::i8Bit && Size <= IR::OpSize::i256Bit, "Must be sized"); bool IsExtended = (Imm & (IR::OpSizeToSize(Size) - 1)) == 0 && Imm / IR::OpSizeToSize(Size) <= 4095; IsExtended &= !TSO; if (IsSIMM9 || IsExtended) { OffsetScale = 1; return _InlineConstant(Imm); } else { return Offset; } } #define DEF_INLINE(Type, Variable, Filter) \ Ref Inline##Type(OpSize Size, Ref Source) { \ uint64_t Variable; \ if (IsValueConstant(WrapNode(Source), &Variable) && (Filter)) { \ return _InlineConstant(Variable); \ } else { \ return Source; \ } \ } DEF_INLINE(Any, _, true) DEF_INLINE(Zero, X, X == 0) DEF_INLINE(AddSub, X, ARMEmitter::IsImmAddSub(X)) DEF_INLINE(LargeAddSub, X, ARMEmitter::IsImmAddSub(X) && Size >= OpSize::i32Bit); DEF_INLINE(Logical, X, ARMEmitter::Emitter::IsImmLogical(X, std::max((int)IR::OpSizeAsBits(Size), 32))); Ref InlineSubtractZero(OpSize Size, Ref Src1, Ref Src2) { // Only inline a zero if we won't inline the other source. return IsValueConstant(WrapNode(Src2)) ? Src1 : InlineZero(Size, Src1); } #undef DEF_INLINE // These handlers add cost to the constructor and destructor // If it becomes an issue then blow them away // GCC also generates some pretty atrocious code around these // Use Clang! 
#define IROP_ALLOCATE_HELPERS #define IROP_DISPATCH_HELPERS #include IRPair _Jump() { return _Jump(InvalidNode); } IRPair _CondJump(Ref ssa0, CondClass cond = CondClass::NEQ) { return _CondJump(ssa0, _Constant(0), InvalidNode, InvalidNode, cond, GetOpSize(ssa0)); } IRPair _CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClass cond = CondClass::NEQ) { return _CondJump(ssa0, _Constant(0), ssa1, ssa2, cond, GetOpSize(ssa0)); } IRPair _LoadContextGPR(OpSize ByteSize, uint32_t Offset) { return _LoadContext(ByteSize, RegClass::GPR, Offset); } IRPair _LoadContextFPR(OpSize ByteSize, uint32_t Offset) { return _LoadContext(ByteSize, RegClass::FPR, Offset); } IRPair _StoreContextGPR(OpSize ByteSize, Ref Value, uint32_t Offset) { return _StoreContext(ByteSize, RegClass::GPR, Value, Offset); } IRPair _StoreContextFPR(OpSize ByteSize, Ref Value, uint32_t Offset) { return _StoreContext(ByteSize, RegClass::FPR, Value, Offset); } IRPair _LoadContextGPRIndexed(Ref Index, OpSize ByteSize, uint32_t BaseOffset, uint32_t Stride) { return _LoadContextIndexed(Index, ByteSize, BaseOffset, Stride, RegClass::GPR); } IRPair _LoadContextFPRIndexed(Ref Index, OpSize ByteSize, uint32_t BaseOffset, uint32_t Stride) { return _LoadContextIndexed(Index, ByteSize, BaseOffset, Stride, RegClass::FPR); } IRPair _StoreContextGPRIndexed(Ref Value, Ref Index, OpSize ByteSize, uint32_t BaseOffset, uint32_t Stride) { return _StoreContextIndexed(Value, Index, ByteSize, BaseOffset, Stride, RegClass::GPR); } IRPair _StoreContextFPRIndexed(Ref Value, Ref Index, OpSize ByteSize, uint32_t BaseOffset, uint32_t Stride) { return _StoreContextIndexed(Value, Index, ByteSize, BaseOffset, Stride, RegClass::FPR); } IRPair _LoadMem(RegClass Class, OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) { return _LoadMem(Class, Size, ssa0, Invalid(), Align, MemOffsetType::SXTX, 1); } IRPair _LoadMemGPR(OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) { return _LoadMem(RegClass::GPR, Size, ssa0, Invalid(), Align, 
MemOffsetType::SXTX, 1); } IRPair _LoadMemGPR(OpSize Size, Ref Addr, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { return _LoadMem(RegClass::GPR, Size, Addr, Offset, Align, OffsetType, OffsetScale); } IRPair _LoadMemFPR(OpSize Size, Ref ssa0, OpSize Align = OpSize::i8Bit) { return _LoadMem(RegClass::FPR, Size, ssa0, Invalid(), Align, MemOffsetType::SXTX, 1); } IRPair _LoadMemFPR(OpSize Size, Ref Addr, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { return _LoadMem(RegClass::FPR, Size, Addr, Offset, Align, OffsetType, OffsetScale); } IRPair _StoreMem(RegClass Class, OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) { return _StoreMem(Class, Size, Value, Addr, Invalid(), Align, MemOffsetType::SXTX, 1); } IRPair _StoreMemGPR(OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) { return _StoreMem(RegClass::GPR, Size, Value, Addr, Invalid(), Align, MemOffsetType::SXTX, 1); } IRPair _StoreMemGPR(OpSize Size, Ref Value, Ref Addr, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { return _StoreMem(RegClass::GPR, Size, Value, Addr, Offset, Align, OffsetType, OffsetScale); } IRPair _StoreMemFPR(OpSize Size, Ref Addr, Ref Value, OpSize Align = OpSize::i8Bit) { return _StoreMem(RegClass::FPR, Size, Value, Addr, Invalid(), Align, MemOffsetType::SXTX, 1); } IRPair _StoreMemFPR(OpSize Size, Ref Value, Ref Addr, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { return _StoreMem(RegClass::FPR, Size, Value, Addr, Offset, Align, OffsetType, OffsetScale); } IRPair _StoreMemPairGPR(OpSize Size, Ref Value1, Ref Value2, Ref Addr, uint32_t Offset) { return _StoreMemPair(RegClass::GPR, Size, Value1, Value2, Addr, Offset); } IRPair _StoreMemPairFPR(OpSize Size, Ref Value1, Ref Value2, Ref Addr, uint32_t Offset) { return _StoreMemPair(RegClass::FPR, Size, Value1, Value2, Addr, Offset); } IRPair Select01(FEXCore::IR::OpSize CompareSize, CondClass Cond, OrderedNode* 
Cmp1, OrderedNode* Cmp2) { return _Select(OpSize::i64Bit, CompareSize, Cond, Cmp1, Cmp2, _InlineConstant(1), _InlineConstant(0)); } IRPair To01(FEXCore::IR::OpSize CompareSize, OrderedNode* Cmp1) { return Select01(CompareSize, CondClass::NEQ, Cmp1, Constant(0)); } IRPair _NZCVSelect01(CondClass Cond) { return _NZCVSelect(OpSize::i64Bit, Cond, _InlineConstant(1), _InlineConstant(0)); } Ref Addsub(IR::OpSize Size, IROps Op, IROps NegatedOp, Ref Src1, uint64_t Src2) { // Sign-extend the constant if (Size == OpSize::i32Bit) { Src2 = (int64_t)(int32_t)Src2; } // Negative constants need to be negated to inline. if (Src2 & (1ull << 63) && ARMEmitter::IsImmAddSub(-Src2)) { Op = NegatedOp; Src2 = -Src2; } auto Dest = _Add(Size, Src1, Constant(Src2)); Dest.first->Header.Op = Op; return Dest; } Ref Add(IR::OpSize Size, Ref Src1, uint64_t Src2) { return Addsub(Size, OP_ADD, OP_SUB, Src1, Src2); } Ref Sub(IR::OpSize Size, Ref Src1, uint64_t Src2) { return Addsub(Size, OP_SUB, OP_ADD, Src1, Src2); } Ref AddWithFlags(IR::OpSize Size, Ref Src1, uint64_t Src2) { return Addsub(Size, OP_ADDWITHFLAGS, OP_SUBWITHFLAGS, Src1, Src2); } Ref SubWithFlags(IR::OpSize Size, Ref Src1, uint64_t Src2) { return Addsub(Size, OP_SUBWITHFLAGS, OP_ADDWITHFLAGS, Src1, Src2); } #define DEF_ADDSUB(Op) \ Ref Op(IR::OpSize Size, Ref Src1, Ref Src2) { \ uint64_t Constant; \ if (IsValueConstant(WrapNode(Src2), &Constant)) { \ return Op(Size, Src1, Constant); \ } else { \ return _##Op(Size, Src1, Src2); \ } \ } DEF_ADDSUB(Add) DEF_ADDSUB(Sub) DEF_ADDSUB(AddWithFlags) DEF_ADDSUB(SubWithFlags) struct ConstantData { int64_t Value; ConstPad Pad; int32_t MaxBytes; [[nodiscard]] auto operator<=>(const ConstantData&) const noexcept = default; }; ConstantData Constants[32]; Ref ConstantRefs[32]; uint32_t NrConstants; Ref Constant(int64_t Value, ConstPad Pad = IR::ConstPad::NoPad, int32_t MaxBytes = 0) { const ConstantData Data { .Value = Value, .Pad = Pad, .MaxBytes = MaxBytes, }; // Search for the constant in the 
pool. for (unsigned i = 0; i < std::min(NrConstants, 32u); ++i) { if (Constants[i] == Data) { return ConstantRefs[i]; } } // Otherwise, materialize a fresh constant and pool it. Ref R = _Constant(Value, Pad, MaxBytes); unsigned i = (NrConstants++) & 31; Constants[i] = Data; ConstantRefs[i] = R; return R; } Ref Invalid() { return InvalidNode; } void SetJumpTarget(IR::IROp_Jump* Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting Jump target to %{} {}", Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op->Header.Args[0].NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } void SetTrueJumpTarget(IR::IROp_CondJump* Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting CondJump target to %{} {}", Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op->TrueBlock.NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } void SetFalseJumpTarget(IR::IROp_CondJump* Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting CondJump target to %{} {}", Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op->FalseBlock.NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } void SetJumpTarget(IRPair Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting Jump target to %{} {}", Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op.first->Header.Args[0].NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } void SetTrueJumpTarget(IRPair Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting CondJump target to %{} {}", 
Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op.first->TrueBlock.NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } void SetFalseJumpTarget(IRPair Op, Ref Target) { LOGMAN_THROW_A_FMT(Target->Op(DualListData.DataBegin())->Op == OP_CODEBLOCK, "Tried setting CondJump target to %{} {}", Target->Wrapped(DualListData.ListBegin()).ID(), IR::GetName(Target->Op(DualListData.DataBegin())->Op)); Op.first->FalseBlock.NodeOffset = Target->Wrapped(DualListData.ListBegin()).NodeOffset; } /** @} */ RegClass WalkFindRegClass(OrderedNodeWrapper ssa) { Ref RealNode = ssa.GetNode(DualListData.ListBegin()); return WalkFindRegClass(RealNode); } bool IsValueConstant(OrderedNodeWrapper ssa, uint64_t* Constant = nullptr) { Ref RealNode = ssa.GetNode(DualListData.ListBegin()); FEXCore::IR::IROp_Header* IROp = RealNode->Op(DualListData.DataBegin()); if (IROp->Op == OP_CONSTANT) { auto Op = IROp->C(); if (Constant) { *Constant = Op->Constant; } return true; } return false; } bool IsValueInlineConstant(OrderedNodeWrapper ssa) { Ref RealNode = ssa.GetNode(DualListData.ListBegin()); FEXCore::IR::IROp_Header* IROp = RealNode->Op(DualListData.DataBegin()); if (IROp->Op == OP_INLINECONSTANT) { return true; } return false; } FEXCore::IR::IROp_Header* GetOpHeader(OrderedNodeWrapper ssa) { Ref RealNode = ssa.GetNode(DualListData.ListBegin()); return RealNode->Op(DualListData.DataBegin()); } Ref UnwrapNode(OrderedNodeWrapper ssa) { return ssa.GetNode(DualListData.ListBegin()); } OrderedNodeWrapper WrapNode(Ref node) { return node->Wrapped(DualListData.ListBegin()); } NodeIterator GetIterator(OrderedNodeWrapper wrapper) { return NodeIterator(DualListData.ListBegin(), DualListData.DataBegin(), wrapper); } void ReplaceAllUsesWithRange(Ref Node, Ref NewNode, AllNodesIterator Begin, AllNodesIterator End); void ReplaceUsesWithAfter(Ref Node, Ref NewNode, AllNodesIterator After) { ++After; ReplaceAllUsesWithRange(Node, NewNode, 
After, AllNodesIterator(DualListData.ListBegin(), DualListData.DataBegin())); } void ReplaceUsesWithAfter(Ref Node, Ref NewNode, Ref After) { auto Wrapped = After->Wrapped(DualListData.ListBegin()); AllNodesIterator It = AllNodesIterator(DualListData.ListBegin(), DualListData.DataBegin(), Wrapped); ReplaceUsesWithAfter(Node, NewNode, It); } void ReplaceNodeArgument(Ref Node, uint8_t Arg, Ref NewArg); void Remove(Ref Node); void RemovePostRA(Ref Node); void CopyData(const IREmitter& rhs) { LOGMAN_THROW_A_FMT(rhs.DualListData.DataBackingSize() <= DualListData.DataBackingSize(), "Trying to take ownership of data that is too " "large"); LOGMAN_THROW_A_FMT(rhs.DualListData.ListBackingSize() <= DualListData.ListBackingSize(), "Trying to take ownership of data that is too " "large"); DualListData.CopyData(rhs.DualListData); InvalidNode = rhs.InvalidNode->Wrapped(rhs.DualListData.ListBegin()).GetNode(DualListData.ListBegin()); CurrentWriteCursor = rhs.CurrentWriteCursor; CodeBlocks = rhs.CodeBlocks; for (auto& CodeBlock : CodeBlocks) { CodeBlock = CodeBlock->Wrapped(rhs.DualListData.ListBegin()).GetNode(DualListData.ListBegin()); } } void SetWriteCursor(Ref Node) { CurrentWriteCursor = Node; } // Set cursor to write before Node void SetWriteCursorBefore(Ref Node) { auto IR = ViewIR(); auto Before = IR.at(Node); --Before; SetWriteCursor((*Before).Node); } Ref GetWriteCursor() { return CurrentWriteCursor; } Ref GetCurrentBlock() { return CurrentCodeBlock; } /** * @brief This creates an orphaned code node * The IROp backing is in the correct list but the OrderedNode lives outside of the list * * XXX: This is because we don't want code blocks to interleave with current instruction IR ops currently * We can change this behaviour once we remove the old BeginBlock/EndBlock types * * @return OrderedNode */ IRPair CreateCodeNode(bool EntryPoint = false, uint32_t GuestEntryOffset = 0) { SetWriteCursor(nullptr); // Orphan from any previous nodes auto ID = 
ViewIR().GetHeader()->BlockCount++; auto CodeNode = _CodeBlock(InvalidNode, InvalidNode, ID, EntryPoint, GuestEntryOffset); CodeBlocks.emplace_back(CodeNode); SetWriteCursor(nullptr); // Orphan from any future nodes auto Begin = _BeginBlock(CodeNode); CodeNode.first->Begin = Begin.Node->Wrapped(DualListData.ListBegin()); auto EndBlock = _EndBlock(CodeNode); CodeNode.first->Last = EndBlock.Node->Wrapped(DualListData.ListBegin()); return CodeNode; } /** * @name Links codeblocks together * Codeblocks are singly linked so we need to walk the list forward if the linked block isn't isn't the last * * eq. * CodeNode->Next -> Next * to * CodeNode->Next -> New -> Next * * @{ */ /** @} */ void LinkCodeBlocks(Ref CodeNode, Ref Next) { [[maybe_unused]] auto CurrentIROp = CodeNode->Op(DualListData.DataBegin())->CW(); #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED LOGMAN_THROW_A_FMT(CurrentIROp->Header.Op == IROps::OP_CODEBLOCK, "Invalid"); #endif CodeNode->append(DualListData.ListBegin(), Next); } IRPair CreateNewCodeBlockAtEnd() { return CreateNewCodeBlockAfter(nullptr); } IRPair CreateNewCodeBlockAfter(Ref insertAfter); void SetCurrentCodeBlock(Ref Node); protected: void RemoveArgUses(Ref Node); Ref CreateNode(IROp_Header* Op) { uintptr_t ListBegin = DualListData.ListBegin(); size_t Size = sizeof(OrderedNode); void* Ptr = DualListData.ListAllocate(Size); Ref Node = new (Ptr) OrderedNode(); Node->Header.Value.SetOffset(DualListData.DataBegin(), reinterpret_cast(Op)); if (CurrentWriteCursor) { CurrentWriteCursor->append(ListBegin, Node); } CurrentWriteCursor = Node; return Node; } Ref GetNode(uint32_t SSANode) { uintptr_t ListBegin = DualListData.ListBegin(); Ref Node = reinterpret_cast(ListBegin + SSANode * sizeof(OrderedNode)); return Node; } Ref EmplaceOrphanedNode(Ref OldNode) { size_t Size = sizeof(OrderedNode); Ref Ptr = reinterpret_cast(DualListData.ListAllocate(Size)); memcpy(Ptr, OldNode, Size); return Ptr; } // MMX State can be either MMX (for 64bit) or x87 FPU 
(for 80bit) enum { MMXState_MMX, MMXState_X87 } MMXState = MMXState_MMX; // Overriden by dispatcher, stubbed for IR tests virtual void RecordX87Use() {} virtual void ChgStateX87_MMX() {} virtual void ChgStateMMX_X87() {} virtual void SaveNZCV(IROps Op) {} Ref CurrentWriteCursor = nullptr; // These could be combined with a little bit of work to be more efficient with memory usage. Isn't a big deal DualIntrusiveAllocatorThreadPool DualListData; Ref InvalidNode {}; Ref CurrentCodeBlock {}; fextl::vector CodeBlocks; uint64_t Entry {}; bool SupportsTSOImm9 {}; private: void ResetWorkingList(); }; } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/IntrusiveIRList.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Interface/IR/IR.h" #include #include #include #include #include #include #include namespace FEXCore::IR { /** * @brief This is purely an intrusive allocator * This doesn't support any form of ordering at all * Just provides a chunk of memory for allocating IR nodes from * * Can potentially support reallocation if we are smart and make sure to invalidate anything holding a true pointer */ class DualIntrusiveAllocator { public: [[nodiscard]] bool DataCheckSize(size_t Size) const { size_t NewOffset = DataCurrentOffset + Size; return NewOffset <= MemorySize; } [[nodiscard]] bool ListCheckSize(size_t Size) const { size_t NewOffset = ListCurrentOffset + Size; return NewOffset <= MemorySize; } [[nodiscard]] void* DataAllocate(size_t Size) { LOGMAN_THROW_A_FMT(DataCheckSize(Size), "Ran out of space in DualIntrusiveAllocator during allocation"); size_t NewOffset = DataCurrentOffset + Size; uintptr_t NewPointer = Data + DataCurrentOffset; DataCurrentOffset = NewOffset; return reinterpret_cast(NewPointer); } [[nodiscard]] void* ListAllocate(size_t Size) { LOGMAN_THROW_A_FMT(ListCheckSize(Size), "Ran out of space in DualIntrusiveAllocator during allocation"); 
size_t NewOffset = ListCurrentOffset + Size; uintptr_t NewPointer = List + ListCurrentOffset; ListCurrentOffset = NewOffset; return reinterpret_cast(NewPointer); } [[nodiscard]] size_t DataSize() const { return DataCurrentOffset; } [[nodiscard]] size_t DataBackingSize() const { return MemorySize; } [[nodiscard]] size_t ListSize() const { return ListCurrentOffset; } [[nodiscard]] size_t ListBackingSize() const { return MemorySize; } [[nodiscard]] uintptr_t DataBegin() const { return Data; } [[nodiscard]] uintptr_t ListBegin() const { return List; } void Reset() { DataCurrentOffset = 0; ListCurrentOffset = 0; } void CopyData(const DualIntrusiveAllocator& rhs) { DataCurrentOffset = rhs.DataCurrentOffset; ListCurrentOffset = rhs.ListCurrentOffset; memcpy(reinterpret_cast(Data), reinterpret_cast(rhs.Data), DataCurrentOffset); memcpy(reinterpret_cast(List), reinterpret_cast(rhs.List), ListCurrentOffset); } protected: DualIntrusiveAllocator(size_t Size) : MemorySize {Size} {} uintptr_t Data {}; uintptr_t List {}; size_t DataCurrentOffset {0}; size_t ListCurrentOffset {0}; size_t MemorySize {}; }; class DualIntrusiveAllocatorMalloc final : public DualIntrusiveAllocator { public: DualIntrusiveAllocatorMalloc(size_t Size) : DualIntrusiveAllocator {Size} { Data = reinterpret_cast(FEXCore::Allocator::malloc(Size * 2)); List = reinterpret_cast(Data + Size); } ~DualIntrusiveAllocatorMalloc() { FEXCore::Allocator::free(reinterpret_cast(Data)); } }; class DualIntrusiveAllocatorThreadPool final : public DualIntrusiveAllocator { public: DualIntrusiveAllocatorThreadPool(FEXCore::Utils::IntrusivePooledAllocator& ThreadAllocator, size_t Size) : DualIntrusiveAllocator {Size} , PoolObject {ThreadAllocator, Size * 2} {} void ReownOrClaimBuffer() { Data = PoolObject.ReownOrClaimBuffer(); List = Data + MemorySize; } void DelayedDisownBuffer() { PoolObject.DelayedDisownBuffer(); } private: Utils::PoolBufferWithTimedRetirement PoolObject; }; class IRListView final { public: IRListView() = 
delete; IRListView(DualIntrusiveAllocator* Data) : IRListView(reinterpret_cast(Data->DataBegin()), reinterpret_cast(Data->ListBegin()), Data->DataSize(), Data->ListSize()) {} IRListView(IRListView* Old) : IRListView(Old->IRDataInternal, Old->ListDataInternal, Old->DataSize, Old->ListSize) {} IRListView(void* IRData_, void* ListData_, size_t DataSize_, size_t ListSize_) : IRDataInternal(IRData_) , ListDataInternal(ListData_) , DataSize(DataSize_) , ListSize(ListSize_) {} [[nodiscard]] size_t GetInlineSize() const { static_assert(sizeof(*this) == 32); return sizeof(*this) + DataSize + ListSize; } [[nodiscard]] size_t GetDataSize() const { return DataSize; } [[nodiscard]] size_t GetListSize() const { return ListSize; } [[nodiscard]] size_t GetSSACount() const { return ListSize / sizeof(OrderedNode); } [[nodiscard]] NodeID GetID(const Ref Node) const { return Node->Wrapped(GetListData()).ID(); } [[nodiscard]] Ref GetHeaderNode() const { OrderedNodeWrapper Wrapped; Wrapped.NodeOffset = sizeof(OrderedNode); return Wrapped.GetNode(GetListData()); } [[nodiscard]] IROp_IRHeader* GetHeader() const { return GetOp(GetHeaderNode()); } [[nodiscard]] unsigned PostRA() const { return GetHeader()->PostRA; } [[nodiscard]] unsigned SpillSlots() const { return GetHeader()->SpillSlots; } template [[nodiscard]] T* GetOp(Ref Node) const { auto OpHeader = Node->Op(GetData()); auto Op = OpHeader->template CW(); // If we are casting to something narrower than just the header, check the opcode. if constexpr (!std::is_same::value) { LOGMAN_THROW_A_FMT(Op->OPCODE == Op->Header.Op, "Expected Node to be '{}'. Found '{}' instead", GetName(Op->OPCODE), GetName(Op->Header.Op)); } return Op; } template [[nodiscard]] T* GetOp(OrderedNodeWrapper Wrapper) const { auto Node = Wrapper.GetNode(GetListData()); return GetOp(Node); } [[nodiscard]] Ref GetNode(OrderedNodeWrapper Wrapper) const { return Wrapper.GetNode(GetListData()); } ///< Gets an OrderedNode from the IRListView as an OrderedNodeWrapper. 
[[nodiscard]] OrderedNodeWrapper WrapNode(Ref Node) const { return Node->Wrapped(GetListData()); } private: struct BlockRange { using iterator = NodeIterator; const IRListView* View; BlockRange(const IRListView* parent) : View(parent) {}; [[nodiscard]] iterator begin() const noexcept { auto Header = View->GetHeader(); return iterator(View->GetListData(), View->GetData(), Header->Blocks); } [[nodiscard]] iterator end() const noexcept { return iterator(View->GetListData(), View->GetData()); } }; struct CodeRange { using iterator = NodeIterator; const IRListView* View; const OrderedNodeWrapper BlockWrapper; CodeRange(const IRListView* parent, OrderedNodeWrapper block) : View(parent) , BlockWrapper(block) {}; [[nodiscard]] iterator begin() const noexcept { auto Block = View->GetOp(BlockWrapper); return iterator(View->GetListData(), View->GetData(), Block->Begin); } [[nodiscard]] iterator end() const noexcept { return iterator(View->GetListData(), View->GetData()); } }; struct AllCodeRange { using iterator = AllNodesIterator; // Diffrent Iterator const IRListView* View; AllCodeRange(const IRListView* parent) : View(parent) {}; [[nodiscard]] iterator begin() const noexcept { auto Header = View->GetHeader(); return iterator(View->GetListData(), View->GetData(), Header->Blocks); } [[nodiscard]] iterator end() const noexcept { return iterator(View->GetListData(), View->GetData()); } }; public: using iterator = NodeIterator; [[nodiscard]] BlockRange GetBlocks() const { return BlockRange(this); } [[nodiscard]] CodeRange GetCode(const Ref block) const { return CodeRange(this, block->Wrapped(GetListData())); } [[nodiscard]] AllCodeRange GetAllCode() const { return AllCodeRange(this); } [[nodiscard]] iterator begin() const noexcept { OrderedNodeWrapper Wrapped; Wrapped.NodeOffset = sizeof(OrderedNode); return iterator(GetListData(), GetData(), Wrapped); } /** * @brief This is not an iterator that you can reverse iterator through! 
*
 * @return Our iterator sentinel to ensure ending correctly
 */
[[nodiscard]] iterator end() const noexcept {
  // Node offset 0 is used as the end sentinel.
  OrderedNodeWrapper Wrapped;
  Wrapped.NodeOffset = 0;
  return iterator(GetListData(), GetData(), Wrapped);
}

/**
 * @brief Convert an OrderedNodeWrapper to an iterator that we can iterate over
 * @return Iterator for this op
 */
[[nodiscard]] iterator at(OrderedNodeWrapper Wrapped) const noexcept {
  return iterator(GetListData(), GetData(), Wrapped);
}

// Same as above, addressing the node by ID: the offset is ID.Value * sizeof(OrderedNode).
[[nodiscard]] iterator at(NodeID ID) const noexcept {
  OrderedNodeWrapper Wrapped;
  Wrapped.NodeOffset = ID.Value * sizeof(OrderedNode);
  return iterator(GetListData(), GetData(), Wrapped);
}

// Same as above, addressing the node by reference.
[[nodiscard]] iterator at(const Ref Node) const noexcept {
  const auto ListData = GetListData();
  auto Wrapped = Node->Wrapped(ListData);
  return iterator(ListData, GetData(), Wrapped);
}

// Base address of the IR data: the external buffer when one is set, otherwise the inline storage.
[[nodiscard]] uintptr_t GetData() const {
  return reinterpret_cast(IRDataInternal ? IRDataInternal : InlineData);
}

// Base address of the node list: the external buffer when one is set, otherwise the inline
// storage starting DataSize bytes in.
[[nodiscard]] uintptr_t GetListData() const {
  return reinterpret_cast(ListDataInternal ? ListDataInternal : &InlineData[DataSize]);
}

private:
void* IRDataInternal;
void* ListDataInternal;
size_t DataSize;
size_t ListSize;
uint8_t InlineData[0];
};
} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/IR/PassManager.cpp
================================================
// SPDX-License-Identifier: MIT
/*
$info$
meta: ir|opts ~ IR to IR Optimization
tags: ir|opts
desc: Defines which passes are run, and runs them
$end_info$
*/

#include "Interface/Context/Context.h"
#include "Interface/IR/PassManager.h"
#include "Interface/IR/Passes.h"
#include "Interface/IR/Passes/RegisterAllocationPass.h"

#include
#include

namespace FEXCore::IR {
class IREmitter;

// Inserts IR dumper passes into the pass list according to the PassManagerDumpIR bitmask:
//   BEFOREOPT  - one dumper at the very front.
//   BEFOREPASS - a dumper in front of each pass (the first one is skipped when BEFOREOPT
//                already placed a dumper there).
//   AFTERPASS  - a dumper behind each pass.
//   AFTEROPT   - one dumper at the very end, unless AFTERPASS is set.
// Does nothing when the bitmask is zero.
// NOTE(review): the `continue` taken for SkipFirstBefore also skips the AFTERPASS insertion for
// the first pass. With more than one pass the next BEFOREPASS dumper lands in the same spot, but
// with a single pass no dumper follows it - confirm this is intended.
void PassManager::Finalize() {
  if (!PassManagerDumpIR()) {
    // Not configured to dump any IR, just return.
    return;
  }

  auto it = Passes.begin();
  // Walk the passes and add them where asked.
  if (PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::BEFOREOPT) {
    // Insert at the start.
    it = InsertAt(it, Debug::CreateIRDumper());
    ++it; // Skip what we inserted.
  }

  if ((PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::BEFOREPASS) ||
      (PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::AFTERPASS)) {

    bool SkipFirstBefore = PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::BEFOREOPT;
    // InsertAt returns a fresh iterator to the inserted element, so `it` is re-assigned after
    // every insertion instead of being reused.
    for (; it != Passes.end();) {
      if (PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::BEFOREPASS) {
        if (SkipFirstBefore) {
          // If we need to skip the first one, then continue.
          SkipFirstBefore = false;
          ++it;
          continue;
        }

        // Insert before
        it = InsertAt(it, Debug::CreateIRDumper());
        ++it; // Skip what we inserted.
      }

      ++it; // Skip current pass.

      if (PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::AFTERPASS) {
        // Insert after
        it = InsertAt(it, Debug::CreateIRDumper());
        ++it; // Skip what we inserted.
      }
    }
  }

  if (PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::AFTEROPT) {
    if (!(PassManagerDumpIR() & FEXCore::Config::PassManagerDumpIR::AFTERPASS)) {
      // Insert final IRDumper.
      InsertAt(Passes.end(), Debug::CreateIRDumper());
    }
  }
}

// Registers the default optimization passes unless the O0 config option disables them.
void PassManager::AddDefaultPasses(FEXCore::Context::ContextImpl* ctx) {
  FEX_CONFIG_OPT(DisablePasses, O0);

  if (!DisablePasses()) {
    InsertPass(CreateX87StackOptimizationPass(ctx->HostFeatures, ctx->Config.Is64BitMode ?
IR::OpSize::i64Bit : IR::OpSize::i32Bit)); InsertPass(CreateDeadFlagCalculationEliminination()); } } void PassManager::AddDefaultValidationPasses() { #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED InsertValidationPass(Validation::CreateIRValidation(), "IRValidation"); #endif } void PassManager::InsertRegisterAllocationPass(FEXCore::Context::ContextImpl* ctx) { InsertPass(IR::CreateRegisterAllocationPass(&ctx->CPUID), "RA"); } void PassManager::Run(IREmitter* IREmit) { FEXCORE_PROFILE_SCOPED("PassManager::Run"); for (const auto& Pass : Passes) { Pass->Run(IREmit); } #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED for (const auto& Pass : ValidationPasses) { Pass->Run(IREmit); } #endif } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/PassManager.h ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|opts $end_info$ */ #pragma once #include #include #include #include #include #include #include #include namespace FEXCore::Context { class ContextImpl; } namespace FEXCore::HLE { class SyscallHandler; } namespace FEXCore::IR { class PassManager; class IREmitter; class Pass { public: virtual ~Pass() = default; virtual void Run(IREmitter* IREmit) = 0; void RegisterPassManager(PassManager* _Manager) { Manager = _Manager; } protected: PassManager* Manager {}; }; class PassManager final { public: void AddDefaultPasses(FEXCore::Context::ContextImpl* ctx); void AddDefaultValidationPasses(); Pass* InsertPass(fextl::unique_ptr Pass, fextl::string Name = "") { auto PassPtr = InsertAt(Passes.end(), std::move(Pass))->get(); if (!Name.empty()) { NameToPassMaping[Name] = PassPtr; } return PassPtr; } void InsertRegisterAllocationPass(FEXCore::Context::ContextImpl* ctx); void Run(IREmitter* IREmit); bool HasPass(fextl::string Name) const { return NameToPassMaping.contains(Name); } template T* GetPass(fextl::string Name) { return dynamic_cast(NameToPassMaping[Name]); } 
Pass* GetPass(fextl::string Name) { return NameToPassMaping[Name]; } void RegisterSyscallHandler(FEXCore::HLE::SyscallHandler* Handler) { SyscallHandler = Handler; } void Finalize(); protected: FEXCore::HLE::SyscallHandler* SyscallHandler {}; private: using PassArrayType = fextl::vector>; PassArrayType::iterator InsertAt(PassArrayType::iterator pos, fextl::unique_ptr Pass) { Pass->RegisterPassManager(this); return Passes.insert(pos, std::move(Pass)); } PassArrayType Passes; fextl::unordered_map NameToPassMaping; #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED fextl::vector> ValidationPasses; void InsertValidationPass(fextl::unique_ptr Pass, fextl::string Name = "") { Pass->RegisterPassManager(this); auto PassPtr = ValidationPasses.emplace_back(std::move(Pass)).get(); if (!Name.empty()) { NameToPassMaping[Name] = PassPtr; } } #endif FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); FEX_CONFIG_OPT(PassManagerDumpIR, PASSMANAGERDUMPIR); }; } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/Passes/IRDumperPass.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|debug desc: Prints IR $end_info$ */ #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" #include "Interface/IR/Passes/RegisterAllocationPass.h" #include "Interface/Core/OpcodeDispatcher.h" #include namespace FEXCore::IR::Debug { class IRDumper final : public FEXCore::IR::Pass { public: IRDumper(); void Run(IREmitter* IREmit) override; private: FEX_CONFIG_OPT(DumpIR, DUMPIR); bool DumpToFile {}; bool DumpToLog {}; }; IRDumper::IRDumper() { const auto& DumpIRStr = DumpIR(); if (DumpIRStr == "stderr" || DumpIRStr == "stdout" || DumpIRStr == "no") { // Intentionally do nothing } else if (DumpIRStr == "server") { DumpToLog = true; } else { DumpToFile = true; } } void IRDumper::Run(IREmitter* IREmit) { FEXCore::File::File FD {}; if (DumpIR() == "stderr") { FD 
= FEXCore::File::File::GetStdERR(); } else if (DumpIR() == "stdout") { FD = FEXCore::File::File::GetStdOUT(); } auto IR = IREmit->ViewIR(); auto HeaderOp = IR.GetHeader(); LOGMAN_THROW_A_FMT(HeaderOp->Header.Op == OP_IRHEADER, "First op wasn't IRHeader"); // DumpIRStr might be no if not dumping but ShouldDump is set in OpDisp if (DumpToFile) { const auto fileName = fextl::fmt::format("{}/{:x}{}", DumpIR(), +HeaderOp->OriginalRIP, IR.PostRA() ? "-post.ir" : "-pre.ir"); FD = FEXCore::File::File(fileName.c_str(), FEXCore::File::FileModes::WRITE | FEXCore::File::FileModes::CREATE | FEXCore::File::FileModes::TRUNCATE); } if (FD.IsValid() || DumpToLog) { fextl::stringstream out; FEXCore::IR::Dump(&out, &IR); if (FD.IsValid()) { fextl::fmt::print(FD, "IR-{} 0x{:x}:\n{}\n@@@@@\n", IR.PostRA() ? "post" : "pre", +HeaderOp->OriginalRIP, out.str()); } else { LogMan::Msg::IFmt("IR-{} 0x{:x}:\n{}\n@@@@@\n", IR.PostRA() ? "post" : "pre", +HeaderOp->OriginalRIP, out.str()); } } } fextl::unique_ptr CreateIRDumper() { return fextl::make_unique(); } } // namespace FEXCore::IR::Debug ================================================ FILE: FEXCore/Source/Interface/IR/Passes/IRValidation.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|opts desc: Sanity checking pass $end_info$ */ #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" #include "Interface/IR/RegisterAllocationData.h" #include "Interface/IR/Passes/IRValidation.h" #include "Interface/IR/Passes/RegisterAllocationPass.h" #include #include #include #include #include #include #include #include #include #include namespace FEXCore::IR::Validation { IRValidation::~IRValidation() { NodeIsLive.Free(); } void IRValidation::Run(IREmitter* IREmit) { FEXCORE_PROFILE_SCOPED("PassManager::IRValidation"); bool HadError = false; bool HadWarning = false; fextl::ostringstream Errors; fextl::ostringstream Warnings; auto CurrentIR = 
IREmit->ViewIR(); OffsetToBlockMap.clear(); EntryBlock = nullptr; uint32_t Count = CurrentIR.GetSSACount(); if (Count > MaxNodes) { NodeIsLive.Realloc(Count); } fextl::vector Uses(Count, 0); #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED auto HeaderOp = CurrentIR.GetHeader(); LOGMAN_THROW_A_FMT(HeaderOp->Header.Op == OP_IRHEADER, "First op wasn't IRHeader"); #endif for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { auto BlockIROp = BlockHeader->CW(); LOGMAN_THROW_A_FMT(BlockIROp->Header.Op == OP_CODEBLOCK, "IR type failed to be a code block"); if (!EntryBlock) { EntryBlock = BlockNode; } const auto BlockID = CurrentIR.GetID(BlockNode); BlockInfo* CurrentBlock = &OffsetToBlockMap.try_emplace(BlockID).first->second; // We only allow defs local to a single block, so clear live set per block NodeIsLive.MemClear(Count); for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { const auto ID = CurrentIR.GetID(CodeNode); const auto OpSize = IROp->Size; if (GetHasDest(IROp->Op)) { HadError |= OpSize == IR::OpSize::iInvalid; // Does the op have a destination of size 0? if (OpSize == IR::OpSize::iInvalid) { Errors << "%" << ID << ": Had destination but with no size" << std::endl; } // Does the node have zero uses? 
Should have been DCE'd if (CodeNode->GetUses() == 0) { HadWarning |= true; Warnings << "%" << ID << ": Destination created but had no uses" << std::endl; } if (CurrentIR.PostRA()) { // After RA, the destination needs to be assigned a register and class auto PhyReg = PhysicalRegister(CodeNode); const auto ExpectedClass = IR::GetRegClass(IROp->Op); const auto AssignedClass = PhyReg.AsRegClass(); // If no register class was assigned if (AssignedClass == IR::RegClass::Invalid) { HadError |= true; Errors << "%" << ID << ": Had destination but with no register class assigned" << std::endl; } // If no physical register was assigned if (PhyReg.IsInvalid()) { HadError |= true; Errors << "%" << ID << ": Had destination but with no register assigned" << std::endl; } // Assigned class wasn't the expected class and it is a non-complex op if (AssignedClass != ExpectedClass && ExpectedClass != IR::RegClass::Complex) { HadWarning |= true; Warnings << "%" << ID << ": Destination had register class " << uint32_t(AssignedClass) << " When register class " << uint32_t(ExpectedClass) << " Was expected" << std::endl; } } } uint8_t NumArgs = IR::GetRAArgs(IROp->Op); for (uint32_t i = 0; i < NumArgs; ++i) { OrderedNodeWrapper Arg = IROp->Args[i]; const auto ArgID = Arg.ID(); if (Arg.IsImmediate()) { continue; } IROps Op = CurrentIR.GetOp(Arg)->Op; if (ArgID.IsValid()) { Uses[ArgID.Value]++; } // We do not validate the location of inline constants because it's // irrelevant, they're ignored by RA and always inlined to where they // need to be. This lets us pool inline constants globally. 
bool Ignore = (Op == OP_IRHEADER || Op == OP_INLINECONSTANT); if (!Ignore && ArgID.IsValid() && !NodeIsLive.Get(ArgID.Value)) { HadError |= true; Errors << "%" << ID << ": Arg[" << i << "] references invalid %" << ArgID << std::endl; } } NodeIsLive.Set(ID.Value); switch (IROp->Op) { case IR::OP_EXITFUNCTION: { CurrentBlock->HasExit = true; break; } case IR::OP_CONDJUMP: { auto Op = IROp->C(); OrderedNode* TrueTargetNode = CurrentIR.GetNode(Op->TrueBlock); OrderedNode* FalseTargetNode = CurrentIR.GetNode(Op->FalseBlock); CurrentBlock->Successors.emplace_back(TrueTargetNode); CurrentBlock->Successors.emplace_back(FalseTargetNode); const FEXCore::IR::IROp_Header* TrueTargetOp = CurrentIR.GetOp(TrueTargetNode); const FEXCore::IR::IROp_Header* FalseTargetOp = CurrentIR.GetOp(FalseTargetNode); if (TrueTargetOp->Op != OP_CODEBLOCK) { HadError |= true; Errors << "CondJump %" << ID << ": True Target Jumps to Op that isn't the begining of a block" << std::endl; } else { auto Block = OffsetToBlockMap.try_emplace(Op->TrueBlock.ID()).first; Block->second.Predecessors.emplace_back(BlockNode); } if (FalseTargetOp->Op != OP_CODEBLOCK) { HadError |= true; Errors << "CondJump %" << ID << ": False Target Jumps to Op that isn't the begining of a block" << std::endl; } else { auto Block = OffsetToBlockMap.try_emplace(Op->FalseBlock.ID()).first; Block->second.Predecessors.emplace_back(BlockNode); } break; } case IR::OP_JUMP: { auto Op = IROp->C(); OrderedNode* TargetNode = CurrentIR.GetNode(Op->Header.Args[0]); CurrentBlock->Successors.emplace_back(TargetNode); const FEXCore::IR::IROp_Header* TargetOp = CurrentIR.GetOp(TargetNode); if (TargetOp->Op != OP_CODEBLOCK) { HadError |= true; Errors << "Jump %" << ID << ": Jump to Op that isn't the begining of a block" << std::endl; } else { auto Block = OffsetToBlockMap.try_emplace(Op->Header.Args[0].ID()).first; Block->second.Predecessors.emplace_back(BlockNode); } break; } default: // LOGMAN_MSG_A_FMT("Unknown IR Op: {}({})", IROp->Op, 
FEXCore::IR::GetName(IROp->Op)); break; } } // Blocks can only have zero (Exit), 1 (Unconditional branch) or 2 (Conditional) successors size_t NumSuccessors = CurrentBlock->Successors.size(); if (NumSuccessors > 2) { HadError |= true; Errors << "%" << BlockID << " Has " << NumSuccessors << " successors which is too many" << std::endl; } { auto GetOp = [](auto Code) { auto [CodeNode, IROp] = Code(); return IROp->Op; }; auto CodeCurrent = CurrentIR.at(BlockIROp->Last); // Last instruction in the block must be EndBlock { auto Op = GetOp(CodeCurrent); if (Op != IR::OP_ENDBLOCK) { HadError |= true; Errors << "%" << BlockID << " Failed to end block with EndBlock" << std::endl; } } --CodeCurrent; // Blocks need to have an instruction that leaves the block in some way before the EndBlock instruction { auto Op = GetOp(CodeCurrent); if (!IsBlockExit(Op)) { HadError |= true; Errors << "%" << BlockID << " Didn't have a block exit IR op as its last instruction" << std::endl; } } } } // Use counts are only relevant pre-RA. 
if (!CurrentIR.PostRA()) { for (uint32_t i = 0; i < CurrentIR.GetSSACount(); i++) { auto [Node, IROp] = CurrentIR.at(IR::NodeID {i})(); if (Node->NumUses != Uses[i] && IROp->Op != OP_CODEBLOCK && IROp->Op != OP_IRHEADER) { HadError |= true; Errors << "%" << i << " Has " << Uses[i] << " Uses, but reports " << Node->NumUses << std::endl; } } } HadWarning = false; if (HadError || HadWarning) { fextl::stringstream Out; FEXCore::IR::Dump(&Out, &CurrentIR); if (HadError) { Out << "Errors:" << std::endl << Errors.str() << std::endl; } if (HadWarning) { Out << "Warnings:" << std::endl << Warnings.str() << std::endl; } LogMan::Msg::EFmt("{}", Out.str()); LOGMAN_MSG_A_FMT("Encountered IR validation Error"); Errors.clear(); Warnings.clear(); } } fextl::unique_ptr CreateIRValidation() { return fextl::make_unique(); } } // namespace FEXCore::IR::Validation ================================================ FILE: FEXCore/Source/Interface/IR/Passes/IRValidation.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "Common/BitSet.h" #include #include #include namespace FEXCore::IR::Validation { struct BlockInfo { bool HasExit; const OrderedNode* BlockNode; fextl::vector Predecessors; fextl::vector Successors; }; class IRValidation final : public FEXCore::IR::Pass { public: ~IRValidation(); void Run(IREmitter* IREmit) override; private: BitSet NodeIsLive {}; OrderedNode* EntryBlock {}; fextl::unordered_map OffsetToBlockMap; size_t MaxNodes {}; }; } // namespace FEXCore::IR::Validation ================================================ FILE: FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|opts $end_info$ */ #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" #include #include #include #include #include #include #include // Flag bit flags #define FLAG_V (1U << 0) 
#define FLAG_C (1U << 1)
#define FLAG_Z (1U << 2)
#define FLAG_N (1U << 3)
#define FLAG_P (1U << 4)
#define FLAG_A (1U << 5)

// Convenience unions of the single-bit masks.
#define FLAG_ZCV (FLAG_Z | FLAG_C | FLAG_V)
#define FLAG_NZCV (FLAG_N | FLAG_ZCV)
#define FLAG_ALL (FLAG_NZCV | FLAG_A | FLAG_P)

namespace FEXCore::IR {

struct FlagInfoUnpacked {
  // Set of flags read by the instruction.
  unsigned Read;

  // Set of flags written by the instruction. Happens AFTER the reads.
  unsigned Write;

  // If true, the instruction can be eliminated if its flag writes can all be
  // eliminated.
  bool CanEliminate;

  // If set, the opcode can be replaced with Replacement if its flag writes can
  // all be eliminated, or ReplacementNoWrite if its register write can be
  // eliminated.
  IROps Replacement;
  IROps ReplacementNoWrite;

  // Needs special handling
  bool Special;
};

// FlagInfoUnpacked packed into a single 64-bit word:
//   bits  0..7   Read
//   bits  8..15  Write
//   bit   16     CanEliminate
//   bits 32..47  Replacement
//   bits 48..63  ReplacementNoWrite
//   bit   63     Special
// Special shares bit 63 with the top bit of ReplacementNoWrite; none of the entries built in
// this file sets both.
struct FlagInfo {
  uint64_t Raw;

  static constexpr struct FlagInfo Pack(struct FlagInfoUnpacked F) {
    uint64_t R = F.Read | (F.Write << 8) | (F.CanEliminate << 16) | (((uint64_t)F.Replacement) << 32) |
                 ((uint64_t)F.ReplacementNoWrite << 48) | (F.Special ? (1ull << 63) : 0);

    return {.Raw = R};
  }

  // True when the instruction has no flag behaviour recorded at all.
  bool Trivial() const {
    return Raw == 0;
  }

  unsigned Read() const {
    return Bits(0, 8);
  }

  unsigned Write() const {
    return Bits(8, 8);
  }

  bool CanEliminate() const {
    return Bits(16, 1);
  }

  bool Special() const {
    return Bits(63, 1);
  }

  IROps Replacement() const {
    return (IROps)Bits(32, 16);
  }

  IROps ReplacementNoWrite() const {
    return (IROps)Bits(48, 16);
  }

private:
  // Extracts Count bits of Raw starting at bit Start.
  unsigned Bits(unsigned Start, unsigned Count) const {
    return (Raw >> Start) & ((1u << Count) - 1);
  }
};

// Per-block state used by the flag analysis.
struct BlockInfo {
  fextl::vector Predecessors;
  Ref Node;
  // Flags read on entry to the block (ProcessBlock); reused as PARTIAL/FULL by OptimizeParity.
  uint8_t Flags;
  bool InWorklist;
};

// Block table indexed by block ID, holding predecessor edges and worklist membership.
struct ControlFlowGraph {
  fextl::vector BlockMap;
  IRListView& IR;

  // Creates BlockCount entries and queues every block on the worklist.
  void Init(fextl::deque& Worklist, uint32_t BlockCount) {
    BlockMap.resize(BlockCount);

    for (unsigned ID = 0; ID < BlockCount; ++ID) {
      // Add the block with conservative flags and already in the worklist.
      auto Info = BlockInfo {{}, nullptr, FLAG_ALL, true};

      // Add some initial capacity
      Info.Predecessors.reserve(2);
      BlockMap[ID] = std::move(Info);
      Worklist.push_back(ID);
    }
  }

  BlockInfo* Get(uint32_t Block) {
    return &BlockMap[Block];
  }

  BlockInfo* Get(IROp_CodeBlock* Block) {
    return &BlockMap[Block->ID];
  }

  BlockInfo* Get(OrderedNodeWrapper Block) {
    return Get(IR.GetOp(Block));
  }

  // Records block ID `From` as a predecessor of block `To`.
  void RecordEdge(uint32_t From, OrderedNodeWrapper To) {
    auto Info = Get(To);

    Info->Predecessors.push_back(From);
  }

  // Queues a block unless it is already queued. It is pushed at the front because the driver
  // loop in Run() pops the block it has just processed from the back.
  void AddWorklist(fextl::deque& Worklist, uint32_t Block) {
    auto Info = Get(Block);

    if (!Info->InWorklist) {
      Info->InWorklist = true;
      Worklist.push_front(Block);
    }
  }
};

class DeadFlagCalculationEliminination final : public FEXCore::IR::Pass {
public:
  void Run(IREmitter* IREmit) override;

private:
  FlagInfo Classify(IROp_Header* Node);
  unsigned FlagsForCondClassType(CondClass Cond);
  bool EliminateDeadCode(IREmitter* IREmit, Ref CodeNode, IROp_Header* IROp);
  void FoldBranch(IREmitter* IREmit, IRListView& CurrentIR, IROp_CondJump* Op, Ref CodeNode);
  CondClass X86ToArmFloatCond(CondClass X86);
  bool ProcessBlock(IREmitter* IREmit, IRListView& CurrentIR, Ref Block, ControlFlowGraph& CFG);
  void OptimizeParity(IREmitter* IREmit, IRListView& CurrentIR, ControlFlowGraph& CFG);
};

// Returns the set of NZCV flag bits that evaluating the given condition reads.
// Unknown conditions assert and conservatively report all of NZCV.
unsigned DeadFlagCalculationEliminination::FlagsForCondClassType(CondClass Cond) {
  switch (Cond) {
  case CondClass::AL: return 0;

  case CondClass::MI:
  case CondClass::PL: return FLAG_N;

  case CondClass::EQ:
  case CondClass::NEQ: return FLAG_Z;

  case CondClass::UGE:
  case CondClass::ULT: return FLAG_C;

  case CondClass::VS:
  case CondClass::VC:
  case CondClass::FU:
  case CondClass::FNU: return FLAG_V;

  case CondClass::UGT:
  case CondClass::ULE: return FLAG_Z | FLAG_C;

  case CondClass::SGE:
  case CondClass::SLT:
  case CondClass::FLU:
  case CondClass::FGE: return FLAG_N | FLAG_V;

  case CondClass::SGT:
  case CondClass::SLE:
  case CondClass::FLEU:
  case CondClass::FGT: return FLAG_N | FLAG_Z | FLAG_V;

  default: LOGMAN_THROW_A_FMT(false, "unknown cond class type"); return FLAG_NZCV;
  }
}

// Compile-time classification of an opcode's flag behaviour. Opcodes whose behaviour depends on
// their operands are only marked Special here and resolved in Classify().
constexpr FlagInfo ClassifyConst(IROps Op) {
  switch (Op) {
  case OP_ANDWITHFLAGS:
    return FlagInfo::Pack({
      .Write = FLAG_NZCV,
      .Replacement = OP_AND,
      .ReplacementNoWrite = OP_TESTNZ,
    });

  case OP_ADDWITHFLAGS:
    return FlagInfo::Pack({
      .Write = FLAG_NZCV,
      .Replacement = OP_ADD,
      .ReplacementNoWrite = OP_ADDNZCV,
    });

  case OP_SUBWITHFLAGS:
    return FlagInfo::Pack({
      .Write = FLAG_NZCV,
      .Replacement = OP_SUB,
      .ReplacementNoWrite = OP_SUBNZCV,
    });

  case OP_ADCWITHFLAGS:
    return FlagInfo::Pack({
      .Read = FLAG_C,
      .Write = FLAG_NZCV,
      .Replacement = OP_ADC,
      .ReplacementNoWrite = OP_ADCNZCV,
    });

  case OP_ADCZEROWITHFLAGS:
    return FlagInfo::Pack({
      .Read = FLAG_C,
      .Write = FLAG_NZCV,
      .Replacement = OP_ADCZERO,
    });

  case OP_SBBWITHFLAGS:
    return FlagInfo::Pack({
      .Read = FLAG_C,
      .Write = FLAG_NZCV,
      .Replacement = OP_SBB,
      .ReplacementNoWrite = OP_SBBNZCV,
    });

  case OP_SHIFTFLAGS:
    // _ShiftFlags conditionally sets NZCV+PF, which we model here as a
    // read-modify-write. Logically, it also conditionally makes AF undefined,
    // which we model by omitting AF from both Read and Write sets (since
    // "cond ? AF : undef" may be optimized to "AF").
    return FlagInfo::Pack({
      .Read = FLAG_NZCV | FLAG_P,
      .Write = FLAG_NZCV | FLAG_P,
      .CanEliminate = true,
    });

  case OP_ROTATEFLAGS:
    // _RotateFlags conditionally sets CV, again modeled as RMW.
    return FlagInfo::Pack({
      .Read = FLAG_C | FLAG_V,
      .Write = FLAG_C | FLAG_V,
      .CanEliminate = true,
    });

  case OP_RDRAND: return FlagInfo::Pack({.Write = FLAG_NZCV});

  case OP_ADDNZCV:
  case OP_SUBNZCV:
  case OP_TESTNZ:
  case OP_FCMP:
  case OP_STORENZCV:
    return FlagInfo::Pack({
      .Write = FLAG_NZCV,
      .CanEliminate = true,
    });

  case OP_AXFLAG:
    // Per the Arm spec, axflag reads Z/V/C but not N. It writes all flags.
    return FlagInfo::Pack({
      .Read = FLAG_ZCV,
      .Write = FLAG_NZCV,
      .CanEliminate = true,
    });

  case OP_CMPPAIRZ:
    return FlagInfo::Pack({
      .Write = FLAG_Z,
      .CanEliminate = true,
    });

  case OP_CARRYINVERT:
    return FlagInfo::Pack({
      .Read = FLAG_C,
      .Write = FLAG_C,
      .CanEliminate = true,
    });

  case OP_SETSMALLNZV:
    return FlagInfo::Pack({
      .Write = FLAG_N | FLAG_Z | FLAG_V,
      .CanEliminate = true,
    });

  case OP_LOADNZCV: return FlagInfo::Pack({.Read = FLAG_NZCV});

  case OP_ADC:
  case OP_ADCZERO:
  case OP_SBB: return FlagInfo::Pack({.Read = FLAG_C});

  case OP_ADCNZCV:
  case OP_SBBNZCV:
    return FlagInfo::Pack({
      .Read = FLAG_C,
      .Write = FLAG_NZCV,
      .CanEliminate = true,
    });

  case OP_LOADPF: return FlagInfo::Pack({.Read = FLAG_P});
  case OP_LOADAF: return FlagInfo::Pack({.Read = FLAG_A});
  case OP_STOREPF: return FlagInfo::Pack({.Write = FLAG_P, .CanEliminate = true});
  case OP_STOREAF: return FlagInfo::Pack({.Write = FLAG_A, .CanEliminate = true});

  case OP_NZCVSELECT:
  case OP_NZCVSELECTV:
  case OP_NZCVSELECTINCREMENT:
  case OP_NEG:
  case OP_CONDJUMP:
  case OP_CONDSUBNZCV:
  case OP_CONDADDNZCV:
  case OP_RMIFNZCV:
  case OP_INVALIDATEFLAGS: return FlagInfo::Pack({.Special = true});

  default: return FlagInfo::Pack({});
  }
}

// Table of ClassifyConst() results for every opcode below OP_LAST, built at compile time.
constexpr auto FlagInfos = std::invoke([] {
  std::array ret = {};

  for (unsigned i = 0; i < OP_LAST; ++i) {
    ret[i] = ClassifyConst((IROps)i);
  }

  return ret;
});

// Returns the flag behaviour of a concrete instruction. Non-special opcodes come straight from
// the table; special ones are resolved from the instruction's own fields.
FlagInfo DeadFlagCalculationEliminination::Classify(IROp_Header* IROp) {
  FlagInfo Info = FlagInfos[IROp->Op];
  if (!Info.Special()) {
    return Info;
  }

  switch (IROp->Op) {
  case OP_NZCVSELECT:
  case OP_NZCVSELECTINCREMENT: {
    auto Op = IROp->CW();
    return FlagInfo::Pack({.Read = FlagsForCondClassType(Op->Cond)});
  }

  case OP_NZCVSELECTV: {
    auto Op = IROp->CW();
    return FlagInfo::Pack({.Read = FlagsForCondClassType(Op->Cond)});
  }

  case OP_NEG: {
    auto Op = IROp->CW();
    return FlagInfo::Pack({.Read = FlagsForCondClassType(Op->Cond)});
  }

  case OP_CONDJUMP: {
    auto Op = IROp->CW();
    // A conditional jump only reads flags when it branches on NZCV.
    if (!Op->FromNZCV) {
      return FlagInfo::Pack({});
    }

    return FlagInfo::Pack({.Read
= FlagsForCondClassType(Op->Cond)});
  }

  case OP_CONDSUBNZCV:
  case OP_CONDADDNZCV: {
    auto Op = IROp->CW();
    return FlagInfo::Pack({
      .Read = FlagsForCondClassType(Op->Cond),
      .Write = FLAG_NZCV,
      .CanEliminate = true,
    });
  }

  case OP_RMIFNZCV: {
    auto Op = IROp->CW();

    // The instruction's mask is used directly as the write set, so the bit positions must match.
    static_assert(FLAG_N == (1 << 3), "rmif mask lines up with our bits");
    static_assert(FLAG_Z == (1 << 2), "rmif mask lines up with our bits");
    static_assert(FLAG_C == (1 << 1), "rmif mask lines up with our bits");
    static_assert(FLAG_V == (1 << 0), "rmif mask lines up with our bits");

    return FlagInfo::Pack({
      .Write = Op->Mask,
      .CanEliminate = true,
    });
  }

  case OP_INVALIDATEFLAGS: {
    auto Op = IROp->CW();
    unsigned Flags = 0;

    // TODO: Make this translation less silly
    if (Op->Flags & (1u << X86State::RFLAG_SF_RAW_LOC)) {
      Flags |= FLAG_N;
    }

    if (Op->Flags & (1u << X86State::RFLAG_ZF_RAW_LOC)) {
      Flags |= FLAG_Z;
    }

    if (Op->Flags & (1u << X86State::RFLAG_CF_RAW_LOC)) {
      Flags |= FLAG_C;
    }

    if (Op->Flags & (1u << X86State::RFLAG_OF_RAW_LOC)) {
      Flags |= FLAG_V;
    }

    if (Op->Flags & (1u << X86State::RFLAG_PF_RAW_LOC)) {
      Flags |= FLAG_P;
    }

    if (Op->Flags & (1u << X86State::RFLAG_AF_RAW_LOC)) {
      Flags |= FLAG_A;
    }

    // The mental model of InvalidateFlags is writing undefined values to all
    // of the selected flags, allowing the write-after-write optimizations to
    // optimize invalidate-after-write for free.
    return FlagInfo::Pack({
      .Write = Flags,
      .CanEliminate = true,
    });
  }

  default: LOGMAN_THROW_A_FMT(false, "invalid special op"); FEX_UNREACHABLE;
  }

  FEX_UNREACHABLE;
}

// General purpose dead code elimination. Returns true when the node was removed (so the caller
// skips flag handling for it) and false when it has uses or side effects and must stay.
bool DeadFlagCalculationEliminination::EliminateDeadCode(IREmitter* IREmit, Ref CodeNode, IROp_Header* IROp) {
  // Can't remove anything used or with side effects.
  if (CodeNode->GetUses() > 0 || IR::HasSideEffects(IROp->Op)) {
    return false;
  }

  IREmit->Remove(CodeNode);
  return true;
}

// Maps the condition of a branch fed by fcmp+axflag to the condition to use on fcmp alone.
// Returns CondClass::AL when there is no mapping; FoldBranch treats that as "do not fold".
CondClass DeadFlagCalculationEliminination::X86ToArmFloatCond(CondClass X86) {
  // Table of x86 condition codes that map to arm64 condition codes, in the
  // sense that fcmp+axflag+branch(x86) is equivalent to fcmp+branch(arm).
  //
  // E would be "equal or unordered", no condition code.
  // G would be "greater than or less than", no condition code.
  //
  // SF/OF conditions are trivial and therefore shouldn't actually be generated
  switch (X86) {
  case CondClass::UGE /* A */: return CondClass::FGE /* GE */;
  case CondClass::UGT /* AE */: return CondClass::FGT /* GT */;
  case CondClass::ULT /* B */: return CondClass::SLT /* LT */;
  case CondClass::ULE /* BE */: return CondClass::SLE /* LE */;
  case CondClass::SLE /* LE */: return CondClass::SLE /* LE */;
  default: return CondClass::AL;
  }
}

// Tries to fold the flag-setting instruction that feeds a conditional jump into the jump itself:
// either an AXFLAG (by switching the jump's condition) or a compare against an inline zero (by
// making the jump compare its operands directly). On success the feeding instruction is removed.
// The caller only invokes this when NZCV is dead in both successors.
void DeadFlagCalculationEliminination::FoldBranch(IREmitter* IREmit, IRListView& CurrentIR, IROp_CondJump* Op, Ref CodeNode) {
  // Skip past StoreRegister/StorePF/StoreAF at the end -- they are treated as not touching NZCV.
  auto PrevWrap = CodeNode->Header.Previous;
  while (CurrentIR.GetOp(PrevWrap)->Op == OP_STOREREGISTER || CurrentIR.GetOp(PrevWrap)->Op == OP_STOREPF ||
         CurrentIR.GetOp(PrevWrap)->Op == OP_STOREAF) {
    PrevWrap = CurrentIR.GetNode(PrevWrap)->Header.Previous;
  }

  auto Prev = CurrentIR.GetOp(PrevWrap);
  if (Prev->Op == OP_AXFLAG) {
    // Pattern match a branch fed by AXFLAG.
    CondClass ArmCond = X86ToArmFloatCond(Op->Cond);
    if (ArmCond == CondClass::AL) {
      return;
    }

    Op->Cond = ArmCond;
  } else if (Prev->Op == OP_SUBNZCV) {
    // Pattern match a branch fed by a compare. We could also handle bit tests
    // here, but tbz/tbnz has a limited offset range which we don't have a way to
    // deal with yet. Let's hope that's not a big deal.
if (!(Op->Cond == CondClass::NEQ || Op->Cond == CondClass::EQ) || (Prev->Size < OpSize::i32Bit)) {
      return;
    }

    // Only a compare against an inline constant zero is folded.
    auto SecondArg = CurrentIR.GetOp(Prev->Args[1]);
    if (SecondArg->Op != OP_INLINECONSTANT || SecondArg->C()->Constant != 0) {
      return;
    }

    // We've matched. Fold the compare into branch.
    IREmit->ReplaceNodeArgument(CodeNode, 0, CurrentIR.GetNode(Prev->Args[0]));
    IREmit->ReplaceNodeArgument(CodeNode, 1, CurrentIR.GetNode(Prev->Args[1]));
    Op->FromNZCV = false;
    Op->CompareSize = Prev->Size;
  } else {
    return;
  }

  // The compare/test/axflag sets flags but does not write registers. Flags are
  // dead after the jump. The jump does not read flags anymore. There is no
  // intervening instruction. Therefore the compare is dead.
  IREmit->Remove(CurrentIR.GetNode(PrevWrap));
}

/**
 * @brief This pass removes dead code locally.
 *
 * Walks one block backwards, tracking in FlagsRead which flags are still read later. Flag writes
 * that nobody reads are removed or replaced by their non-flag-writing form, and the set of flags
 * read on entry to the block is stored in the CFG.
 *
 * @return true when the block's entry flag set changed, i.e. predecessors must be reprocessed.
 */
bool DeadFlagCalculationEliminination::ProcessBlock(IREmitter* IREmit, IRListView& CurrentIR, Ref Block, ControlFlowGraph& CFG) {
  // Conservative default for exits that are neither a jump nor a conditional jump.
  uint32_t FlagsRead = FLAG_ALL;

  // Reverse iteration is not yet working with the iterators
  auto BlockIROp = CurrentIR.GetOp(Block);

  // We grab these nodes this way so we can iterate easily
  auto CodeBegin = CurrentIR.at(BlockIROp->Begin);
  auto CodeLast = CurrentIR.at(BlockIROp->Last);

  // Advance past EndBlock to get at the exit.
  --CodeLast;

  // Initialize the FlagsRead mask according to the exit instruction.
  auto [ExitNode, ExitOp] = CodeLast();
  if (ExitOp->Op == IR::OP_CONDJUMP) {
    auto Op = ExitOp->CW();
    FlagsRead = CFG.Get(Op->TrueBlock)->Flags | CFG.Get(Op->FalseBlock)->Flags;
  } else if (ExitOp->Op == IR::OP_JUMP) {
    FlagsRead = CFG.Get(ExitOp->Args[0])->Flags;
  }

  // Iterate the block in reverse
  while (true) {
    auto [CodeNode, IROp] = CodeLast();

    // Optimizing flags can cause earlier flag reads to become dead but dead
    // flag reads should not impede optimization of earlier dead flag writes.
    // We must DCE as we go to ensure we converge in a single iteration.
    if (!EliminateDeadCode(IREmit, CodeNode, IROp)) {

      // Optimization algorithm: For each flag written...
      //
      //  If the flag has a later read (per FlagsRead), remove the flag from
      //  FlagsRead, since the reader is covered by this write.
      //
      //  Else, there is no later read, so remove the flag write (if we can).
      //  This is the active part of the optimization.
      //
      // Then, add each flag read to FlagsRead.
      //
      // This order is important: instructions that read-modify-write flags
      // (like adcs) first read flags, then write flags. Since we're iterating
      // the block backwards, that means we handle the write first.
      struct FlagInfo Info = Classify(IROp);

      if (!Info.Trivial()) {
        bool Eliminated = false;

        if ((FlagsRead & Info.Write()) == 0) {
          // None of the written flags is read later.
          if ((Info.CanEliminate() || Info.Replacement()) && CodeNode->GetUses() == 0) {
            IREmit->Remove(CodeNode);
            Eliminated = true;
          } else if (Info.Replacement()) {
            IROp->Op = Info.Replacement();
          }
        } else if (Info.ReplacementNoWrite() && CodeNode->GetUses() == 0) {
          // The flags are needed but the register result is not.
          IROp->Op = Info.ReplacementNoWrite();
        }

        // If we don't care about the sign or carry, we can optimize testnz.
        // Carry is inverted between testz and testnz so we check that too. Note
        // this check is outside of the if, since the TestNZ might result from
        // optimizing AndWithFlags, and we need to converge locally in a single
        // iteration.
        if (IROp->Op == OP_TESTNZ && IROp->Size < OpSize::i32Bit && !(FlagsRead & (FLAG_N | FLAG_C))) {
          IROp->Op = OP_TESTZ;
        }

        FlagsRead &= ~Info.Write();

        // If we eliminated the instruction, we eliminate its read too. This
        // check is required to ensure the pass converges locally in a single
        // iteration.
        if (!Eliminated) {
          FlagsRead |= Info.Read();
        }
      }
    }

    // Iterate in reverse
    if (CodeLast == CodeBegin) {
      break;
    }
    --CodeLast;
  }

  // For the purposes of global propagation, the content of our progress doesn't
  // matter -- only the difference in our final FlagsRead contributes to changes
  // in the predecessors.
  uint32_t OldFlagsRead = CFG.Get(BlockIROp->ID)->Flags;
  CFG.Get(BlockIROp->ID)->Flags = FlagsRead;
  return (OldFlagsRead != FlagsRead);
}

// Removes Parity ops whose input is known to carry only a single bit. Per block, `Full` tracks
// whether the value last stored to PF may have bits above bit 0 set. The per-block Flags field
// of the CFG is reused to hand that state to successor blocks.
void DeadFlagCalculationEliminination::OptimizeParity(IREmitter* IREmit, IRListView& CurrentIR, ControlFlowGraph& CFG) {
  // Mapping for flags inside this pass.
  const uint8_t PARTIAL = 0;
  const uint8_t FULL = 1;

  // Initialize conservatively: all blocks need full parity. This initialization
  // matters for proper handling of backedges.
  for (auto [Block, BlockHeader] : CurrentIR.GetBlocks()) {
    auto ID = BlockHeader->C()->ID;
    CFG.Get(ID)->Flags = FULL;
  }

  for (auto [Block, BlockHeader] : CurrentIR.GetBlocks()) {
    const auto ID = BlockHeader->C()->ID;
    const auto& Predecessors = CFG.Get(ID)->Predecessors;
    bool Full = false;

    if (Predecessors.empty()) {
      // Conservatively assume there was full parity before the start block
      Full = true;
    } else {
      // If any predecessor needs full parity at the end, we need full parity.
      for (auto Pred : Predecessors) {
        Full |= (CFG.Get(Pred)->Flags == FULL);
      }
    }

    for (auto [CodeNode, IROp] : CurrentIR.GetCode(Block)) {
      if (IROp->Op == OP_STOREPF) {
        auto Op = IROp->CW();
        auto Generator = CurrentIR.GetOp(Op->Value);

        // Determine if we only write 0/1 to the parity flag.
        Full = true;

        if (Generator->Op == OP_NZCVSELECT) {
          auto C0 = CurrentIR.GetOp(Generator->Args[0]);
          auto C1 = CurrentIR.GetOp(Generator->Args[1]);

          if (C0->Op == C1->Op && C0->Op == OP_INLINECONSTANT) {
            auto IC0 = CurrentIR.GetOp(Generator->Args[0]);
            auto IC1 = CurrentIR.GetOp(Generator->Args[1]);

            // We need the full 8 if the constant has upper bits set.
            Full = (IC0->Constant | IC1->Constant) & ~1;
          }
        }
      } else if (IROp->Op == OP_PARITY && !Full) {
        // Eliminate parity calculations if it's only 1-bit.
auto Parity = IROp->C(); Ref Value = CurrentIR.GetNode(Parity->Raw); if (Parity->Invert) { IREmit->SetWriteCursor(CodeNode); Value = IREmit->_Xor(OpSize::i32Bit, Value, IREmit->_InlineConstant(1)); } IREmit->ReplaceUsesWithAfter(CodeNode, Value, CurrentIR.at(CodeNode)); IREmit->Remove(CodeNode); } } // Record our final state for our successors to read. CFG.Get(ID)->Flags = Full ? FULL : PARTIAL; } } void DeadFlagCalculationEliminination::Run(IREmitter* IREmit) { FEXCORE_PROFILE_SCOPED("PassManager::DFE"); auto CurrentIR = IREmit->ViewIR(); fextl::deque Worklist; // Initialize CFG ControlFlowGraph CFG {.IR = CurrentIR}; CFG.Init(Worklist, CurrentIR.GetHeader()->BlockCount); // Gather CFG for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { auto Block = BlockHeader->C(); auto CodeLast = CurrentIR.at(Block->Last); --CodeLast; auto [ExitNode, ExitOp] = CodeLast(); if (ExitOp->Op == IR::OP_CONDJUMP) { auto Op = ExitOp->CW(); CFG.RecordEdge(Block->ID, Op->TrueBlock); CFG.RecordEdge(Block->ID, Op->FalseBlock); } else if (ExitOp->Op == IR::OP_JUMP) { CFG.RecordEdge(Block->ID, ExitOp->Args[0]); } CFG.Get(Block->ID)->Node = BlockNode; } // After processing a block, if we made progress, we must process its // predecessors to propagate globally. A block will be reprocessed only if // there is a loop backedge. for (; !Worklist.empty(); Worklist.pop_back()) { auto Block = Worklist.back(); auto Info = CFG.Get(Block); Info->InWorklist = false; if (ProcessBlock(IREmit, CurrentIR, Info->Node, CFG)) { for (auto Pred : Info->Predecessors) { CFG.AddWorklist(Worklist, Pred); } } } // Fold compares into branches now that we're otherwise optimized. This needs // to run after eliminating carries etc and it needs the global flag metadata. // But it only needs to run once, we don't do it in the loop. 
for (auto [Block, _] : CurrentIR.GetBlocks()) { // Grab the jump auto BlockIROp = CurrentIR.GetOp(Block); auto CodeLast = CurrentIR.at(BlockIROp->Last); --CodeLast; auto [ExitNode, ExitOp] = CodeLast(); if (ExitOp->Op == IR::OP_CONDJUMP) { auto Op = ExitOp->CW(); uint32_t FlagsOut = CFG.Get(Op->TrueBlock)->Flags | CFG.Get(Op->FalseBlock)->Flags; if ((FlagsOut & FLAG_NZCV) == 0 && Op->FromNZCV) { FoldBranch(IREmit, CurrentIR, Op, ExitNode); } } } if (CurrentIR.GetHeader()->ReadsParity) { OptimizeParity(IREmit, CurrentIR, CFG); } } fextl::unique_ptr CreateDeadFlagCalculationEliminination() { return fextl::make_unique(); } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|opts $end_info$ */ #include "Interface/IR/Passes/RegisterAllocationPass.h" #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/RegisterAllocationData.h" #include "Interface/IR/Passes.h" #include "Interface/Core/CPUID.h" #include #include #include #include #include #include #include using namespace FEXCore; namespace FEXCore::IR { namespace { struct RegisterClassData { uint32_t Available; uint32_t Count; // If bit R of Available is 0, then RegToSSA[R] is the node currently // allocated to R. Else, RegToSSA[R] is UNDEFINED, no need to clear this // when freeing registers. 
Ref RegToSSA[32]; }; IR::RegClass GetRegClassFromNode(IR::IRListView* IR, IR::IROp_Header* IROp) { const auto Class = IR::GetRegClass(IROp->Op); if (Class != IR::RegClass::Complex) { return Class; } // Complex register class handling switch (IROp->Op) { case IR::OP_LOADCONTEXT: return IROp->C()->Class; case IR::OP_LOADREGISTER: return IROp->C()->Class; case IR::OP_LOADCONTEXTINDEXED: return IROp->C()->Class; case IR::OP_LOADMEM: case IR::OP_LOADMEMTSO: return IROp->C()->Class; case IR::OP_FILLREGISTER: return IROp->C()->Class; default: return IR::RegClass::Invalid; } }; } // Anonymous namespace class ConstrainedRAPass final : public RegisterAllocationPass { public: explicit ConstrainedRAPass(const FEXCore::CPUIDEmu* CPUID) : CPUID {CPUID} {} void Run(IREmitter* IREmit) override; void AddRegisters(IR::RegClass Class, uint32_t RegisterCount) override; bool TryPostRAMerge(Ref LastNode, Ref CodeNode, IROp_Header* IROp); private: RegisterClassData Classes[IR::NumClasses]; IREmitter* IREmit {}; IRListView* IR {}; const FEXCore::CPUIDEmu* CPUID {}; // Map of nodes to their preferred register, to coalesce load/store reg. fextl::vector PreferredReg; // Map of assigned registers. Does not grow beyond the initial set. fextl::vector SSAToReg; // Maps defs to their assigned spill slot + 1, or 0 if not spilled. fextl::vector SpillSlots; // Next-use distance relative to the block end of each source, last first. 
fextl::vector SourcesNextUses; // Sources that have been seen fextl::vector Seen; // SourcesNextUses is read backwards, this tracks the index int64_t SourceIndex {}; bool Rematerializable(IROp_Header* IROp) { return IROp->Op == OP_CONSTANT; } Ref InsertFill(Ref Node) { IROp_Header* IROp = IR->GetOp(Node); // Remat if we can if (Rematerializable(IROp)) { const auto Op = IROp->C(); uint64_t Const = Op->Constant; return IREmit->_Constant(Const, Op->Pad, Op->MaxBytes); } // Otherwise fill from stack uint32_t SlotPlusOne = SpillSlots[IR->GetID(Node).Value]; LOGMAN_THROW_A_FMT(SlotPlusOne >= 1, "Node must have been spilled"); const auto RegClass = GetRegClassFromNode(IR, IROp); return IREmit->_FillRegister(IROp->Size, IROp->ElementSize, SlotPlusOne - 1, RegClass); }; // IP of next-use of each source. IPs are measured from the end of the // block, so we don't need to size the block up-front. fextl::vector NextUses; bool AnySpilled {}; bool IsValidArg(OrderedNodeWrapper Arg) { if (Arg.IsInvalid()) { return false; } auto Op = IR->GetOp(Arg)->Op; return Op != OP_INLINECONSTANT && Op != OP_INLINEENTRYPOINTOFFSET; }; RegisterClassData* GetClass(PhysicalRegister Reg) { return &Classes[Reg.Class]; }; uint32_t GetRegBits(PhysicalRegister Reg) { return 1 << Reg.Reg; }; bool IsInRegisterFile(Ref Node) { auto ID = IR->GetID(Node).Value; LOGMAN_THROW_A_FMT(ID < SSAToReg.size(), "Only old nodes looked up"); PhysicalRegister Reg = SSAToReg[ID]; RegisterClassData* Class = GetClass(Reg); return (Class->Available & GetRegBits(Reg)) == 0 && Class->RegToSSA[Reg.Reg] == Node; }; void FreeReg(PhysicalRegister Reg) { RegisterClassData* Class = GetClass(Reg); uint32_t RegBits = GetRegBits(Reg); LOGMAN_THROW_A_FMT(!(Class->Available & RegBits), "Register double-free"); Class->Available |= RegBits; }; bool HasSource(IROp_Header* I, PhysicalRegister Reg) { int NumArgs = IR::GetRAArgs(I->Op); for (int s = 0; s < NumArgs; ++s) { if (I->Args[s].IsImmediate()) { // When spilling for a destination, 
we'll see register sources if (PhysicalRegister(I->Args[s]) == Reg) { return true; } } else { // When spilling for SRA correctness, we'll see SSA sources. This is // pretty obscure. auto V = I->Args[s]; V.ClearKill(); if (IsValidArg(V) && SSAToReg[V.ID().Value] == Reg) { return true; } } } return false; }; Ref DecodeSRANode(const IROp_Header* IROp, Ref Node) { if (IROp->Op == OP_LOADREGISTER || IROp->Op == OP_LOADPF || IROp->Op == OP_LOADAF) { return Node; } else if (IROp->Op == OP_STOREREGISTER) { auto V = IROp->C()->Value; V.ClearKill(); return IR->GetNode(V); } else if (IROp->Op == OP_STOREPF || IROp->Op == OP_STOREAF) { auto V = IROp->C()->Value; V.ClearKill(); return IR->GetNode(V); } return nullptr; }; PhysicalRegister DecodeSRAReg(const IROp_Header* IROp, Ref Node) { uint8_t FlagOffset = Classes[FEXCore::ToUnderlying(RegClass::GPRFixed)].Count - 2; if (IROp->Op == OP_STOREREGISTER) { return PhysicalRegister(Node); } else if (IROp->Op == OP_LOADPF || IROp->Op == OP_STOREPF) { return PhysicalRegister {RegClass::GPRFixed, FlagOffset}; } else if (IROp->Op == OP_LOADAF || IROp->Op == OP_STOREAF) { return PhysicalRegister {RegClass::GPRFixed, uint8_t(FlagOffset + 1)}; } else { const IROp_LoadRegister* Op = IROp->C(); LOGMAN_THROW_A_FMT(Op->Class == RegClass::GPR || Op->Class == RegClass::FPR, "SRA classes"); if (Op->Class == RegClass::FPR) { return PhysicalRegister {RegClass::FPRFixed, uint8_t(Op->Reg)}; } else { return PhysicalRegister {RegClass::GPRFixed, uint8_t(Op->Reg)}; } } }; bool IsTrivial(Ref Node, const IROp_Header* Header) { switch (Header->Op) { case OP_ALLOCATEGPR: return true; case OP_ALLOCATEGPRAFTER: return true; case OP_ALLOCATEFPR: return true; case OP_RMWHANDLE: return PhysicalRegister(Node) == PhysicalRegister(Header->Args[0]); case OP_LOADREGISTER: return PhysicalRegister(Node) == DecodeSRAReg(Header, Node); case OP_STOREREGISTER: return PhysicalRegister(Header->Args[0]) == DecodeSRAReg(Header, Node); default: return false; } } // Helper macro 
to walk the set bits b in a 32-bit word x, using ffs to get // the next set bit and then clearing on each iteration. #define foreach_bit(b, x) for (uint32_t __x = (x), b; ((b) = __builtin_ffs(__x) - 1, __x); __x &= ~(1 << (b))) void CalculateNextUses(IROp_CodeBlock* BlockIROp, IROp_Header* Until) { SourcesNextUses.clear(); NextUses.resize(IR->GetSSACount(), 0); // IP relative to the end of the block. uint32_t IP = 1; // We grab these nodes this way so we can iterate easily auto CodeBegin = IR->at(BlockIROp->Begin); auto CodeLast = IR->at(BlockIROp->Last); while (1) { auto [CodeNode, IROp] = CodeLast(); if (IROp == Until) { break; } // End of iteration gunk const int NumArgs = IR::GetRAArgs(IROp->Op); for (int i = NumArgs - 1; i >= 0; --i) { auto V = IROp->Args[i]; V.ClearKill(); if (IsValidArg(V)) { const uint32_t Index = V.ID().Value; SourcesNextUses.push_back(NextUses[Index]); NextUses[Index] = IP; } } // IP is relative to block end and we iterate backwards, so increment. ++IP; // Rest is iteration gunk if (CodeLast == CodeBegin) { break; } --CodeLast; } SourceIndex = SourcesNextUses.size(); } void SpillReg(RegisterClassData* Class, IROp_CodeBlock* Block, IROp_Header* Exclude) { // We're about to use next-use information, so calculate it. if (!AnySpilled) { CalculateNextUses(Block, Exclude); } // Find the best node to spill according to the "furthest-first" heuristic. // Since we defined IPs relative to the end of the block, the furthest // next-use has the /smallest/ unsigned IP. Ref Candidate = nullptr; uint32_t BestDistance = UINT32_MAX; uint8_t BestReg = ~0; uint32_t Allocated = ((1u << Class->Count) - 1) & ~Class->Available; foreach_bit(i, Allocated) { Ref Node = Class->RegToSSA[i]; auto Reg = SSAToReg[IR->GetID(Node).Value]; LOGMAN_THROW_A_FMT(Node != nullptr, "Invariant3"); LOGMAN_THROW_A_FMT(Reg.Reg == i, "Invariant4"); // Skip any source used by the current instruction, it is unspillable. 
if (!HasSource(Exclude, Reg)) { uint32_t NextUse = NextUses[IR->GetID(Node).Value]; // Prioritize remat over spilling. It is typically cheaper to remat a // constant multiple times than to spill a single value. if (!Rematerializable(IR->GetOp(Node))) { NextUse += 100000; } if (NextUse < BestDistance) { BestDistance = NextUse; BestReg = i; Candidate = Node; } } } LOGMAN_THROW_A_FMT(Candidate != nullptr, "must've found something.."); PhysicalRegister Reg = SSAToReg[IR->GetID(Candidate).Value]; LOGMAN_THROW_A_FMT(Reg.Reg == BestReg, "Invariant6"); IROp_Header* Header = IR->GetOp(Candidate); uint32_t Value = IR->GetID(Candidate).Value; bool Spilled = !SpillSlots.empty() && SpillSlots[Value] != 0; // If we already spilled the Candidate, we don't need to spill again. // Similarly, if we can rematerialize the instruction, we don't spill it. if (!Spilled && Header->Op != OP_CONSTANT) { LOGMAN_THROW_A_FMT(Reg.AsRegClass() == GetRegClassFromNode(IR, Header), "Consistent"); // SpillSlots allocation is deferred. if (SpillSlots.empty()) { SpillSlots.resize(IR->GetSSACount(), 0); } // TODO: we should colour spill slots uint32_t Slot = IR->GetHeader()->SpillSlots++; // We must map here in case we're spilling something we shuffled. auto SpillOp = IREmit->_SpillRegister(OrderedNodeWrapper::FromImmediate(Reg.Raw), Slot, Reg.AsRegClass()); SpillOp.first->Header.Size = Header->Size; SpillOp.first->Header.ElementSize = Header->ElementSize; SpillSlots[Value] = Slot + 1; } // Now that we've spilled the value, take it out of the register file FreeReg(Reg); AnySpilled = true; }; void RemapReg(Ref Node, PhysicalRegister Reg) { RegisterClassData* Class = GetClass(Reg); Class->RegToSSA[Reg.Reg] = Node; uint32_t Index = IR->GetID(Node).Value; if (Index < SSAToReg.size()) { SSAToReg[Index] = Reg; } }; // Record a given assignment of register Reg to Node. 
void SetReg(Ref Node, PhysicalRegister Reg) { RegisterClassData* Class = GetClass(Reg); uint32_t RegBits = GetRegBits(Reg); LOGMAN_THROW_A_FMT((Class->Available & RegBits) == RegBits, "Precondition"); Class->Available &= ~RegBits; RemapReg(Node, Reg); Node->Reg = Reg.Raw; }; // Assign a register for a given Node, spilling if necessary. void AssignReg(IROp_Header* IROp, IROp_CodeBlock* Block, Ref CodeNode, IROp_Header* Pivot) { const uint32_t Node = IR->GetID(CodeNode).Value; // Prioritize preferred registers. if (Node < PreferredReg.size()) { if (PhysicalRegister Reg = PreferredReg[Node]; !Reg.IsInvalid()) { RegisterClassData* Class = GetClass(Reg); uint32_t RegBits = GetRegBits(Reg); if ((Class->Available & RegBits) == RegBits) { SetReg(CodeNode, Reg); return; } } } // Try to handle tied registers. This can fail, the JIT will insert moves. if (int TiedIdx = IR::TiedSource(IROp->Op); TiedIdx >= 0) { auto Reg = PhysicalRegister(IROp->Args[TiedIdx]); RegisterClassData* Class = GetClass(Reg); uint32_t RegBits = GetRegBits(Reg); if (Reg.AsRegClass() != RegClass::GPRFixed && Reg.AsRegClass() != RegClass::FPRFixed && (Class->Available & RegBits) == RegBits) { SetReg(CodeNode, Reg); return; } } // Try to coalesce reserved pairs. Just a heuristic to remove some moves. 
if (IROp->Op == OP_ALLOCATEGPR && IROp->C()->ForPair) { uint32_t Available = Classes[FEXCore::ToUnderlying(RegClass::GPR)].Available; // Only choose base register R if R and R + 1 are both free Available &= (Available >> 1); // Only consider aligned registers in the pair region constexpr uint32_t EVEN_BITS = 0x55555555; Available &= (EVEN_BITS & ((1u << PairRegs) - 1)); if (Available) { unsigned Reg = std::countr_zero(Available); SetReg(CodeNode, PhysicalRegister(RegClass::GPR, Reg)); return; } } else if (IROp->Op == OP_ALLOCATEGPRAFTER) { uint32_t Available = Classes[FEXCore::ToUnderlying(RegClass::GPR)].Available; auto After = PhysicalRegister(IROp->Args[0]); if ((After.Reg & 1) == 0 && Available & (1ull << (After.Reg + 1))) { SetReg(CodeNode, PhysicalRegister(RegClass::GPR, After.Reg + 1)); return; } } RegClass ClassType = GetRegClassFromNode(IR, IROp); RegisterClassData* Class = &Classes[FEXCore::ToUnderlying(ClassType)]; // Spill to make room in the register file. if (!Class->Available) { IREmit->SetWriteCursorBefore(CodeNode); SpillReg(Class, Block, Pivot); } // Assign a free register in the appropriate class. LOGMAN_THROW_A_FMT(Class->Available != 0, "Post-condition of spilling"); unsigned Reg = std::countr_zero(Class->Available); SetReg(CodeNode, PhysicalRegister(ClassType, Reg)); }; }; void ConstrainedRAPass::AddRegisters(IR::RegClass Class, uint32_t RegisterCount) { LOGMAN_THROW_A_FMT(RegisterCount <= 31, "Up to 31 regs supported"); Classes[FEXCore::ToUnderlying(Class)].Count = RegisterCount; } inline bool KillMove(IROp_Header* LastOp, IROp_Header* IROp, Ref LastNode, Ref CodeNode) { // 32-bit moves in x86_64 are represented as a Bfe, detect them. if (LastOp->Op == OP_BFE && LastOp->C()->lsb == 0 && LastOp->C()->Width == 32) { auto Op = IROp->Op; if (Op == OP_AND) { // Rewrite "mov wA, wB; and xA, xA, xC" into "and wA, wB, wC", since // ((b & 0xffffffff) & c) == (b & c) & 0xffffffff. 
IROp->Size = OpSize::i32Bit; return true; } else if (IROp->Size == OpSize::i32Bit) { return Op == OP_OR || Op == OP_XOR || Op == OP_AND || Op == OP_SUB || Op == OP_LSHL || Op == OP_LSHR || Op == OP_ASHR; } } return LastOp->Op == OP_STOREREGISTER; } inline bool IsSignext(const IROp_Header* IROp, OrderedNodeWrapper Src, OpSize Size) { if (IROp->Op == OP_SBFE) { auto Sbfe = IROp->C(); return Sbfe->Width == 1 && Sbfe->lsb == (IR::OpSizeAsBits(Size) - 1) && Sbfe->Src == Src; } else { return false; } } inline bool IsZero(const IROp_Header* IROp) { return IROp->Op == OP_CONSTANT && IROp->C()->Constant == 0; } bool ConstrainedRAPass::TryPostRAMerge(Ref LastNode, Ref CodeNode, IROp_Header* IROp) { auto LastOp = IR->GetOp(LastNode); if (IROp->Op == OP_PUSH && LastOp->Op == OP_PUSH) { auto SP = PhysicalRegister(CodeNode); auto Push = IR->GetOp(CodeNode); auto LastPush = IR->GetOp(LastNode); if (LastOp->Size == IROp->Size && LastPush->ValueSize == Push->ValueSize && SP == PhysicalRegister(LastNode) && SP == PhysicalRegister(IROp->Args[1]) && SP == PhysicalRegister(LastOp->Args[1]) && SP != PhysicalRegister(IROp->Args[0]) && SP != PhysicalRegister(LastOp->Args[0]) && Push->ValueSize >= OpSize::i32Bit) { IREmit->SetWriteCursorBefore(LastNode); IREmit->_PushTwo(IROp->Size, Push->ValueSize, IROp->Args[0], LastOp->Args[0], IROp->Args[1]); IREmit->RemovePostRA(CodeNode); return true; } } else if (IROp->Op == OP_POP) { auto SP = PhysicalRegister(IROp->Args[0]); if (LastOp->Op == OP_POP && LastOp->Size == IROp->Size && IROp->Size >= OpSize::i32Bit && SP == PhysicalRegister(LastOp->Args[0])) { IREmit->SetWriteCursorBefore(LastNode); IREmit->_PopTwo(IROp->Size, IROp->Args[0], LastOp->Args[1], IROp->Args[1]); IREmit->RemovePostRA(CodeNode); return true; } } else if ((IROp->Op == OP_DIV || IROp->Op == OP_UDIV) && IROp->Size >= OpSize::i32Bit) { // If Upper came from a sign/zero extension, we only need a 64-bit division. 
auto Op = IROp->CW(); if (!Op->Upper.IsInvalid() && PhysicalRegister(Op->Upper) == PhysicalRegister(LastNode)) { if (IROp->Op == OP_DIV ? IsSignext(LastOp, Op->Lower, IROp->Size) : IsZero(LastOp)) { Op->Upper.SetInvalid(); return PhysicalRegister(LastNode) == PhysicalRegister(Op->OutRemainder); } } } else if (IROp->Op == OP_XGETBV && PhysicalRegister(IROp->Args[0]) == PhysicalRegister(LastNode) && LastOp->Op == OP_CONSTANT) { // Try to constant fold uint64_t ConstantFunction = LastOp->C()->Constant; auto Op = IROp->CW(); if (CPUID->DoesXCRFunctionReportConstantData(ConstantFunction)) { const auto Result = CPUID->RunXCRFunction(ConstantFunction); IREmit->SetWriteCursorBefore(CodeNode); IREmit->_Constant(Result.eax).Node->Reg = PhysicalRegister(Op->OutEAX).Raw; IREmit->_Constant(Result.edx).Node->Reg = PhysicalRegister(Op->OutEDX).Raw; IREmit->RemovePostRA(CodeNode); return false; } } else if (IROp->Op == OP_CPUID && PhysicalRegister(IROp->Args[0]) == PhysicalRegister(LastNode) && LastOp->Op == OP_CONSTANT) { // Try to constant fold. As a limitation of merging only 2 instructions, we // can only handle constant functions, not constant leafs. This could be // lifted if we generalized at a (significant) complexity cost. 
uint64_t ConstantFunction = LastOp->C()->Constant; auto Op = IROp->CW(); const auto SupportsConstant = CPUID->DoesFunctionReportConstantData(ConstantFunction); if (SupportsConstant.SupportsConstantFunction == CPUIDEmu::SupportsConstant::CONSTANT && SupportsConstant.NeedsLeaf != CPUIDEmu::NeedsLeafConstant::NEEDSLEAFCONSTANT) { const auto Result = CPUID->RunFunction(ConstantFunction, 0 /* leaf */); IREmit->SetWriteCursorBefore(CodeNode); IREmit->_Fence(IR::FenceType::Inst); IREmit->_Constant(Result.eax).Node->Reg = PhysicalRegister(Op->OutEAX).Raw; IREmit->_Constant(Result.ebx).Node->Reg = PhysicalRegister(Op->OutEBX).Raw; IREmit->_Constant(Result.ecx).Node->Reg = PhysicalRegister(Op->OutECX).Raw; IREmit->_Constant(Result.edx).Node->Reg = PhysicalRegister(Op->OutEDX).Raw; IREmit->RemovePostRA(CodeNode); return false; } } // Merge moves that are immediately consumed. // // x86 code inserts such moves to workaround x86's 2-address code. Because // arm64 is 3-address code, we can optimize these out. // // Note we rely on the short-circuiting here. 
if (PhysicalRegister(LastNode) == PhysicalRegister(CodeNode) && KillMove(LastOp, IROp, LastNode, CodeNode)) { LOGMAN_THROW_A_FMT(!PhysicalRegister(CodeNode).IsInvalid(), "invariant"); int NumArgs = IR::GetRAArgs(IROp->Op); for (int s = 0; s < NumArgs; ++s) { if (IROp->Args[s].IsImmediate() && PhysicalRegister(IROp->Args[s]) == PhysicalRegister(LastNode)) { IROp->Args[s].SetImmediate(PhysicalRegister(LastOp->Args[0]).Raw); } } return true; } return false; } void ConstrainedRAPass::Run(IREmitter* IREmit_) { FEXCORE_PROFILE_SCOPED("PassManager::RA"); IREmit = IREmit_; auto IR_ = IREmit->ViewIR(); IR = &IR_; PreferredReg.resize(IR->GetSSACount(), PhysicalRegister::Invalid()); SSAToReg.resize(IR->GetSSACount(), PhysicalRegister::Invalid()); Seen.resize(IR->GetSSACount(), false); for (auto [BlockNode, BlockHeader] : IR->GetBlocks()) { // Spilling is local, so reset this per-block AnySpilled = false; // At the start of each block, all registers are available. for (auto& Class : Classes) { Class.Available = (1u << Class.Count) - 1; } auto BlockIROp = BlockHeader->CW(); // Backwards pass: analyze kill bits and SRA affinities { // Reverse iteration is not yet working with the iterators // We grab these nodes this way so we can iterate easily auto CodeBegin = IR->at(BlockIROp->Begin); auto CodeLast = IR->at(BlockIROp->Last); while (1) { auto [CodeNode, IROp] = CodeLast(); // End of iteration gunk // Record preferred registers for SRA. We also record the Node accessing // each register, used below. Since we initialized Class->Available, // RegToSSA is otherwise undefined so we can stash our temps there. if (auto Node = DecodeSRANode(IROp, CodeNode); Node != nullptr) { auto Reg = DecodeSRAReg(IROp, CodeNode); PreferredReg[IR->GetID(Node).Value] = Reg; GetClass(Reg)->RegToSSA[Reg.Reg] = CodeNode; } // Coalescing an SRA store is equivalent to hoisting the store, // implying write-after-write and read-after-write hazards. 
We can only // coalesce if there is no intervening load/store. // // Since we're walking backwards, RegToSSA tracks // the first load/store after CodeNode. That first instruction is the // store in question iff there is no intervening load/store. // // Reset PreferredReg if that is not the case, ensuring SRA correctness. if (auto Reg = PreferredReg[IR->GetID(CodeNode).Value]; !Reg.IsInvalid()) { auto Node = GetClass(Reg)->RegToSSA[Reg.Reg]; IROp_Header* Header = IR->GetOp(Node); if (CodeNode != DecodeSRANode(Header, Node)) { PreferredReg[IR->GetID(CodeNode).Value] = PhysicalRegister::Invalid(); } } const int NumArgs = IR::GetRAArgs(IROp->Op); for (int i = NumArgs - 1; i >= 0; --i) { const auto& Arg = IROp->Args[i]; if (!Arg.IsInvalid()) { const uint32_t Index = Arg.ID().Value; if (!Seen[Index]) { Seen[Index] = true; IROp->Args[i].SetKill(); } } } // Rest is iteration gunk if (CodeLast == CodeBegin) { break; } --CodeLast; } } // NextUses currently contains first use distances, the exact initialization // assumed by the forward pass. Do not reset it. // Last nontrivial instruction, for merging as we go. Ref LastNode = nullptr; // Forward pass: Assign registers, spilling & optimizing as we go. for (auto [CodeNode, IROp] : IR->GetCode(BlockNode)) { bool AnySpilledBeforeThisInstruction = AnySpilled; // These do not read or write registers, and must be skipped for merging. // Since we'd be doing this check anyway for merging, do the check now so // we can skip the rest of the logic too. if (IROp->Op == OP_GUESTOPCODE || IROp->Op == OP_INLINECONSTANT) { continue; } // Static registers must be consistent at SRA load/store. Evict to ensure. 
if (auto Node = DecodeSRANode(IROp, CodeNode); Node != nullptr) { auto Reg = DecodeSRAReg(IROp, CodeNode); RegisterClassData* Class = &Classes[Reg.Class]; if (!(Class->Available & (1u << Reg.Reg))) { Ref Old = Class->RegToSSA[Reg.Reg]; if (Old != Node) { // Before inserting instructions, we need to set the cursor and // reset LastNode so we don't merge across an inserted copy. // Otherwise, we would erroneously miss the copy when determining if // we can merge, and end up unsoundly merging a mov+xchg sequence. IREmit->SetWriteCursorBefore(CodeNode); LastNode = nullptr; Ref Copy; if (Reg.AsRegClass() == RegClass::FPRFixed) { IROp_Header* Header = IR->GetOp(Old); Copy = IREmit->_VMov(Header->Size, OrderedNodeWrapper::FromImmediate(Reg.Raw)); } else { Copy = IREmit->_Copy(OrderedNodeWrapper::FromImmediate(Reg.Raw)); } FreeReg(Reg); AssignReg(IR->GetOp(Copy), BlockIROp, Copy, IROp); RemapReg(Old, PhysicalRegister(Copy)); } } } // Fill all sources that are not already in the register file. // // This happens before freeing killed sources, since we need all sources in // the register file simultaneously. // // Also update next-use info, again only relevant if we've spilled. 
int NumArgs = IR::GetRAArgs(IROp->Op); if (AnySpilledBeforeThisInstruction) { for (int s = 0; s < NumArgs; ++s) { auto V = IROp->Args[s]; V.ClearKill(); if (!IsValidArg(V)) { continue; } Ref Old = IR->GetNode(V); SourceIndex--; LOGMAN_THROW_A_FMT(SourceIndex >= 0, "Consistent source count"); NextUses[V.ID().Value] = SourcesNextUses[SourceIndex]; if (!IsInRegisterFile(Old)) { IREmit->SetWriteCursorBefore(CodeNode); LastNode = nullptr; Ref Fill = InsertFill(Old); AssignReg(IR->GetOp(Fill), BlockIROp, Fill, IROp); RemapReg(Old, PhysicalRegister(Fill)); } } } for (int s = 0; s < NumArgs; ++s) { if (IROp->Args[s].IsInvalid()) { continue; } bool Kill = IROp->Args[s].HasKill(); IROp->Args[s].ClearKill(); Ref Node = IR->GetNode(IROp->Args[s]); auto ID = IR->GetID(Node).Value; auto Reg = SSAToReg[ID]; if (!Reg.IsInvalid()) { if (Kill) { LOGMAN_THROW_A_FMT(IsInRegisterFile(Node), "sources in file"); FreeReg(Reg); } IROp->Args[s].SetImmediate(Reg.Raw); } } // Assign destinations. if (GetHasDest(IROp->Op) && PhysicalRegister(CodeNode).IsInvalid()) { AssignReg(IROp, BlockIROp, CodeNode, IROp); } if (IsTrivial(CodeNode, IROp)) { // Delete instructions that only exist for RA IREmit->RemovePostRA(CodeNode); } else if (LastNode && TryPostRAMerge(LastNode, CodeNode, IROp)) { // Merge adjacent instructions IREmit->RemovePostRA(LastNode); LastNode = nullptr; } else { LastNode = CodeNode; } } if (AnySpilled) { LOGMAN_THROW_A_FMT(SourceIndex == 0, "Consistent source count in block"); } } PreferredReg.clear(); SSAToReg.clear(); SpillSlots.clear(); NextUses.clear(); Seen.clear(); IR->GetHeader()->PostRA = true; } fextl::unique_ptr CreateRegisterAllocationPass(const FEXCore::CPUIDEmu* CPUID) { return fextl::make_unique(CPUID); } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.h ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: ir|opts $end_info$ */ 
#pragma once #include "Interface/IR/PassManager.h" #include #include namespace FEXCore::IR { enum class RegClass : uint32_t; class RegisterAllocationPass : public FEXCore::IR::Pass { public: virtual void AddRegisters(RegClass Class, uint32_t RegisterCount) = 0; // Number of GPRs usable for pairs at start of GPR set. Must be even. uint32_t PairRegs {}; }; } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp ================================================ // SPDX-License-Identifier: MIT #include "FEXCore/Utils/LogManager.h" #include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h" #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" #include "FEXCore/IR/IR.h" #include "FEXCore/Utils/Profiler.h" #include "FEXCore/Core/HostFeatures.h" #include "Interface/Core/Addressing.h" #include #include #include #include // This file adds a pass to process X87 stack instructions. // These instructions are marked in IR.json with `X87: true` and are generated // by X87 guest instructions. // The way it works is that there's a virtual stack `StackData`, where we load and store // and apply the operations in a block of code. Once the block finishes, we emit the necessary operations // that we recorded onto the virtual stack. This allows us to save a lot of code movement // to and from stack registers, top management and valid flags. It also allows us to // perform memcpy optimizations like the one performed in STORESTACKMEM. // // By default we run on the fast path - i.e. we assume all values are in the stack and we have a complete // stack overview. However, if we encounter a value that's not in the virtual stack - maybe it was added // to the stack in a previous block, we move onto the slow path which loads and stores values to the stack // registers.
// Once in a slow path, we won't return to the fast pass until the beginning of the following block. namespace FEXCore::IR { // FIXME(pmatos): copy from OpcodeDispatcher.h inline uint32_t MMBaseOffset() { return static_cast(offsetof(Core::CPUState, mm[0][0])); } // Similar helper to the one in OpcodeDispatcher.h except we do not // need to handle flags, etc. template void DeriveOp(Ref& RefV, IROps NewOp, IREmitter::IRPair Expr) { Expr.first->Header.Op = NewOp; RefV = Expr; } enum class StackSlot { UNUSED, INVALID, VALID }; // FixedSizeStack is a model of the x87 Stack where each element in this // fixed size stack lives at an offset from top. The top of the stack is at // index 0. template class FixedSizeStack { public: struct StackSlotEntry final { StackSlot Type; T Value; }; static constexpr uint8_t size = 8; // Real top as an offset from stored top value (or the one at the beginning of the block) // For example, if we start and push a value to our simulated stack, because we don't // update top straight away the TopOffset is 1. // If SlowPath is true, then TopOffset is always zero. 
int8_t TopOffset = 0; FixedSizeStack() : buffer(FixedSizeStack::size, {StackSlot::UNUSED, T::Invalid}) {} void push(const T& Value) { rotate(); buffer.front() = {StackSlot::VALID, Value}; } // Rotate the elements with the direction controlled by Right void rotate(bool Right = true) { if (Right) { std::rotate(buffer.begin(), buffer.end() - 1, buffer.end()); TopOffset++; } else { std::rotate(buffer.begin(), buffer.begin() + 1, buffer.end()); TopOffset--; } } void pop() { buffer.front() = {StackSlot::INVALID, T::Invalid}; rotate(false); } const StackSlotEntry& top(size_t Offset = 0) const { return buffer[Offset]; } void setTop(T Value, size_t Offset = 0) { buffer[Offset] = {StackSlot::VALID, Value}; } bool isValid(size_t Offset) const { return buffer[Offset].first; } void clear() { for (auto& Elem : buffer) { Elem = {StackSlot::UNUSED, T::Invalid}; } TopOffset = 0; } void dump() const { LogMan::Msg::DFmt("-- Stack"); for (size_t i = 0; i < 8; i++) { const auto& [Valid, Element] = buffer[i]; if (Valid == StackSlot::VALID) { LogMan::Msg::DFmt("| ST{}: 0x{:x}", i, (uintptr_t)(Element.StackDataNode)); } else if (Valid == StackSlot::INVALID) { LogMan::Msg::DFmt("| ST{}: INVALID", i); } } LogMan::Msg::DFmt("--"); } void setTagInvalid(size_t Index) { buffer[Index].Type = StackSlot::INVALID; } // Returns a mask to set in AbridgedTagWord uint8_t getValidMask() { uint8_t Mask = 0; for (size_t i = 0; i < buffer.size(); i++) { if (buffer[i].Type == StackSlot::VALID) { Mask |= 1U << i; } } return Mask; } // Returns a mask to set in AbridgedTagWord uint8_t getInvalidMask() { uint8_t Mask = 0; for (size_t i = 0; i < buffer.size(); i++) { if (buffer[i].Type == StackSlot::INVALID) { Mask |= 1U << i; } } return Mask; } private: fextl::vector buffer; }; class X87StackOptimization final : public Pass { public: X87StackOptimization(const FEXCore::HostFeatures& Features, OpSize GPROpSize) : Features(Features) , GPROpSize(GPROpSize) { FEX_CONFIG_OPT(ReducedPrecision, X87REDUCEDPRECISION); 
ReducedPrecisionMode = ReducedPrecision; } void Run(IREmitter* Emit) override; private: const FEXCore::HostFeatures& Features; const OpSize GPROpSize; bool ReducedPrecisionMode; FEX_CONFIG_OPT(DisableVixlIndirectCalls, DISABLE_VIXL_INDIRECT_RUNTIME_CALLS); // Helpers Ref RotateRight8(uint32_t V, Ref Amount); void F80SplitStore_Helper(const IROp_StoreStackMem* Op, Ref StackNode, Ref AddrNode, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { IREmit->_StoreMemFPR(OpSize::i64Bit, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1); // Store the Upper part of the register (the remaining 2 bytes) into memory. AddressMode A {.Base = AddrNode, .Index = Op->Offset.IsInvalid() ? nullptr : Offset, .Offset = 8, .IndexType = MemOffsetType::SXTX, .IndexScale = OffsetScale, .AddrSize = OpSize::i64Bit}; A = SelectAddressMode(IREmit, A, GPROpSize, Features.SupportsTSOImm9, false, false, OpSize::i16Bit); IREmit->_StoreMemGPR(OpSize::i16Bit, Upper, A.Base, A.Index, OpSize::i64Bit, MemOffsetType::SXTX, A.IndexScale); } void Store80BitToMem(const IROp_StoreStackMem* Op, Ref StackNode, Ref AddrNode, Ref Offset, OpSize Align, MemOffsetType OffsetType, uint8_t OffsetScale) { if (Features.SupportsSVE128 || Features.SupportsSVE256) { AddressMode A {.Base = AddrNode, .Index = Op->Offset.IsInvalid() ? 
nullptr : Offset, .IndexType = MemOffsetType::SXTX, .IndexScale = OffsetScale, .AddrSize = OpSize::i64Bit}; AddrNode = LoadEffectiveAddress(IREmit, A, GPROpSize, false); IREmit->_StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, AddrNode); } else { F80SplitStore_Helper(Op, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); } } void StoreStackMem_Helper(const IROp_StoreStackMem* Op, Ref StackNode) { LOGMAN_THROW_A_FMT(!ReducedPrecisionMode, "Full precision mode expected."); Ref AddrNode = IR->GetNode(Op->Addr); Ref Offset = IR->GetNode(Op->Offset); OpSize Align = Op->Align; MemOffsetType OffsetType = Op->OffsetType; uint8_t OffsetScale = Op->OffsetScale; // Normal Precision Mode switch (Op->StoreSize) { case OpSize::i32Bit: case OpSize::i64Bit: { StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); IREmit->_StoreMemFPR(Op->StoreSize, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); break; } case OpSize::f80Bit: { Store80BitToMem(Op, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); break; } default: ERROR_AND_DIE_FMT("Unsupported x87 size"); } } // Performs a store to memory from a value the stack passed in as StackNode. // This is the version dealing with the reduced precision case. 
void StoreStackMem_Reduced_Helper(const IROp_StoreStackMem* Op, Ref StackNode) { LOGMAN_THROW_A_FMT(ReducedPrecisionMode, "Reduced precision mode expected."); Ref AddrNode = IR->GetNode(Op->Addr); Ref Offset = IR->GetNode(Op->Offset); OpSize Align = Op->Align; MemOffsetType OffsetType = Op->OffsetType; uint8_t OffsetScale = Op->OffsetScale; switch (Op->StoreSize) { case OpSize::i32Bit: { StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode); [[fallthrough]]; } case OpSize::i64Bit: { IREmit->_StoreMemFPR(Op->StoreSize, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); break; } case OpSize::f80Bit: { StackNode = IREmit->_F80CVTTo(StackNode, OpSize::i64Bit); Store80BitToMem(Op, StackNode, AddrNode, Offset, Align, OffsetType, OffsetScale); break; } default: ERROR_AND_DIE_FMT("Unsupported x87 size"); } } // Handles a Unary operation. // Takes the op we are handling, the Node for the reduced precision case and the node for the normal case. // Depending on the type of Op64, we might need to pass a couple of extra constant arguments, this happens // when VFOp64 is true. void HandleUnop(IROps Op64, bool VFOp64, IROps Op80); void HandleBinopValue(IROps Op64, bool VFOp64, IROps Op80, uint8_t DestStackOffset, bool MarkDestValid, uint8_t StackOffset, Ref ValueNode, bool Reverse = false); void HandleBinopStack(IROps Op64, bool VFOp64, IROps Op80, uint8_t DestStackOffset, uint8_t StackOffset1, uint8_t StackOffset2, bool Reverse = false); // Top Management Helpers /// Set the valid tag for Value as valid (if Valid is true), or invalid (if Valid is false). 
void SetX87ValidTag(uint8_t Offset, bool Valid); // Generates slow code to load/store a value from an offset from the top of the stack Ref LoadStackValueAtOffset_Slow(uint8_t Offset = 0); void StoreStackValueAtOffset_Slow(Ref Value, uint8_t Offset = 0, bool SetValid = true); // Update Top value in slow path for a pop void UpdateTopForPop_Slow(); void UpdateTopForPush_Slow(); // Synchronizes the current simulated stack with the actual values. // Returns a new value for Top, that's synchronized between the simulated stack // and the actual FPU stack. Ref SynchronizeStackValues(); // Moves us from the fast to the slow path if ShouldMigrate is true. void MigrateToSlowPathIf(bool ShouldMigrate); // Top Cache Management Ref GetTopWithCache_Slow(); Ref GetOffsetTopWithCache_Slow(uint8_t Offset, bool Reverse = false); Ref GetOffsetTopAddressWithCache_Slow(uint8_t Offset); void SetTopWithCache_Slow(Ref Value); Ref GetX87ValidTag_Slow(uint8_t Offset); // Resets fields to initial values void Reset(); struct StackMemberInfo { StackMemberInfo() = delete; StackMemberInfo(Ref Data) : StackDataNode(Data) {} StackMemberInfo(Ref Data, Ref Source, OpSize Size) : StackDataNode(Data) , Source({Size, Source}) {} Ref StackDataNode {}; // Reference to the data in the Stack. // This is the source data node in the stack format, possibly converted to 64/80 bits. struct StackMemberData final { OpSize Size; Ref Node; }; static const StackMemberInfo Invalid; // Tuple is only valid if we have information about the Source of the Stack Data Node. // In it's valid then OpSize is the original source size and Ref is the original source node. std::optional Source {}; }; // StackData, TopCache need to be always properly set to ensure // they reflect the current state of the FPU. This sync only makes sense while // taking the fast path. Once in the slow path, these don't make sense anymore // and we are syncing everything. 
// Index on vector is offset to top value at start of block // If slow path is true, then StackData is always empty. FixedSizeStack StackData; void InvalidateCaches(); void InvalidateCachedRegs(); // Path Migration helper management std::optional MigrateToSlowPath_IfInvalid(uint8_t Offset = 0); Ref LoadStackValue(uint8_t Offset = 0); void StoreStackValue(Ref Value, uint8_t Offset = 0, bool SetValid = false); void StackPop(); // Cache for Constants // ConstantPoll[i] has IREmit->_Constant(i); std::array ConstantPool {}; Ref GetConstant(ssize_t Offset); // Cached value for Top // If slowpath is false, then TopCache is nullptr. bool FlushTopPending = false; std::array FlushValuesPending {}; bool FlushValidPending = false; void FlushCachedRegs(); Ref GetFTW(); Ref FTWCached {}; std::array TopOffsetCache {}; std::array TopOffsetAddressCache {}; std::array TopValueCache {}; std::array TopValidCache {}; // Are we on the slow path? // Once we enter the slow path, we never come out. // This just simplifies the code atm. If there's a need to return to the fast path in the future // we can implement that but I would expect that there would be very few cases where that's necessary. // On the slow path TopCache is always the last obtained version of top. 
// TopOffset is ignored bool SlowPath = false; // Keeping IREmitter not to pass arguments around IREmitter* IREmit = nullptr; IRListView* IR = nullptr; }; inline const X87StackOptimization::StackMemberInfo X87StackOptimization::StackMemberInfo::Invalid {nullptr}; inline void X87StackOptimization::InvalidateCaches() { InvalidateCachedRegs(); ConstantPool.fill(nullptr); } inline void X87StackOptimization::InvalidateCachedRegs() { FlushCachedRegs(); FTWCached = {}; TopOffsetCache.fill(nullptr); TopOffsetAddressCache.fill(nullptr); TopValueCache.fill(nullptr); TopValidCache.fill(StackSlot::UNUSED); } inline void X87StackOptimization::Reset() { SlowPath = false; StackData.clear(); InvalidateCaches(); } inline Ref X87StackOptimization::GetConstant(ssize_t Offset) { if (Offset < 0 || Offset >= X87StackOptimization::ConstantPool.size()) { // not dealt by pool return IREmit->_Constant(Offset); } if (ConstantPool[Offset] == nullptr) { ConstantPool[Offset] = IREmit->_Constant(Offset); } return ConstantPool[Offset]; } inline void X87StackOptimization::MigrateToSlowPathIf(bool ShouldMigrate) { if (ShouldMigrate && !SlowPath) { SynchronizeStackValues(); StackData.clear(); SlowPath = true; } } inline Ref X87StackOptimization::GetTopWithCache_Slow() { if (!TopOffsetCache[0]) { TopOffsetCache[0] = IREmit->_LoadContextGPR(OpSize::i8Bit, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); } return TopOffsetCache[0]; } inline Ref X87StackOptimization::GetOffsetTopWithCache_Slow(uint8_t Offset, bool Reverse) { if (Reverse) { Offset = 8 - Offset; } Offset &= 7; if (TopOffsetCache[Offset]) { return TopOffsetCache[Offset]; } auto* OffsetTop = GetTopWithCache_Slow(); if (Offset != 0) { OffsetTop = IREmit->_And(OpSize::i32Bit, IREmit->Add(OpSize::i32Bit, OffsetTop, Offset), GetConstant(7)); // GetTopWithCache_Slow already sets the cache so we don't need to set it here for offset == 0 TopOffsetCache[Offset] = OffsetTop; } return OffsetTop; } inline Ref 
X87StackOptimization::GetOffsetTopAddressWithCache_Slow(uint8_t Offset) { if (TopOffsetAddressCache[Offset]) { return TopOffsetAddressCache[Offset]; } Ref OffsetRef = GetOffsetTopWithCache_Slow(Offset); TopOffsetAddressCache[Offset] = IREmit->_FormContextAddress(OpSize::i64Bit, OffsetRef, 16); return TopOffsetAddressCache[Offset]; } inline void X87StackOptimization::SetTopWithCache_Slow(Ref Value) { InvalidateCachedRegs(); TopOffsetCache[0] = Value; FlushTopPending = true; } inline Ref X87StackOptimization::GetFTW() { if (!FTWCached) { FTWCached = IREmit->_LoadContextGPR(OpSize::i8Bit, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); } return FTWCached; } inline void X87StackOptimization::SetX87ValidTag(uint8_t Offset, bool Valid) { TopValidCache[Offset] = Valid ? StackSlot::VALID : StackSlot::INVALID; FlushValidPending = true; } inline Ref X87StackOptimization::GetX87ValidTag_Slow(uint8_t Offset) { switch (TopValidCache[Offset]) { case StackSlot::UNUSED: return IREmit->_And(OpSize::i32Bit, IREmit->_Lshr(OpSize::i32Bit, GetFTW(), GetOffsetTopWithCache_Slow(Offset)), GetConstant(1)); case StackSlot::INVALID: return GetConstant(0); case StackSlot::VALID: return GetConstant(1); } } inline Ref X87StackOptimization::LoadStackValueAtOffset_Slow(uint8_t Offset) { OrderedNode* TopOffsetAddress = GetOffsetTopAddressWithCache_Slow(Offset); auto Size = ReducedPrecisionMode ? 
OpSize::i64Bit : OpSize::i128Bit; if (!TopValueCache[Offset]) { TopValueCache[Offset] = IREmit->_LoadMemFPR(Size, TopOffsetAddress, IREmit->_InlineConstant(MMBaseOffset()), Size, MemOffsetType::SXTX, 1); } return TopValueCache[Offset]; } inline void X87StackOptimization::StoreStackValueAtOffset_Slow(Ref Value, uint8_t Offset, bool SetValid) { TopValueCache[Offset] = Value; FlushValuesPending[Offset] = true; // mark it valid // In some cases we might already know it has been previously set as valid so we don't need to do it again if (SetValid) { SetX87ValidTag(Offset, true); } } inline Ref X87StackOptimization::RotateRight8(uint32_t V, Ref Amount) { return IREmit->_Lshr(OpSize::i32Bit, GetConstant(V | (V << 8)), Amount); } inline std::optional X87StackOptimization::MigrateToSlowPath_IfInvalid(uint8_t Offset) { const auto& [Valid, StackMember] = StackData.top(Offset); MigrateToSlowPathIf(Valid != StackSlot::VALID); if (Valid == StackSlot::VALID) { return StackMember; } return {}; } inline Ref X87StackOptimization::LoadStackValue(uint8_t Offset) { const auto& StackValue = MigrateToSlowPath_IfInvalid(Offset); return SlowPath ? 
LoadStackValueAtOffset_Slow(Offset) : StackValue->StackDataNode; } inline void X87StackOptimization::StoreStackValue(Ref Value, uint8_t Offset, bool SetValid) { if (SlowPath) { StoreStackValueAtOffset_Slow(Value, Offset, SetValid); } else { StackData.setTop(StackMemberInfo {Value}, Offset); } } inline void X87StackOptimization::StackPop() { if (SlowPath) { UpdateTopForPop_Slow(); } else { StackData.pop(); } } void X87StackOptimization::HandleUnop(IROps Op64, bool VFOp64, IROps Op80) { Ref St0 = LoadStackValue(); Ref Value {}; if (ReducedPrecisionMode) { if (VFOp64) { DeriveOp(Value, Op64, IREmit->_VFSqrt(OpSize::i64Bit, OpSize::i64Bit, St0)); } else { DeriveOp(Value, Op64, IREmit->_F64SIN(St0)); } } else { DeriveOp(Value, Op80, IREmit->_F80SQRT(St0)); } StoreStackValue(Value); } void X87StackOptimization::HandleBinopValue(IROps Op64, bool VFOp64, IROps Op80, uint8_t DestStackOffset, bool MarkDestValid, uint8_t StackOffset, Ref ValueNode, bool Reverse) { LOGMAN_THROW_A_FMT(!Reverse || VFOp64, "There are no reverse operations using non VFOp64 ops"); auto StackNode = LoadStackValue(StackOffset); Ref Node = {}; if (ReducedPrecisionMode) { if (Reverse) { DeriveOp(Node, Op64, IREmit->_VFAdd(OpSize::i64Bit, OpSize::i64Bit, ValueNode, StackNode)); } else { if (VFOp64) { DeriveOp(Node, Op64, IREmit->_VFAdd(OpSize::i64Bit, OpSize::i64Bit, StackNode, ValueNode)); } else { DeriveOp(Node, Op64, IREmit->_F64FPREM(StackNode, ValueNode)); } } } else { if (Reverse) { DeriveOp(Node, Op80, IREmit->_F80Add(ValueNode, StackNode)); } else { DeriveOp(Node, Op80, IREmit->_F80Add(StackNode, ValueNode)); } } StoreStackValue(Node, DestStackOffset, MarkDestValid && StackOffset != DestStackOffset); } void X87StackOptimization::HandleBinopStack(IROps Op64, bool VFOp64, IROps Op80, uint8_t DestStackOffset, uint8_t StackOffset1, uint8_t StackOffset2, bool Reverse) { auto StackNode = LoadStackValue(StackOffset2); HandleBinopValue(Op64, VFOp64, Op80, DestStackOffset, StackOffset2 != 
DestStackOffset, StackOffset1, StackNode, Reverse); } inline void X87StackOptimization::UpdateTopForPop_Slow() { // Pop the top of the x87 stack GetOffsetTopWithCache_Slow(1); std::rotate(TopOffsetCache.begin(), std::next(TopOffsetCache.begin()), TopOffsetCache.end()); std::rotate(TopOffsetAddressCache.begin(), std::next(TopOffsetAddressCache.begin()), TopOffsetAddressCache.end()); std::rotate(TopValueCache.begin(), std::next(TopValueCache.begin()), TopValueCache.end()); std::rotate(FlushValuesPending.begin(), std::next(FlushValuesPending.begin()), FlushValuesPending.end()); std::rotate(TopValidCache.begin(), std::next(TopValidCache.begin()), TopValidCache.end()); FlushTopPending = true; } inline void X87StackOptimization::UpdateTopForPush_Slow() { // Pop the top of the x87 stack GetOffsetTopWithCache_Slow(1, true); std::rotate(TopOffsetCache.begin(), std::prev(TopOffsetCache.end()), TopOffsetCache.end()); std::rotate(TopOffsetAddressCache.begin(), std::prev(TopOffsetAddressCache.end()), TopOffsetAddressCache.end()); std::rotate(TopValueCache.begin(), std::prev(TopValueCache.end()), TopValueCache.end()); std::rotate(FlushValuesPending.begin(), std::prev(FlushValuesPending.end()), FlushValuesPending.end()); std::rotate(TopValidCache.begin(), std::prev(TopValidCache.end()), TopValidCache.end()); FlushTopPending = true; } void X87StackOptimization::FlushCachedRegs() { if (FlushTopPending) { IREmit->_StoreContextGPR(OpSize::i8Bit, TopOffsetCache[0], offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); FlushTopPending = false; } auto Size = ReducedPrecisionMode ? 
OpSize::i64Bit : OpSize::i128Bit; for (size_t i = 0; i < FlushValuesPending.size(); i++) { if (FlushValuesPending[i]) { OrderedNode* TopOffsetAddress = GetOffsetTopAddressWithCache_Slow(i); IREmit->_StoreMemFPR(Size, TopValueCache[i], TopOffsetAddress, IREmit->_InlineConstant(MMBaseOffset()), Size, MemOffsetType::SXTX, 1); // store FlushValuesPending[i] = false; } } if (FlushValidPending) { uint8_t ValidMask = 0; uint8_t InvalidMask = 0; for (auto It = TopValidCache.rbegin(); It != TopValidCache.rend(); It++) { ValidMask <<= 1; InvalidMask <<= 1; if (*It == StackSlot::VALID) { ValidMask |= 1; } else if (*It == StackSlot::INVALID) { InvalidMask |= 1; } } if (ValidMask || InvalidMask) { Ref NewFTW = [&]() { if (ValidMask == 0xff || InvalidMask == 0xff) { // If InvalidMask == 0xff then ValidMask = 0 return GetConstant(ValidMask); } else { Ref NewFTW = GetFTW(); Ref RotAmount {}; if (std::popcount(ValidMask) == 1) { uint8_t BitIdx = std::countr_zero(ValidMask); Ref RegMask = IREmit->_Lshl(OpSize::i32Bit, GetConstant(1), GetOffsetTopWithCache_Slow(BitIdx)); NewFTW = IREmit->_Or(OpSize::i32Bit, NewFTW, RegMask); } else if (ValidMask) { RotAmount = IREmit->_Sub(OpSize::i32Bit, GetConstant(8), GetTopWithCache_Slow()); // perform a rotate right on mask by top NewFTW = IREmit->_Or(OpSize::i32Bit, NewFTW, RotateRight8(ValidMask, RotAmount)); } if (std::popcount(InvalidMask) == 1) { uint8_t BitIdx = std::countr_zero(InvalidMask); Ref RegMask = IREmit->_Lshl(OpSize::i32Bit, GetConstant(1), GetOffsetTopWithCache_Slow(BitIdx)); NewFTW = IREmit->_Andn(OpSize::i32Bit, NewFTW, RegMask); } else if (InvalidMask) { if (!RotAmount) { RotAmount = IREmit->_Sub(OpSize::i32Bit, GetConstant(8), GetTopWithCache_Slow()); } NewFTW = IREmit->_Andn(OpSize::i32Bit, NewFTW, RotateRight8(InvalidMask, RotAmount)); } return NewFTW; } }(); IREmit->_StoreContextGPR(OpSize::i8Bit, NewFTW, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); FTWCached = NewFTW; } FlushValidPending = false; } } // We 
synchronize stack values in a few occasions but one of the most important of those, // is when we move from fast to a slow path and need to make sure that the context is properly // written. Ref X87StackOptimization::SynchronizeStackValues() { if (SlowPath) { return GetTopWithCache_Slow(); } // Store new top which is now the original top minus recorded top offset // Careful with underflow wraparound. const auto TopOffset = StackData.TopOffset; if (TopOffset != 0) { Ref NewTop = GetOffsetTopWithCache_Slow(TopOffset, true); SetTopWithCache_Slow(NewTop); } StackData.TopOffset = 0; // Before leaving we need to write the current values in the stack to // context so that the values are correct. Copy SourceDataNode in the // stack to the respective mmX register. Ref TopValue = GetTopWithCache_Slow(); for (size_t i = 0; i < StackData.size; ++i) { const auto& [Valid, StackMember] = StackData.top(i); if (Valid == StackSlot::VALID) { StoreStackValueAtOffset_Slow(StackMember.StackDataNode, i, false); } } { // Set valid tags uint8_t ValidMask = StackData.getValidMask(); uint8_t InvalidMask = StackData.getInvalidMask(); for (auto& Elem : TopValidCache) { Elem = (ValidMask & 1) ? StackSlot::VALID : ((InvalidMask & 1) ? StackSlot::INVALID : StackSlot::UNUSED); ValidMask >>= 1; InvalidMask >>= 1; } FlushValidPending = true; } return TopValue; } void X87StackOptimization::Run(IREmitter* Emit) { FEXCORE_PROFILE_SCOPED("PassManager::x87StackOpt"); auto CurrentIR = Emit->ViewIR(); auto* HeaderOp = CurrentIR.GetHeader(); LOGMAN_THROW_A_FMT(HeaderOp->Header.Op == OP_IRHEADER, "First op wasn't IRHeader"); if (!HeaderOp->HasX87) { // If there is no x87 in this, just early exit. return; } // Initialize IREmit member IREmit = Emit; IR = &CurrentIR; // Run optimization proper for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { auto BlockIROp = BlockHeader->CW(); // Each time we deal with a new block we need to start over. 
// The optimization should run per-block Reset(); IREmit->SetCurrentCodeBlock(BlockNode); for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { if (!LoweredX87(IROp->Op)) { continue; } IREmit->SetWriteCursor(CodeNode); switch (IROp->Op) { case OP_F80ADDSTACK: { const auto* Op = IROp->C(); HandleBinopStack(OP_VFADD, true, OP_F80ADD, Op->SrcStack1, Op->SrcStack1, Op->SrcStack2); break; } case OP_F80SUBSTACK: { const auto* Op = IROp->C(); HandleBinopStack(OP_VFSUB, true, OP_F80SUB, Op->DstStack, Op->SrcStack1, Op->SrcStack2); break; } case OP_F80MULSTACK: { const auto* Op = IROp->C(); HandleBinopStack(OP_VFMUL, true, OP_F80MUL, Op->SrcStack1, Op->SrcStack1, Op->SrcStack2); break; } case OP_F80DIVSTACK: { const auto* Op = IROp->C(); HandleBinopStack(OP_VFDIV, true, OP_F80DIV, Op->DstStack, Op->SrcStack1, Op->SrcStack2); break; } case OP_F80FPREMSTACK: { HandleBinopStack(OP_F64FPREM, false, OP_F80FPREM, 0, 0, 1); break; } case OP_F80FPREM1STACK: { HandleBinopStack(OP_F64FPREM1, false, OP_F80FPREM1, 0, 0, 1); break; } case OP_F80SCALESTACK: { HandleBinopStack(OP_F64SCALE, false, OP_F80SCALE, 0, 0, 1); break; } case OP_F80FYL2XSTACK: { HandleBinopStack(OP_F64FYL2X, false, OP_F80FYL2X, 1, 0, 1); StackPop(); break; } case OP_F80ATANSTACK: { HandleBinopStack(OP_F64ATAN, false, OP_F80ATAN, 1, 1, 0); StackPop(); break; } case OP_F80ADDVALUE: { const auto* Op = IROp->C(); HandleBinopValue(OP_VFADD, true, OP_F80ADD, 0, true, Op->SrcStack, CurrentIR.GetNode(Op->X80Src)); break; } case OP_F80SUBRVALUE: case OP_F80SUBVALUE: { const auto* Op = IROp->C(); HandleBinopValue(OP_VFSUB, true, OP_F80SUB, 0, true, Op->SrcStack, CurrentIR.GetNode(Op->X80Src), IROp->Op == OP_F80SUBRVALUE); break; } case OP_F80DIVRVALUE: case OP_F80DIVVALUE: { const auto* Op = IROp->C(); HandleBinopValue(OP_VFDIV, true, OP_F80DIV, 0, true, Op->SrcStack, CurrentIR.GetNode(Op->X80Src), IROp->Op == OP_F80DIVRVALUE); break; } case OP_F80MULVALUE: { const auto* Op = IROp->C(); HandleBinopValue(OP_VFMUL, 
true, OP_F80MUL, 0, true, Op->SrcStack, CurrentIR.GetNode(Op->X80Src)); break; } case OP_F80SQRTSTACK: { HandleUnop(OP_VFSQRT, true, OP_F80SQRT); break; } case OP_F80SINSTACK: { HandleUnop(OP_F64SIN, false, OP_F80SIN); break; } case OP_F80COSSTACK: { HandleUnop(OP_F64COS, false, OP_F80COS); break; } case OP_F80F2XM1STACK: { HandleUnop(OP_F64F2XM1, false, OP_F80F2XM1); break; } case OP_F80PTANSTACK: { HandleUnop(OP_F64TAN, false, OP_F80TAN); Ref OneConst {}; if (ReducedPrecisionMode) { OneConst = IREmit->_VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, GetConstant(0x3FF0000000000000)); } else { OneConst = IREmit->_LoadNamedVectorConstant(OpSize::i128Bit, NamedVectorConstant::NAMED_VECTOR_X87_ONE); } if (SlowPath) { UpdateTopForPush_Slow(); StoreStackValueAtOffset_Slow(OneConst); } else { StackData.push(StackMemberInfo {OneConst}); } break; } case OP_F80SINCOSSTACK: { Ref St0 = LoadStackValue(); Ref SinValue {}; Ref CosValue {}; if (ReducedPrecisionMode) { SinValue = IREmit->_F64SIN(St0); CosValue = IREmit->_F64COS(St0); } #ifdef VIXL_SIMULATOR else if (DisableVixlIndirectCalls() == 0) { SinValue = IREmit->_F80SIN(St0); CosValue = IREmit->_F80COS(St0); } #endif else { SinValue = IREmit->_AllocateFPR(OpSize::i128Bit, OpSize::i128Bit); CosValue = IREmit->_AllocateFPR(OpSize::i128Bit, OpSize::i128Bit); IREmit->_F80SINCOS(St0, SinValue, CosValue); } // Push values if (SlowPath) { StoreStackValueAtOffset_Slow(SinValue, 0, false); UpdateTopForPush_Slow(); StoreStackValueAtOffset_Slow(CosValue, 0, true); } else { StackData.setTop(StackMemberInfo {SinValue}); StackData.push(StackMemberInfo {CosValue}); } break; } case OP_INITSTACK: { StackData.clear(); InvalidateCachedRegs(); break; } case OP_INVALIDATESTACK: { const auto* Op = IROp->C(); auto Offset = Op->StackLocation; if (Offset != 0xff) { // invalidate single offset if (SlowPath) { SetX87ValidTag(Offset, false); } else { StackData.setTagInvalid(Offset); } } else { // invalidate all if (SlowPath) { 
TopValidCache.fill(StackSlot::INVALID); FlushValidPending = true; } else { for (size_t i = 0; i < StackData.size; i++) { StackData.setTagInvalid(i); } } } break; } case OP_PUSHSTACK: { const auto* Op = IROp->C(); auto* SourceNode = CurrentIR.GetNode(Op->X80Src); if (SlowPath) { UpdateTopForPush_Slow(); StoreStackValueAtOffset_Slow(SourceNode); } else { auto* SourceNode = CurrentIR.GetNode(Op->X80Src); if (Op->OriginalValue.IsInvalid()) { // No original value to track - just push the converted data StackData.push(StackMemberInfo {SourceNode}); } else { auto* OriginalNode = CurrentIR.GetNode(Op->OriginalValue); StackData.push(StackMemberInfo {SourceNode, OriginalNode, Op->LoadSize}); } } break; } case OP_COPYPUSHSTACK: { const auto* Op = IROp->C(); auto Offset = Op->StackLocation; auto Value = MigrateToSlowPath_IfInvalid(Offset); if (SlowPath) { Ref St0 = LoadStackValueAtOffset_Slow(Offset); UpdateTopForPush_Slow(); StoreStackValueAtOffset_Slow(St0); } else { StackData.push(*Value); } break; } case OP_READSTACKVALUE: { const auto* Op = IROp->C(); auto Offset = Op->StackLocation; Ref NewValue = LoadStackValue(Offset); IREmit->ReplaceUsesWithAfter(CodeNode, NewValue, CodeNode); break; } case OP_STACKVALIDTAG: { // Returns 0 if value is valid and 1 otherwise. const auto* Op = IROp->C(); auto Offset = Op->StackLocation; auto Value = MigrateToSlowPath_IfInvalid(Offset); Ref Tag {}; if (SlowPath) { Tag = GetX87ValidTag_Slow(Offset); } else { Tag = Value ? GetConstant(1) : GetConstant(0); } IREmit->ReplaceUsesWithAfter(CodeNode, Tag, CodeNode); break; } case OP_STORESTACKMEM: { const auto* Op = IROp->C(); const auto& Value = MigrateToSlowPath_IfInvalid(); Ref StackNode = SlowPath ? 
LoadStackValueAtOffset_Slow() : Value->StackDataNode; Ref AddrNode = CurrentIR.GetNode(Op->Addr); Ref Offset = CurrentIR.GetNode(Op->Offset); OpSize Align = Op->Align; MemOffsetType OffsetType = Op->OffsetType; uint8_t OffsetScale = Op->OffsetScale; // On the fast path we can optimize memory copies. // If we are doing: // fld dword [rax] // fst dword [rbx] // We can optimize this to: // ldr w2, [x0] // str w2, [x1] // or similar. As long as the source size and dest size are one and the same. // This will avoid any conversions between source and stack element size and conversion back. OpSize StoreSize = Op->StoreSize; LOGMAN_THROW_A_FMT(Op->StoreSize == OpSize::i32Bit || Op->StoreSize == OpSize::i64Bit || Op->StoreSize == OpSize::f80Bit, "Invalid store size in x87 store stack mem"); if (!SlowPath && Value->Source && Value->Source->Size == StoreSize) { Ref SourceValue = Value->Source->Node; if (Op->StoreSize == OpSize::f80Bit) { Store80BitToMem(Op, SourceValue, AddrNode, Offset, Align, OffsetType, OffsetScale); } else { IREmit->_StoreMemFPR(StoreSize, SourceValue, AddrNode, Offset, Align, OffsetType, OffsetScale); } break; } if (ReducedPrecisionMode) { StoreStackMem_Reduced_Helper(Op, StackNode); break; } StoreStackMem_Helper(Op, StackNode); break; } case OP_STORESTACKTOSTACK: { // stores top of stack in another place in stack. const auto* Op = IROp->C(); auto Offset = Op->StackLocation; if (Offset != 0) { auto Value = MigrateToSlowPath_IfInvalid(); // Need to store st0 to stack location - basically a copy. 
if (SlowPath) { StoreStackValueAtOffset_Slow(LoadStackValueAtOffset_Slow(), Offset); } else { StackData.setTop(*Value, Offset); } } break; } case OP_POPSTACKDESTROY: { if (SlowPath) { SetX87ValidTag(0, false); } StackPop(); break; } case OP_F80STACKXCHANGE: { const auto* Op = IROp->C(); auto Offset = Op->SrcStack; if (Offset == 0) { // No-op break; } const auto [ValidTop, StackMemberTop] = StackData.top(0); const auto [ValidOffset, StackMemberOffset] = StackData.top(Offset); if (ValidTop != StackSlot::VALID || ValidOffset != StackSlot::VALID) { // Slow path: do actual memory operations Ref ValueTop = LoadStackValue(); Ref ValueOffset = LoadStackValue(Offset); StoreStackValue(ValueOffset); StoreStackValue(ValueTop, Offset); } else { // Fast path: swap complete StackMemberInfo preserving Source metadata StackData.setTop(StackMemberOffset, 0); StackData.setTop(StackMemberTop, Offset); } break; } case OP_F80STACKCHANGESIGN: { Ref Value = LoadStackValue(); // We need a couple of intermediate instructions to change the sign // of a value Ref ResultNode {}; if (ReducedPrecisionMode) { ResultNode = IREmit->_VFNeg(OpSize::i64Bit, OpSize::i64Bit, Value); } else { Ref HelperNode = IREmit->_LoadNamedVectorConstant(OpSize::i128Bit, IR::NamedVectorConstant::NAMED_VECTOR_F80_SIGN_MASK); ResultNode = IREmit->_VXor(OpSize::i128Bit, OpSize::i8Bit, Value, HelperNode); } StoreStackValue(ResultNode); break; } case OP_F80STACKABS: { Ref Value = LoadStackValue(); Ref ResultNode {}; if (ReducedPrecisionMode) { ResultNode = IREmit->_VFAbs(OpSize::i64Bit, OpSize::i64Bit, Value); } else { // Intermediate insts Ref HelperNode = IREmit->_LoadNamedVectorConstant(OpSize::i128Bit, IR::NamedVectorConstant::NAMED_VECTOR_F80_SIGN_MASK); ResultNode = IREmit->_VAndn(OpSize::i128Bit, OpSize::i8Bit, Value, HelperNode); } StoreStackValue(ResultNode); break; } case OP_F80CMPSTACK: { const auto* Op = IROp->C(); auto Offset = Op->SrcStack; Ref StackValue1 = LoadStackValue(); Ref StackValue2 = 
LoadStackValue(Offset); Ref CmpNode {}; if (ReducedPrecisionMode) { CmpNode = IREmit->_FCmp(OpSize::i64Bit, StackValue1, StackValue2); } else { CmpNode = IREmit->_F80Cmp(StackValue1, StackValue2); } IREmit->ReplaceUsesWithAfter(CodeNode, CmpNode, CodeNode); break; } case OP_F80STACKTEST: { const auto* Op = IROp->C(); auto Offset = Op->SrcStack; auto StackNode = LoadStackValue(Offset); Ref ZeroConst = IREmit->_VCastFromGPR(ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit, OpSize::i64Bit, GetConstant(0)); Ref CmpNode {}; if (ReducedPrecisionMode) { CmpNode = IREmit->_FCmp(OpSize::i64Bit, StackNode, ZeroConst); } else { CmpNode = IREmit->_F80Cmp(StackNode, ZeroConst); } IREmit->ReplaceUsesWithAfter(CodeNode, CmpNode, CodeNode); break; } case OP_F80CMPVALUE: { const auto* Op = IROp->C(); const auto& Value = CurrentIR.GetNode(Op->X80Src); auto StackNode = LoadStackValue(); Ref CmpNode {}; if (ReducedPrecisionMode) { CmpNode = IREmit->_FCmp(OpSize::i64Bit, StackNode, Value); } else { CmpNode = IREmit->_F80Cmp(StackNode, Value); } IREmit->ReplaceUsesWithAfter(CodeNode, CmpNode, CodeNode); break; } case OP_SYNCSTACKTOSLOW: { // This synchronizes stack values but doesn't necessarily moves us off the FastPath! 
Ref NewTop = SynchronizeStackValues(); FlushCachedRegs(); IREmit->ReplaceUsesWithAfter(CodeNode, NewTop, CodeNode); break; } case OP_STACKFORCESLOW: { MigrateToSlowPathIf(true); InvalidateCachedRegs(); break; } case OP_INCSTACKTOP: { if (SlowPath) { UpdateTopForPop_Slow(); } else { StackData.rotate(false); } break; } case OP_DECSTACKTOP: { if (SlowPath) { UpdateTopForPush_Slow(); } else { StackData.rotate(true); } break; } case OP_F80ROUNDSTACK: { Ref St0 = LoadStackValue(); Ref Value {}; if (ReducedPrecisionMode) { Value = IREmit->_Vector_FToI(OpSize::i64Bit, OpSize::i64Bit, St0, RoundMode::Host); } else { Value = IREmit->_F80Round(St0); } StoreStackValue(Value); break; } case OP_F80VBSLSTACK: { const auto* Op = IROp->C(); auto StackOffset1 = Op->SrcStack1; auto StackOffset2 = Op->SrcStack2; Ref Value1 = LoadStackValue(StackOffset1); Ref Value2 = LoadStackValue(StackOffset2); Ref StackNode = IREmit->_VBSL(OpSize::i128Bit, CurrentIR.GetNode(Op->VectorMask), Value1, Value2); StoreStackValue(StackNode, 0, StackOffset1 && StackOffset2); break; } default: LOGMAN_THROW_A_FMT(false, "IROp was expected to be lowered"); } IREmit->Remove(CodeNode); } auto Last = CurrentIR.at(BlockIROp->Last); --Last; auto [LastCodeNode, LastIROp] = Last(); LOGMAN_THROW_A_FMT(IsBlockExit(LastIROp->Op), "must be exit"); IREmit->SetWriteCursorBefore(LastCodeNode); SynchronizeStackValues(); FlushCachedRegs(); } return; } fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures& Features, OpSize GPROpSize) { return fextl::make_unique(Features, GPROpSize); } } // namespace FEXCore::IR ================================================ FILE: FEXCore/Source/Interface/IR/Passes.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include namespace FEXCore { class CPUIDEmu; struct HostFeatures; } // namespace FEXCore namespace FEXCore::Utils { class IntrusivePooledAllocator; } namespace FEXCore::IR { class Pass; class 
RegisterAllocationPass;

fextl::unique_ptr CreateDeadFlagCalculationEliminination();
fextl::unique_ptr CreateRegisterAllocationPass(const FEXCore::CPUIDEmu* CPUID);
fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures&, OpSize GPROpSize);

namespace Validation {
fextl::unique_ptr CreateIRValidation();
} // namespace Validation

namespace Debug {
fextl::unique_ptr CreateIRDumper();
}
} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Interface/IR/RegisterAllocationData.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include "IR.h"
#include

namespace FEXCore::IR {
// A register assignment packed into one byte: a register class together with a register index.
// The same byte can be accessed whole through Raw or split through the Reg/Class bit-fields.
union PhysicalRegister {
  // The complete encoded value.
  uint8_t Raw;
  struct {
    // 32 maximum physical registers
    uint8_t Reg : 5;
    // 8 Maximum classes
    uint8_t Class : 3;
  };

  // Two registers are equal when their encoded bytes match.
  bool operator==(const PhysicalRegister& Other) const {
    return Raw == Other.Raw;
  }

  // Builds the encoding from a class and a register index.
  PhysicalRegister(RegClass Class, uint8_t Reg)
    : Reg(Reg)
    , Class(uint8_t(Class)) {}

  // Takes the encoded byte from the immediate stored in Arg.
  PhysicalRegister(OrderedNodeWrapper Arg)
    : Raw(Arg.GetImmediate()) {}

  // Takes the encoded byte from the Reg field of Node.
  PhysicalRegister(Ref Node)
    : Raw(Node->Reg) {}

  // Returns the class bits converted back to RegClass.
  RegClass AsRegClass() const {
    return RegClass {Class};
  }

  // The invalid register: class RegClass::Invalid with register index 0.
  static const PhysicalRegister Invalid() {
    return PhysicalRegister(RegClass::Invalid, 0);
  }

  // True for the value produced by Invalid().
  bool IsInvalid() const {
    // Comparing the whole byte against zero only works while RegClass::Invalid encodes as 0.
    static_assert(uint8_t(RegClass::Invalid) == 0);
    return Raw == 0;
  }
};

// Both views of the union have to fit in exactly one byte.
static_assert(sizeof(PhysicalRegister) == 1);
} // namespace FEXCore::IR

================================================
FILE: FEXCore/Source/Utils/Allocator/64BitAllocator.cpp
================================================
// SPDX-License-Identifier: MIT
#include "Utils/Allocator/FlexBitSet.h"
#include "Utils/Allocator/HostAllocator.h"
#include "Utils/Allocator/IntrusiveArenaAllocator.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace
Alloc::OSAllocator { thread_local FEXCore::Core::InternalThreadState* TLSThread {}; class OSAllocator_64Bit final : public Alloc::HostAllocator { public: OSAllocator_64Bit(); OSAllocator_64Bit(fextl::vector& Regions); virtual ~OSAllocator_64Bit(); void* AllocateSlab(size_t Size) override { return nullptr; } void DeallocateSlab(void* Ptr, size_t Size) override {} void* Mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) override; int Munmap(void* addr, size_t length) override; void LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) override { AllocationMutex.lock(); } void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child) override { if (Child) { AllocationMutex.StealAndDropActiveLocks(); } else { AllocationMutex.unlock(); } } private: // Upper bound is the maximum virtual address space of the host processor uintptr_t UPPER_BOUND = (1ULL << 57); // Lower bound is the starting of the range just past the lower 32bits constexpr static uintptr_t LOWER_BOUND = 0x1'0000'0000ULL; uintptr_t UPPER_BOUND_PAGE = UPPER_BOUND / FEXCore::Utils::FEX_PAGE_SIZE; constexpr static uintptr_t LOWER_BOUND_PAGE = LOWER_BOUND / FEXCore::Utils::FEX_PAGE_SIZE; struct ReservedVMARegion { uintptr_t Base; // Could be number of pages if we want to pack this in to 12 bytes uint64_t RegionSize; }; bool MergeReservedRegionIfPossible(ReservedVMARegion* Region, uintptr_t NextPtr, uint64_t NextSize) { constexpr uint64_t MaxReservedRegionSize = 64ULL * 1024 * 1024 * 1024; // 64GB uintptr_t RegionEnd = Region->Base + Region->RegionSize; uint64_t NewRegionSize = Region->RegionSize + NextSize; if (RegionEnd == NextPtr && NewRegionSize <= MaxReservedRegionSize) { // Append the contiguous region Region->RegionSize = NewRegionSize; return true; } return false; } struct LiveVMARegion { ReservedVMARegion* SlabInfo; uint64_t FreeSpace {}; uint64_t NumManagedPages {}; uint32_t LastPageAllocation {}; bool HadMunmap {}; // Align UsedPages so it pads to the next 
page. // Necessary to take advantage of madvise zero page pooling. using FlexBitElementType = uint64_t; alignas(FEXCore::Utils::FEX_PAGE_SIZE) FEXCore::FlexBitSet UsedPages; // This returns the size of the LiveVMARegion in addition to the flex set that tracks the used data // The LiveVMARegion lives at the start of the VMA region which means on initialization we need to set that // tracked ranged as used immediately static size_t GetFEXManagedVMARegionSize(size_t Size) { // One element per page // 0x10'0000'0000 bytes // 0x100'0000 Pages // 1 bit per page for tracking means 0x20'0000 (Pages / 8) bytes of flex space // Which is 2MB of tracking const uint64_t NumElements = Size >> FEXCore::Utils::FEX_PAGE_SHIFT; return sizeof(LiveVMARegion) + FEXCore::FlexBitSet::SizeInBytes(NumElements); } static void InitializeVMARegionUsed(LiveVMARegion* Region, size_t AdditionalSize) { size_t SizeOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(Region->SlabInfo->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE); size_t SizePlusManagedData = SizeOfLiveRegion + AdditionalSize; Region->FreeSpace = Region->SlabInfo->RegionSize - SizePlusManagedData; size_t NumManagedPages = SizePlusManagedData >> FEXCore::Utils::FEX_PAGE_SHIFT; size_t ManagedSize = NumManagedPages << FEXCore::Utils::FEX_PAGE_SHIFT; // Use madvise to set the full tracking region to zero. // This ensures unused pages are zero, while not having the backing pages consuming memory. ::madvise(Region->UsedPages.Memory + ManagedSize, (Region->SlabInfo->RegionSize >> FEXCore::Utils::FEX_PAGE_SHIFT) - ManagedSize, MADV_DONTNEED); // Use madvise to claim WILLNEED on the beginning pages for initial state tracking. // Improves performance of the following MemClear by not doing a page level fault dance for data necessary to track >170TB of used pages. 
::madvise(Region->UsedPages.Memory, ManagedSize, MADV_WILLNEED); // Set our reserved pages Region->UsedPages.MemSet(NumManagedPages); Region->LastPageAllocation = NumManagedPages; Region->NumManagedPages = NumManagedPages; } }; static_assert(sizeof(LiveVMARegion) == FEXCore::Utils::FEX_PAGE_SIZE, "Needs to be the size of a page"); static_assert(std::is_trivially_copyable::value, "Needs to be trivially copyable"); static_assert(offsetof(LiveVMARegion, UsedPages) == sizeof(LiveVMARegion), "FlexBitSet needs to be at the end"); using ReservedRegionListType = fex_pmr::list; using LiveRegionListType = fex_pmr::list; ReservedRegionListType* ReservedRegions {}; LiveRegionListType* LiveRegions {}; Alloc::ForwardOnlyIntrusiveArenaAllocator* ObjectAlloc {}; FEXCore::ForkableUniqueMutex AllocationMutex; void DetermineVASize(); LiveVMARegion* MakeRegionActive(ReservedRegionListType::iterator ReservedIterator, uint64_t UsedSize) { ReservedVMARegion* ReservedRegion = *ReservedIterator; ReservedRegions->erase(ReservedIterator); // mprotect the new region we've allocated size_t SizeOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(ReservedRegion->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE); size_t SizePlusManagedData = UsedSize + SizeOfLiveRegion; auto Res = mprotect(reinterpret_cast(ReservedRegion->Base), SizePlusManagedData, PROT_READ | PROT_WRITE); LOGMAN_THROW_A_FMT(Res != -1, "Couldn't mprotect region: {} '{}' Likely occurs when running out of memory or Maximum VMAs", errno, strerror(errno)); FEXCore::Allocator::VirtualName("FEXMem_Misc", reinterpret_cast(ReservedRegion->Base), SizePlusManagedData); LiveVMARegion* LiveRange = new (reinterpret_cast(ReservedRegion->Base)) LiveVMARegion(); // Copy over the reserved data LiveRange->SlabInfo = ReservedRegion; // Initialize VMA LiveVMARegion::InitializeVMARegionUsed(LiveRange, UsedSize); // Add to our active tracked ranges auto LiveIter = LiveRegions->emplace_back(LiveRange); return LiveIter; } void 
AllocateMemoryRegions(fextl::vector& Ranges); LiveVMARegion* FindLiveRegionForAddress(uintptr_t Addr, uintptr_t AddrEnd); }; void OSAllocator_64Bit::DetermineVASize() { size_t Bits = FEXCore::Allocator::DetermineVASize(); uintptr_t Size = 1ULL << Bits; UPPER_BOUND = Size; #if ARCHITECTURE_x86_64 // Last page cannot be allocated on x86 UPPER_BOUND -= FEXCore::Utils::FEX_PAGE_SIZE; #endif UPPER_BOUND_PAGE = UPPER_BOUND / FEXCore::Utils::FEX_PAGE_SIZE; } OSAllocator_64Bit::LiveVMARegion* OSAllocator_64Bit::FindLiveRegionForAddress(uintptr_t Addr, uintptr_t AddrEnd) { LiveVMARegion* LiveRegion {}; // Check active slabs to see if we can fit this for (auto it = LiveRegions->begin(); it != LiveRegions->end(); ++it) { uintptr_t RegionBegin = (*it)->SlabInfo->Base; uintptr_t RegionEnd = RegionBegin + (*it)->SlabInfo->RegionSize; if (Addr >= RegionBegin && AddrEnd < RegionEnd) { LiveRegion = *it; // Leave our loop break; } } // Couldn't find an active region that fit // Check reserved regions if (!LiveRegion) { // Didn't have a slab that fit this range // Check our reserved regions to see if we have one that fits for (auto it = ReservedRegions->begin(); it != ReservedRegions->end(); ++it) { ReservedVMARegion* ReservedRegion = *it; uintptr_t RegionEnd = ReservedRegion->Base + ReservedRegion->RegionSize; if (Addr >= ReservedRegion->Base && AddrEnd < RegionEnd) { // Found one, let's make it active LiveRegion = MakeRegionActive(it, 0); break; } } } return LiveRegion; } void* OSAllocator_64Bit::Mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { if (addr != 0 && addr < reinterpret_cast(LOWER_BOUND)) { // If we are asked to allocate something outside of the 64-bit space // Then we need to just hand this to the OS return ::mmap(addr, length, prot, flags, fd, offset); } uint64_t Addr = reinterpret_cast(addr); // Addr must be page aligned if (Addr & ~FEXCore::Utils::FEX_PAGE_MASK) { return reinterpret_cast(-EINVAL); } // If FD is provided then offset must 
also be page aligned if (fd != -1 && offset & ~FEXCore::Utils::FEX_PAGE_MASK) { return reinterpret_cast(-EINVAL); } // 64bit address overflow if (Addr + length < Addr) { return reinterpret_cast(-EOVERFLOW); } bool Fixed = (flags & MAP_FIXED) || (flags & MAP_FIXED_NOREPLACE); length = FEXCore::AlignUp(length, FEXCore::Utils::FEX_PAGE_SIZE); uint64_t AddrEnd = Addr + length; size_t NumberOfPages = length / FEXCore::Utils::FEX_PAGE_SIZE; // This needs a mutex to be thread safe auto lk = FEXCore::GuardSignalDeferringSectionWithFallback(AllocationMutex, TLSThread); uint64_t AllocatedOffset {}; LiveVMARegion* LiveRegion {}; if (Fixed || Addr != 0) { LiveRegion = FindLiveRegionForAddress(Addr, AddrEnd); } again: struct RangeResult final { LiveVMARegion* RegionInsertedInto; void* Ptr; }; auto CheckIfRangeFits = [&AllocatedOffset](LiveVMARegion* Region, uint64_t length, int prot, int flags, int fd, off_t offset, uint64_t StartingPosition = 0) -> RangeResult { uint64_t AllocatedPage {~0ULL}; uint64_t NumberOfPages = length >> FEXCore::Utils::FEX_PAGE_SHIFT; if (Region->FreeSpace >= length) { uint64_t LastAllocation = StartingPosition ? (StartingPosition - Region->SlabInfo->Base) >> FEXCore::Utils::FEX_PAGE_SHIFT : Region->LastPageAllocation; size_t RegionNumberOfPages = Region->SlabInfo->RegionSize >> FEXCore::Utils::FEX_PAGE_SHIFT; if (Region->HadMunmap) { // Backward scan // We need to do a backward scan first to fill any holes // Otherwise we will very quickly run out of VMA regions (65k maximum) auto SearchResult = Region->UsedPages.BackwardScanForRange(LastAllocation, NumberOfPages, Region->NumManagedPages); AllocatedPage = SearchResult.FoundElement; // If we didn't even have a one page free in the backward search, then unclaim HadMunmap. // Switching over to default forward search. 
if (SearchResult.FoundElement == ~0ULL && !SearchResult.FoundHole) { Region->HadMunmap = false; } } // Foward Scan if (AllocatedPage == ~0ULL) { auto SearchResult = Region->UsedPages.ForwardScanForRange(LastAllocation, NumberOfPages, RegionNumberOfPages); AllocatedPage = SearchResult.FoundElement; } if (AllocatedPage != ~0ULL) { AllocatedOffset = Region->SlabInfo->Base + AllocatedPage * FEXCore::Utils::FEX_PAGE_SIZE; // We need to setup protections for this void* MMapResult = ::mmap(reinterpret_cast(AllocatedOffset), length, prot, (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED, fd, offset); if (MMapResult == MAP_FAILED) { return RangeResult {Region, reinterpret_cast(-errno)}; } return RangeResult {Region, MMapResult}; } } return {}; }; if (Fixed) { // Found a region let's allocate to it if (LiveRegion) { // Found a slab that fits this if (flags & MAP_FIXED_NOREPLACE) { auto Fits = CheckIfRangeFits(LiveRegion, length, prot, flags, fd, offset, Addr); if (Fits.RegionInsertedInto && Fits.Ptr == reinterpret_cast(Addr)) { // We fit correctly AllocatedOffset = Addr; } else { // Intersected with something that already existed return reinterpret_cast(-EEXIST); } } else { // We need to mmap the file to this location void* MMapResult = ::mmap(reinterpret_cast(Addr), length, prot, (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED, fd, offset); if (MMapResult == MAP_FAILED) { return reinterpret_cast(-errno); } AllocatedOffset = Addr; } // Fall through to live region tracking } } else { // Check our active slabs to see if we can fit the allocation // Slightly different than fixed since it doesn't need exact placement if (LiveRegion && Addr != 0) { // We found a LiveRegion that could hold this address. 
Let's try to place it // Check if this area is free auto Fits = CheckIfRangeFits(LiveRegion, length, prot, flags, fd, offset, Addr); if (Fits.RegionInsertedInto && Fits.Ptr == reinterpret_cast(Addr)) { // We fit correctly AllocatedOffset = Addr; } else { // Couldn't fit // We can continue past this point still LiveRegion = nullptr; } } if (!LiveRegion) { for (auto it = LiveRegions->begin(); it != LiveRegions->end(); ++it) { auto Fits = CheckIfRangeFits(*it, length, prot, flags, fd, offset); if (Fits.RegionInsertedInto && Fits.Ptr == reinterpret_cast(AllocatedOffset)) { // We fit correctly LiveRegion = Fits.RegionInsertedInto; break; } // Couldn't fit but mmap gave us an error if (!Fits.RegionInsertedInto && Fits.Ptr) { return Fits.Ptr; } // nullptr on both means no error and couldn't fit } } if (!LiveRegion) { // Couldn't find a fit in the live regions // Allocate a new reserved region size_t lengthOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(length), FEXCore::Utils::FEX_PAGE_SIZE); size_t lengthPlusManagedData = length + lengthOfLiveRegion; for (auto it = ReservedRegions->begin(); it != ReservedRegions->end(); ++it) { if ((*it)->RegionSize >= lengthPlusManagedData) { MakeRegionActive(it, 0); goto again; } } } } if (LiveRegion) { // Mark the pages as used uintptr_t RegionBegin = LiveRegion->SlabInfo->Base; uintptr_t MappedBegin = (AllocatedOffset - RegionBegin) >> FEXCore::Utils::FEX_PAGE_SHIFT; size_t PagesSet {}; for (size_t i = 0; i < NumberOfPages; ++i) { PagesSet += LiveRegion->UsedPages.TestAndSet(MappedBegin + i) == false; } // Change our last allocation region LiveRegion->LastPageAllocation = MappedBegin + NumberOfPages; LiveRegion->FreeSpace -= PagesSet * FEXCore::Utils::FEX_PAGE_SIZE; LOGMAN_THROW_A_FMT(LiveRegion->FreeSpace <= LiveRegion->SlabInfo->RegionSize, "Corrupt LiveRegion free space! 0x{:x} > 0x{:x}. 
After allocating 0x{:x} (0x{:x} overlapped)", LiveRegion->FreeSpace, LiveRegion->SlabInfo->RegionSize, length, PagesSet); } if (!AllocatedOffset) { AllocatedOffset = -ENOMEM; } return reinterpret_cast(AllocatedOffset); } int OSAllocator_64Bit::Munmap(void* addr, size_t length) { if (addr < reinterpret_cast(LOWER_BOUND)) { // If we are asked to allocate something outside of the 64-bit space // Then we need to just hand this to the OS return ::munmap(addr, length); } uint64_t Addr = reinterpret_cast(addr); if (Addr & ~FEXCore::Utils::FEX_PAGE_MASK) { return -EINVAL; } if (length & ~FEXCore::Utils::FEX_PAGE_MASK) { return -EINVAL; } if (Addr + length < Addr) { return -EOVERFLOW; } // This needs a mutex to be thread safe auto lk = FEXCore::GuardSignalDeferringSectionWithFallback(AllocationMutex, TLSThread); length = FEXCore::AlignUp(length, FEXCore::Utils::FEX_PAGE_SIZE); uintptr_t PtrBegin = reinterpret_cast(addr); uintptr_t PtrEnd = PtrBegin + length; // Walk all of the live ranges and find this slab then delete it for (auto it = LiveRegions->begin(); it != LiveRegions->end(); ++it) { uintptr_t RegionBegin = (*it)->SlabInfo->Base; uintptr_t RegionEnd = RegionBegin + (*it)->SlabInfo->RegionSize; if (RegionBegin <= PtrBegin && RegionEnd > PtrEnd) { // Live region fully encompasses slab range uint64_t FreedPages {}; uint32_t SlabPageBegin = (PtrBegin - RegionBegin) >> FEXCore::Utils::FEX_PAGE_SHIFT; uint64_t PagesToFree = length >> FEXCore::Utils::FEX_PAGE_SHIFT; for (size_t i = 0; i < PagesToFree; ++i) { FreedPages += (*it)->UsedPages.TestAndClear(SlabPageBegin + i) ? 
1 : 0; } if (FreedPages != 0) { // If we were contiuous freeing then make sure to give back the physical address space // If the region was locked then madvise won't remove the physical backing // This woul be a bug in the frontend application // So be careful with mlock/munlock ::madvise(addr, length, MADV_DONTNEED); ::mmap(addr, length, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); } (*it)->FreeSpace += FreedPages * FEXCore::Utils::FEX_PAGE_SIZE; // Set the last allocated page to the minimum of last page allocation or this slab // This will let us more quickly fill holes (*it)->LastPageAllocation = std::min((*it)->LastPageAllocation, SlabPageBegin); (*it)->HadMunmap = true; // XXX: Move region back to reserved list return 0; } } // If it didn't match at all then no error return 0; } void OSAllocator_64Bit::AllocateMemoryRegions(fextl::vector& Ranges) { // Need to allocate the ObjectAlloc up front. Find a region that is larger than our minimum size first. const size_t ObjectAllocSize = 64 * 1024 * 1024; for (auto& it : Ranges) { if (ObjectAllocSize > it.Size) { continue; } // Allocate up to 64 MiB the first allocation for an intrusive allocator mprotect(it.Ptr, ObjectAllocSize, PROT_READ | PROT_WRITE); // This enables the kernel to use transparent large pages in the allocator which can reduce memory pressure ::madvise(it.Ptr, ObjectAllocSize, MADV_HUGEPAGE); FEXCore::Allocator::VirtualName("FEXMem_Misc", reinterpret_cast(it.Ptr), ObjectAllocSize); ObjectAlloc = new (it.Ptr) Alloc::ForwardOnlyIntrusiveArenaAllocator(it.Ptr, ObjectAllocSize); ReservedRegions = ObjectAlloc->new_construct(ReservedRegions, ObjectAlloc); LiveRegions = ObjectAlloc->new_construct(LiveRegions, ObjectAlloc); if (it.Size >= ObjectAllocSize) { // Modify region size it.Size -= ObjectAllocSize; (uint8_t*&)it.Ptr += ObjectAllocSize; } break; } if (!ObjectAlloc) { ERROR_AND_DIE_FMT("Couldn't allocate object allocator!"); } for (auto [Ptr, AllocationSize] : Ranges) { // Skip using 
any regions that are <= two pages. FEX's VMA allocator requires two pages // for tracking data. So three pages are minimum for a single page VMA allocation. if (AllocationSize <= (FEXCore::Utils::FEX_PAGE_SIZE * 2)) { continue; } ReservedVMARegion* Region = ObjectAlloc->new_construct(); Region->Base = reinterpret_cast(Ptr); Region->RegionSize = AllocationSize; ReservedRegions->emplace_back(Region); } } OSAllocator_64Bit::OSAllocator_64Bit() { DetermineVASize(); auto Ranges = FEXCore::Allocator::StealMemoryRegion(LOWER_BOUND, UPPER_BOUND); AllocateMemoryRegions(Ranges); } OSAllocator_64Bit::OSAllocator_64Bit(fextl::vector& Regions) { AllocateMemoryRegions(Regions); } OSAllocator_64Bit::~OSAllocator_64Bit() { // This needs a mutex to be thread safe auto lk = FEXCore::GuardSignalDeferringSectionWithFallback(AllocationMutex, TLSThread); // Walk the pages and deallocate // First walk the live regions for (auto it = LiveRegions->begin(); it != LiveRegions->end(); ++it) { ::munmap(reinterpret_cast((*it)->SlabInfo->Base), (*it)->SlabInfo->RegionSize); } // Now walk the reserved regions for (auto it = ReservedRegions->begin(); it != ReservedRegions->end(); ++it) { ::munmap(reinterpret_cast((*it)->Base), (*it)->RegionSize); } } fextl::unique_ptr Create64BitAllocator() { return fextl::make_unique(); } template struct alloc_delete : public std::default_delete { void operator()(T* ptr) const { if (ptr) { const auto size = sizeof(T); const auto MinPage = FEXCore::AlignUp(size, FEXCore::Utils::FEX_PAGE_SIZE); std::destroy_at(ptr); ::munmap(ptr, MinPage); } } template requires (std::is_base_of_v) operator fextl::default_delete() { return fextl::default_delete(); } }; template requires (!std::is_array_v) fextl::unique_ptr make_alloc_unique(FEXCore::Allocator::MemoryRegion& Base, Args&&... 
args) { const auto size = sizeof(T); const auto MinPage = FEXCore::AlignUp(size, FEXCore::Utils::FEX_PAGE_SIZE); if (Base.Size < size || MinPage != FEXCore::Utils::FEX_PAGE_SIZE) { ERROR_AND_DIE_FMT("Couldn't fit allocator in to page!"); } auto ptr = ::mmap(Base.Ptr, MinPage, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); if (ptr == MAP_FAILED) { ERROR_AND_DIE_FMT("Couldn't allocate memory region"); } FEXCore::Allocator::VirtualName("FEXMem_Misc", reinterpret_cast(ptr), MinPage); // Remove the page from the base region. // Could be zero after this. Base.Size -= MinPage; Base.Ptr = reinterpret_cast(reinterpret_cast(Base.Ptr) + MinPage); auto Result = ::new (ptr) T(std::forward(args)...); return fextl::unique_ptr>(Result); } fextl::unique_ptr Create64BitAllocatorWithRegions(fextl::vector& Regions) { // This is a bit tricky as we can't allocate memory safely except from the Regions provided. Otherwise we might overwrite memory pages we // don't own. Scan the memory regions and find the smallest one. 
FEXCore::Allocator::MemoryRegion& Smallest = Regions[0]; for (auto& it : Regions) { if (it.Size <= Smallest.Size) { Smallest = it; } } return make_alloc_unique(Smallest, Regions); } } // namespace Alloc::OSAllocator namespace FEXCore::Allocator { void RegisterTLSData(FEXCore::Core::InternalThreadState* Thread) { Alloc::OSAllocator::TLSThread = Thread; } void UninstallTLSData(FEXCore::Core::InternalThreadState* Thread) { Alloc::OSAllocator::TLSThread = nullptr; } } // namespace FEXCore::Allocator ================================================ FILE: FEXCore/Source/Utils/Allocator/FlexBitSet.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include namespace FEXCore { template struct FlexBitSet final { using ElementType = T; constexpr static size_t MinimumSize = sizeof(ElementType); constexpr static size_t MinimumSizeBits = sizeof(ElementType) * 8; T Memory[]; bool Get(size_t Element) const { return (Memory[Element / MinimumSizeBits] & (1ULL << (Element % MinimumSizeBits))) != 0; } bool TestAndClear(size_t Element) { bool Value = Get(Element); Memory[Element / MinimumSizeBits] &= ~(1ULL << (Element % MinimumSizeBits)); return Value; } bool TestAndSet(size_t Element) { bool Value = Get(Element); Memory[Element / MinimumSizeBits] |= (1ULL << (Element % MinimumSizeBits)); return Value; } void Set(size_t Element) { Memory[Element / MinimumSizeBits] |= (1ULL << (Element % MinimumSizeBits)); } void Clear(size_t Element) { Memory[Element / MinimumSizeBits] &= ~(1ULL << (Element % MinimumSizeBits)); } void MemClear(size_t Elements) { memset(Memory, 0, FEXCore::AlignUp(Elements / MinimumSizeBits, MinimumSizeBits)); } void MemSet(size_t Elements) { memset(Memory, 0xFF, FEXCore::AlignUp(Elements / MinimumSizeBits, MinimumSizeBits)); } // Range scanning results struct BitsetScanResults { // Which element was found. ~0ULL if not found. 
size_t FoundElement; // During the scan, found a hole in the allocations that didn't fit. bool FoundHole; }; // TODO: Make {Forward,Backward}ScanForRange faster // Currently these functions test a single bit at a time, which is fairly costly. // The compiler emits a full element load per iteration, wasting a bunch of time on loads. // If we change these functions to have a pre-amble and post-amble to align the primary loop to the element size then this can go // significantly faster. // // Once the element scanning is aligned to the element size, we can then use native count leading zero(CLZ) and count trailing zero(CTZ) // instructions on a full element to scan uint64_t elements per loop iteration. // Implementation details: // Template argument WantUnset // Used to determine if the desired range is for set or unset ranges. // Typically `WantUnset` should be true. Used for finding a unset range inside of a range will set elements. // // @param BeginningElement - The first element in the set to start scanning from. // @param ElementCount - How many elements to find a range for fitting. // @param MinimumElement - Minimum element in the set to search to // // @return The scan results template BitsetScanResults BackwardScanForRange(size_t BeginningElement, size_t ElementCount, size_t MinimumElement) { bool FoundHole {}; // Final element to iterate to. 
const size_t FinalElement = MinimumElement + ElementCount - 1; for (size_t CurrentPage = BeginningElement; CurrentPage >= FinalElement;) { size_t Remaining = ElementCount; LOGMAN_THROW_A_FMT(CurrentPage <= BeginningElement && CurrentPage >= FinalElement, "BackwardScanForRange: Scanning less than " "available range"); while (Remaining) { if (this->Get(CurrentPage - Remaining + 1) == WantUnset) { // Has an intersecting range break; } --Remaining; } if (Remaining) { // If we found at least one Element hole then track that if (Remaining != ElementCount) { FoundHole = true; } // Didn't find a slab range CurrentPage -= Remaining; } else { // We have a slab range return BitsetScanResults {CurrentPage - ElementCount + 1, FoundHole}; } } return BitsetScanResults {~0ULL, FoundHole}; } // @param BeginningElement - The first element in the set to start scanning from. // @param ElementCount - How many elements to find a range for fitting. // @param ElementsInSet - How many elements are in the full set. // // @return The scan results template BitsetScanResults ForwardScanForRange(size_t BeginningElement, size_t ElementCount, size_t ElementsInSet) { bool FoundHole {}; // Final element to iterate to. 
const size_t FinalElement = ElementsInSet - ElementCount + 1; for (size_t CurrentElement = BeginningElement; CurrentElement <= FinalElement;) { // If we have enough free space, check if we have enough free pages that are contiguous size_t Remaining = ElementCount; LOGMAN_THROW_A_FMT(CurrentElement >= BeginningElement && CurrentElement <= FinalElement, "ForwardScanForRange: Scanning less than " "available range"); while (Remaining) { if (this->Get(CurrentElement + Remaining - 1) == WantUnset) { // Has an intersecting range break; } --Remaining; } if (Remaining) { // If we found at least one Element hole then track that if (Remaining != ElementCount) { FoundHole = true; } // Didn't find a slab range CurrentElement += Remaining; } else { // We have a slab range return BitsetScanResults {CurrentElement, FoundHole}; } } return BitsetScanResults {~0ULL, FoundHole}; } // This very explicitly doesn't let you take an address // Is only a getter bool operator[](size_t Element) const { return Get(Element); } // Returns the number of bits required to hold the number of elements. // Just rounds up to the MinimumSizeInBits. constexpr static size_t SizeInBits(uint64_t Elements) { return FEXCore::AlignUp(Elements, MinimumSizeBits); } // Returns the number of bytes required to hold the number of elements. 
constexpr static size_t SizeInBytes(uint64_t Elements) { return SizeInBits(Elements) / 8; } }; static_assert(sizeof(FlexBitSet) == 0, "This needs to be a flex member"); static_assert(std::is_trivially_copyable_v>, "Needs to be trivially copyable"); } // namespace FEXCore ================================================ FILE: FEXCore/Source/Utils/Allocator/HostAllocator.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include namespace FEXCore::Allocator { struct MemoryRegion; } namespace FEXCore::Core { struct InternalThreadState; } namespace Alloc { // HostAllocator is just a page pased slab allocator // Similar to mmap and munmap only mapping at the page level class HostAllocator { public: HostAllocator() = default; virtual ~HostAllocator() = default; virtual void* AllocateSlab(size_t Size) = 0; virtual void DeallocateSlab(void* Ptr, size_t Size) = 0; virtual void* Mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { return nullptr; } virtual int Munmap(void* addr, size_t length) { return -1; } virtual void LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) {} virtual void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child) {} }; class GlobalAllocator { public: HostAllocator* Alloc {}; GlobalAllocator(HostAllocator* _Alloc) : Alloc {_Alloc} {} virtual ~GlobalAllocator() = default; virtual void* malloc(size_t Size) = 0; virtual void* calloc(size_t num, size_t size) = 0; virtual void* realloc(void* ptr, size_t size) = 0; virtual void* memalign(size_t alignment, size_t size) = 0; virtual void free(void* ptr) = 0; }; } // namespace Alloc namespace Alloc::OSAllocator { fextl::unique_ptr Create64BitAllocator(); fextl::unique_ptr Create64BitAllocatorWithRegions(fextl::vector& Regions); static inline void ReleaseAllocatorWorkaround(fextl::unique_ptr Allocator) { // XXX: This is currently a leak. 
// We can't work around this yet until static initializers that allocate memory are completely removed from our codebase // The allocator is also intrusively allocated, so the unique_ptr tries to double free the HostAllocator object. // Luckily we only remove this on process shutdown, so the kernel will do the cleanup for us Allocator.release(); } } // namespace Alloc::OSAllocator ================================================ FILE: FEXCore/Source/Utils/Allocator/IntrusiveArenaAllocator.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include "FlexBitSet.h" #include "HostAllocator.h" #include #include #include #include #include namespace fex_pmr = std::pmr; #include #include namespace Alloc { class ForwardOnlyIntrusiveArenaAllocator final : public fex_pmr::memory_resource { public: ForwardOnlyIntrusiveArenaAllocator(void* Ptr, size_t _Size) : Begin {reinterpret_cast(Ptr)} , Size {_Size} { LastAllocation = sizeof(ForwardOnlyIntrusiveArenaAllocator); } ~ForwardOnlyIntrusiveArenaAllocator() = default; template U* new_construct(Args&&... args) { void* Ptr = do_allocate(sizeof(U), alignof(U)); return new (Ptr) U(args...); } template U* new_construct(U* Class, Args&&... 
args) { void* Ptr = do_allocate(sizeof(U), alignof(U)); return new (Ptr) U(args...); } size_t AmountAllocated() const { return LastAllocation; } private: void* do_allocate(std::size_t bytes, std::size_t alignment) override { size_t PreviousAligned = FEXCore::AlignUp(LastAllocation, alignment); size_t NewOffset = PreviousAligned + bytes; if (NewOffset > Size) { return nullptr; } LastAllocation = NewOffset; return reinterpret_cast(Begin + PreviousAligned); } void do_deallocate(void*, std::size_t, std::size_t) override { // Do nothing } bool do_is_equal(const fex_pmr::memory_resource& other) const noexcept override { // Only if the allocator pointers are the same are they equal if (this == &other) { return true; } // We don't share state with another allocator so we can't share anything return false; } uintptr_t Begin; size_t Size; size_t LastAllocation {}; }; class IntrusiveArenaAllocator final : public fex_pmr::memory_resource { public: IntrusiveArenaAllocator(void* Ptr, size_t _Size) : Begin {reinterpret_cast(Ptr)} , Size {_Size} { uint64_t NumberOfPages = _Size / FEXCore::Utils::FEX_PAGE_SIZE; uint64_t UsedBits = FEXCore::AlignUp(sizeof(IntrusiveArenaAllocator) + Size / FEXCore::Utils::FEX_PAGE_SIZE / 8, FEXCore::Utils::FEX_PAGE_SIZE); for (size_t i = 0; i < UsedBits; ++i) { UsedPages.Set(i); } FreePages = NumberOfPages - UsedBits; } template U* new_construct(Args&&... args) { void* Ptr = do_allocate(sizeof(U), alignof(U)); return new (Ptr) U(args...); } template U* new_construct(U* Class, Args&&... 
args) { void* Ptr = do_allocate(sizeof(U), alignof(U)); return new (Ptr) U(args...); } uintptr_t GetSlabBase() const { return Begin; } uint64_t GetSlabSize() const { return Size; } uint64_t GetFreePages() const { return FreePages; } private: void* do_allocate(std::size_t bytes, std::size_t alignment) override { std::scoped_lock lk {AllocationMutex}; size_t NumberPages = FEXCore::AlignUp(bytes, FEXCore::Utils::FEX_PAGE_SIZE) / FEXCore::Utils::FEX_PAGE_SIZE; uintptr_t AllocatedOffset {}; try_again: for (uintptr_t CurrentPage = LastAllocatedPageOffset; CurrentPage <= (Size - NumberPages);) { size_t Remaining = NumberPages; while (Remaining) { if (UsedPages[CurrentPage + Remaining - 1]) { // Has an intersecting range break; } --Remaining; } if (Remaining) { // Didn't find an allocation range CurrentPage += Remaining; } else { // We have a range to allocate AllocatedOffset = CurrentPage; break; } } if (!AllocatedOffset && LastAllocatedPageOffset != 0) { // Try again but starting from the beginning LastAllocatedPageOffset = 0; // Using goto so we don't have recursive mutex shenanigans goto try_again; } // Allocated offset must be valid or zero at this point if (AllocatedOffset) { // Map the range as no longer available for (size_t i = 0; i < NumberPages; ++i) { UsedPages.Set(AllocatedOffset + i); } LastAllocatedPageOffset = AllocatedOffset + NumberPages; // Now convert this base page to a pointer and return it return reinterpret_cast(Begin + AllocatedOffset * FEXCore::Utils::FEX_PAGE_SIZE); } return nullptr; } void do_deallocate(void* p, std::size_t bytes, std::size_t alignment) override { std::scoped_lock lk {AllocationMutex}; uintptr_t PageOffset = (reinterpret_cast(p) - Begin) / FEXCore::Utils::FEX_PAGE_SIZE; size_t NumPages = FEXCore::AlignUp(bytes, FEXCore::Utils::FEX_PAGE_SIZE) / FEXCore::Utils::FEX_PAGE_SIZE; // Walk the allocation list and deallocate uint64_t FreedPages {}; for (size_t i = 0; i < NumPages; ++i) { FreedPages += UsedPages.TestAndClear(PageOffset + 
i) ? 1 : 0; } FreePages += FreedPages; } bool do_is_equal(const fex_pmr::memory_resource& other) const noexcept override { // Only if the allocator pointers are the same are they equal if (this == &other) { return true; } // We don't share state with another allocator so we can't share anything return false; } uintptr_t Begin; size_t Size; uint64_t FreePages {}; size_t LastAllocatedPageOffset {}; std::mutex AllocationMutex {}; // For up to 64GB regions this will require up to 2MB tracking // Needs to be the last element FEXCore::FlexBitSet UsedPages; }; } // namespace Alloc ================================================ FILE: FEXCore/Source/Utils/Allocator.cpp ================================================ // SPDX-License-Identifier: MIT #include "Utils/Allocator/HostAllocator.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _WIN32 #include #include #endif namespace fextl::pmr { static fextl::pmr::default_resource FEXDefaultResource; std::pmr::memory_resource* get_default_resource() { return &FEXDefaultResource; } } // namespace fextl::pmr #ifndef _WIN32 namespace FEXCore::Allocator { MMAP_Hook mmap {::mmap}; MUNMAP_Hook munmap {::munmap}; uint64_t HostVASize {}; using GLIBC_MALLOC_Hook = void* (*)(size_t, const void* caller); using GLIBC_REALLOC_Hook = void* (*)(void*, size_t, const void* caller); using GLIBC_FREE_Hook = void (*)(void*, const void* caller); fextl::unique_ptr Alloc64 {}; void* FEX_mmap(void* addr, size_t length, int prot, int flags, int fd, off_t offset) { void* Result = Alloc64->Mmap(addr, length, prot, flags, fd, offset); if (Result >= (void*)-4096) { errno = -(uint64_t)Result; return (void*)-1; } if (flags & MAP_ANONYMOUS) { VirtualName("FEXMem", Result, length); } return Result; } void VirtualName(const char* Name, void* Ptr, size_t Size) { static bool Supports {true}; if (Supports) { auto Result = 
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, Ptr, Size, Name); if (Result == -1) { // Disable any additional attempts. Supports = false; } } } int FEX_munmap(void* addr, size_t length) { int Result = Alloc64->Munmap(addr, length); if (Result != 0) { errno = -Result; return -1; } return Result; } #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" static void AssignHookOverrides(size_t PageSize) { SetupAllocatorHooks(FEX_mmap, FEX_munmap); FEXCore::Allocator::mmap = FEX_mmap; FEXCore::Allocator::munmap = FEX_munmap; InitializeAllocator(PageSize); } void SetupHooks(size_t PageSize) { Alloc64 = Alloc::OSAllocator::Create64BitAllocator(); AssignHookOverrides(PageSize); } void ClearHooks() { SetupAllocatorHooks(::mmap, ::munmap); FEXCore::Allocator::mmap = ::mmap; FEXCore::Allocator::munmap = ::munmap; Alloc::OSAllocator::ReleaseAllocatorWorkaround(std::move(Alloc64)); } #pragma GCC diagnostic pop FEX_DEFAULT_VISIBILITY size_t DetermineVASize() { if (HostVASize) { return HostVASize; } static constexpr std::array TLBSizes = { 57, 52, 48, 47, 42, 39, 36, }; for (auto Bits : TLBSizes) { uintptr_t Size = 1ULL << Bits; // Just try allocating // We can't actually determine VA size on ARM safely auto Find = [](uintptr_t Size) -> bool { for (int i = 0; i < 64; ++i) { // Try grabbing a some of the top pages of the range // x86 allocates some high pages in the top end void* Ptr = ::mmap(reinterpret_cast(Size - FEXCore::Utils::FEX_PAGE_SIZE * i), FEXCore::Utils::FEX_PAGE_SIZE, PROT_NONE, MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (Ptr != (void*)~0ULL) { ::munmap(Ptr, FEXCore::Utils::FEX_PAGE_SIZE); if (Ptr == (void*)(Size - FEXCore::Utils::FEX_PAGE_SIZE * i)) { return true; } } } return false; }; if (Find(Size)) { HostVASize = Bits; return Bits; } } LOGMAN_MSG_A_FMT("Couldn't determine host VA size"); FEX_UNREACHABLE; } #define STEAL_LOG(...) 
// fprintf(stderr, __VA_ARGS__) fextl::vector CollectMemoryGaps(uintptr_t Begin, uintptr_t End, int MapsFD) { fextl::vector Regions; uintptr_t RegionEnd = 0; char Buffer[2048]; const char* Cursor = Buffer; ssize_t Remaining = 0; bool EndOfFileReached = false; while (true) { const auto line_begin = Cursor; auto line_end = std::find(line_begin, Cursor + Remaining, '\n'); // Check if the buffered data covers the entire line. // If not, try buffering more data. if (line_end == Cursor + Remaining) { if (EndOfFileReached) { // No more data to buffer. Add remaining memory and return. const auto MapBegin = std::max(RegionEnd, Begin); STEAL_LOG("[%d] EndOfFile; MapBegin: %016lX MapEnd: %016lX\n", __LINE__, MapBegin, End); if (End > MapBegin) { Regions.push_back({(void*)MapBegin, End - MapBegin}); } return Regions; } // Move pending content back to the beginning, then buffer more data. std::copy(Cursor, Cursor + Remaining, std::begin(Buffer)); auto PendingBytes = Remaining; do { Remaining = read(MapsFD, Buffer + PendingBytes, sizeof(Buffer) - PendingBytes); } while (Remaining == -1 && errno == EAGAIN); if (Remaining < sizeof(Buffer) - PendingBytes) { EndOfFileReached = true; } Remaining += PendingBytes; Cursor = Buffer; continue; } // Parse mapped region in the format "fffff7cc3000-fffff7cc4000 r--p ..." 
{ uintptr_t RegionBegin {}; auto result = std::from_chars(Cursor, line_end, RegionBegin, 16); LogMan::Throw::AFmt(result.ec == std::errc {} && *result.ptr == '-', "Unexpected line format"); Cursor = result.ptr + 1; // Add gap between the previous region and the current one const auto MapBegin = std::max(RegionEnd, Begin); const auto MapEnd = std::min(RegionBegin, End); if (MapEnd > MapBegin) { Regions.push_back({(void*)MapBegin, MapEnd - MapBegin}); } result = std::from_chars(Cursor, line_end, RegionEnd, 16); LogMan::Throw::AFmt(result.ec == std::errc {} && *result.ptr == ' ', "Unexpected line format"); Cursor = result.ptr + 1; STEAL_LOG("[%d] parsed line: RegionBegin=%016lX RegionEnd=%016lX\n", __LINE__, RegionBegin, RegionEnd); if (RegionEnd >= End) { // Early return if we are completely beyond the allocation space. return Regions; } } Remaining -= line_end + 1 - line_begin; Cursor = line_end + 1; } FEX_UNREACHABLE; } fextl::vector StealMemoryRegion(uintptr_t Begin, uintptr_t End) { const uintptr_t StackLocation_u64 = reinterpret_cast(alloca(0)); const int MapsFD = open("/proc/self/maps", O_RDONLY); LogMan::Throw::AFmt(MapsFD != -1, "Failed to open /proc/self/maps"); auto Regions = CollectMemoryGaps(Begin, End, MapsFD); close(MapsFD); // If the memory bounds include the stack, blocking all memory regions will // limit the stack size to the current value. To allow some stack growth, // we don't block the memory gap directly below the stack memory but // instead map it as readable+writable. { auto StackRegionIt = std::find_if(Regions.begin(), Regions.end(), [StackLocation_u64](auto& Region) { return reinterpret_cast(Region.Ptr) + Region.Size > StackLocation_u64; }); // If no gap crossing the stack pointer was found but the SP is within // the given bounds, the stack mapping is right after the last gap. 
bool IsStackMapping = StackRegionIt != Regions.end() || StackLocation_u64 <= End; if (IsStackMapping && StackRegionIt != Regions.begin() && reinterpret_cast(std::prev(StackRegionIt)->Ptr) + std::prev(StackRegionIt)->Size <= End) { // Allocate the region under the stack as READ | WRITE so the stack can still grow --StackRegionIt; auto Alloc = ::mmap(StackRegionIt->Ptr, StackRegionIt->Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0); LogMan::Throw::AFmt(Alloc != MAP_FAILED, "StealMemoryRegion:Stack: mmap({}, {:x}) failed: {}", fmt::ptr(StackRegionIt->Ptr), StackRegionIt->Size, errno); LogMan::Throw::AFmt(Alloc == StackRegionIt->Ptr, "mmap returned {} instead of {}", Alloc, fmt::ptr(StackRegionIt->Ptr)); Regions.erase(StackRegionIt); } } // Block remaining memory gaps for (auto RegionIt = Regions.begin(); RegionIt != Regions.end(); ++RegionIt) { auto Alloc = ::mmap(RegionIt->Ptr, RegionIt->Size, PROT_NONE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED_NOREPLACE, -1, 0); LogMan::Throw::AFmt(Alloc != MAP_FAILED, "StealMemoryRegion: mmap({}, {:x}) failed: {}", fmt::ptr(RegionIt->Ptr), RegionIt->Size, errno); LogMan::Throw::AFmt(Alloc == RegionIt->Ptr, "mmap returned {} instead of {}", Alloc, fmt::ptr(RegionIt->Ptr)); } return Regions; } fextl::vector Setup48BitAllocatorIfExists(size_t PageSize) { size_t Bits = FEXCore::Allocator::DetermineVASize(); if (Bits < 48) { return {}; } uintptr_t Begin48BitVA = 0x0'8000'0000'0000ULL; uintptr_t End48BitVA = 0x1'0000'0000'0000ULL; auto Regions = StealMemoryRegion(Begin48BitVA, End48BitVA); Alloc64 = Alloc::OSAllocator::Create64BitAllocatorWithRegions(Regions); AssignHookOverrides(PageSize); return Regions; } void ReclaimMemoryRegion(const fextl::vector& Regions) { for (const auto& Region : Regions) { ::munmap(Region.Ptr, Region.Size); } } void LockBeforeFork(FEXCore::Core::InternalThreadState* Thread) { if (Alloc64) { Alloc64->LockBeforeFork(Thread); } } void 
UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child) { if (Alloc64) { Alloc64->UnlockAfterFork(Thread, Child); } } } // namespace FEXCore::Allocator #endif ================================================ FILE: FEXCore/Source/Utils/Allocator.h ================================================ // SPDX-License-Identifier: MIT #pragma once namespace FEXCore::Core { struct InternalThreadState; } namespace FEXCore::Allocator { void LockBeforeFork(FEXCore::Core::InternalThreadState* Thread); void UnlockAfterFork(FEXCore::Core::InternalThreadState* Thread, bool Child); } // namespace FEXCore::Allocator ================================================ FILE: FEXCore/Source/Utils/AllocatorHooks.cpp ================================================ // SPDX-License-Identifier: MIT #ifdef ENABLE_FEX_ALLOCATOR #include #ifndef _WIN32 #include #include #include #else #define NTDDI_VERSION 0x0A000005 #include #endif #endif #include #include #include #include #include #include namespace FEXCore::Allocator { using mmap_hook_type = void* (*)(void* addr, size_t length, int prot, int flags, int fd, off_t offset); using munmap_hook_type = int (*)(void* addr, size_t length); #ifdef ENABLE_FEX_ALLOCATOR typedef void* (*rp_mmap_hook_type)(size_t size, size_t alignment, size_t* offset, size_t* mapped_size); typedef void (*rp_munmap_hook_type)(void* address, size_t offset, size_t mapped_size); extern "C" rp_mmap_hook_type rp_mmap_hook; extern "C" rp_munmap_hook_type rp_munmap_hook; #ifndef _WIN32 mmap_hook_type fex_mmap_hook = ::mmap; munmap_hook_type fex_munmap_hook = ::munmap; #endif // Assume a 64KB page size until told otherwise. static rpmalloc_config_t global_config { .page_size = 64 * 1024, // THP causes crashes for some reason. 
.enable_huge_pages = 0, .disable_decommit = 0, .page_name = "FEXAllocator", .huge_page_name = "FEXAllocator", .unmap_on_finalize = 0, }; void* malloc(size_t size) { return ::rpmalloc(size); } void* calloc(size_t n, size_t size) { return ::rpcalloc(n, size); } void* memalign(size_t align, size_t s) { return ::rpmemalign(align, s); } void* valloc(size_t size) { return ::rpaligned_alloc(global_config.page_size, size); } int posix_memalign(void** r, size_t a, size_t s) { void* ptr; auto res = ::rpposix_memalign(&ptr, a, s); *r = ptr; return res; } void* realloc(void* ptr, size_t size) { return ::rprealloc(ptr, size); } void free(void* ptr) { return ::rpfree(ptr); } size_t malloc_usable_size(void* ptr) { return ::rpmalloc_usable_size(ptr); } void* aligned_alloc(size_t a, size_t s) { return ::rpaligned_alloc(a, s); } void aligned_free(void* ptr) { return ::rpfree(ptr); } void InitializeThread() { rpmalloc_thread_initialize(); } #ifndef _WIN32 [[nodiscard]] constexpr uint64_t AlignUp(uint64_t value, uint64_t size) { return value + (size - value % size) % size; } static void* FEX_rp_mmap(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) { #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) // If the alignment is less than the operating page size then alignment is guaranteed. Just remove it. if (alignment < global_config.page_size) { alignment = 0; } size_t map_size = AlignUp(size + alignment, global_config.page_size); auto ptr = fex_mmap_hook(0, map_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ptr == MAP_FAILED) { ptr = nullptr; } else { #ifndef PR_SET_VMA #define PR_SET_VMA 0x53564d41 #endif #ifndef PR_SET_VMA_ANON_NAME #define PR_SET_VMA_ANON_NAME 0 #endif prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, map_size, global_config.page_name); // Disable HUGEPAGE on allocation from rpmalloc. 
madvise(ptr, map_size, MADV_NOHUGEPAGE); } if (ptr == nullptr) { fprintf(stderr, "Failed to map VMA region."); return nullptr; } if (alignment) { size_t padding = ((uintptr_t)ptr & (uintptr_t)(alignment - 1)); if (padding) { padding = alignment - padding; } ptr = pointer_offset(ptr, padding); *offset = padding; } *mapped_size = map_size; return ptr; } static void FEX_rp_memory_commit(void* address, size_t size) { // NOP-implementation. } static void FEX_rp_memory_decommit(void* address, size_t size) { if (global_config.disable_decommit) { return; } if (madvise(address, size, MADV_DONTNEED)) { fprintf(stderr, "Failed to decommit VMA region."); } } static void FEX_rp_memory_unmap(void* address, size_t offset, size_t mapped_size) { address = pointer_offset(address, -(int32_t)offset); int Result = fex_munmap_hook(address, mapped_size); if (Result == -1) { fprintf(stderr, "Failed to unmap VMA region."); } #undef pointer_offset } void SetupAllocatorHooks(mmap_hook_type MMapHook, munmap_hook_type MunmapHook) { fex_mmap_hook = MMapHook; fex_munmap_hook = MunmapHook; } static rpmalloc_interface_t global_interface { .memory_map = FEX_rp_mmap, .memory_commit = FEX_rp_memory_commit, .memory_decommit = FEX_rp_memory_decommit, .memory_unmap = FEX_rp_memory_unmap, .map_fail_callback = nullptr, .error_callback = nullptr, }; void InitializeAllocator(size_t PageSize) { global_config.page_size = PageSize; rpmalloc_initialize_config(&global_interface, &global_config); rp_mmap_hook = FEX_rp_mmap; rp_munmap_hook = FEX_rp_memory_unmap; } #endif #elif defined(_WIN32) #error "Tried building _WIN32 without jemalloc" #else void InitializeThread() {} void* malloc(size_t size) { return ::malloc(size); } void* calloc(size_t n, size_t size) { return ::calloc(n, size); } void* memalign(size_t align, size_t s) { return ::memalign(align, s); } void* valloc(size_t size) { return ::valloc(size); } int posix_memalign(void** r, size_t a, size_t s) { return ::posix_memalign(r, a, s); } void* 
realloc(void* ptr, size_t size) { return ::realloc(ptr, size); } void free(void* ptr) { return ::free(ptr); } size_t malloc_usable_size(void* ptr) { return ::malloc_usable_size(ptr); } void* aligned_alloc(size_t a, size_t s) { return ::aligned_alloc(a, s); } void aligned_free(void* ptr) { return ::free(ptr); } void SetupAllocatorHooks(mmap_hook_type MMapHook, munmap_hook_type MunmapHook) {} void InitializeAllocator(size_t PageSize) {} #endif } // namespace FEXCore::Allocator ================================================ FILE: FEXCore/Source/Utils/AllocatorOverride.cpp ================================================ // SPDX-License-Identifier: MIT #include #include #include #include #include #include #include #include extern "C" { // The majority of FEX internal code should avoid using the glibc allocator. To ensure glibc allocations don't accidentally slip // in, FEX overrides these glibc functions with faulting variants. // // A notable exception is thunks, which should still use glibc allocations and avoid using `fextl::` namespace. // // Other minor exceptions throughout FEX use the `YesIKnowImNotSupposedToUseTheGlibcAllocator` helper to temporarily disable faulting. 
#define GLIBC_ALIAS_FUNCTION(func) __attribute__((alias(#func), visibility("default"))) extern void* __libc_calloc(size_t, size_t); void* calloc(size_t, size_t) GLIBC_ALIAS_FUNCTION(fault_calloc); extern void __libc_free(void*); void free(void*) GLIBC_ALIAS_FUNCTION(fault_free); extern void* __libc_malloc(size_t); void* malloc(size_t) GLIBC_ALIAS_FUNCTION(fault_malloc); extern void* __libc_memalign(size_t, size_t); void* memalign(size_t, size_t) GLIBC_ALIAS_FUNCTION(fault_memalign); extern void* __libc_realloc(void*, size_t); void* realloc(void*, size_t) GLIBC_ALIAS_FUNCTION(fault_realloc); extern void* __libc_valloc(size_t); void* valloc(size_t) GLIBC_ALIAS_FUNCTION(fault_valloc); extern int __posix_memalign(void**, size_t, size_t); int posix_memalign(void**, size_t, size_t) GLIBC_ALIAS_FUNCTION(fault_posix_memalign); extern size_t __malloc_usable_size(void*); size_t malloc_usable_size(void*) GLIBC_ALIAS_FUNCTION(fault_malloc_usable_size); // Reuse __libc_memalign void* aligned_alloc(size_t, size_t) GLIBC_ALIAS_FUNCTION(fault_aligned_alloc); } namespace FEXCore::Allocator { // Enable or disable allocation faulting globally. static bool GlobalEvaluate {}; // Enable or disable allocation faulting per-thread. static thread_local uint64_t SkipEvalForThread {}; // Internal memory allocation hooks to allow non-faulting allocations through. auto calloc_ptr = __libc_calloc; auto free_ptr = __libc_free; auto malloc_ptr = __libc_malloc; auto memalign_ptr = __libc_memalign; auto realloc_ptr = __libc_realloc; auto valloc_ptr = __libc_valloc; auto posix_memalign_ptr = ::posix_memalign; auto malloc_usable_size_ptr = ::malloc_usable_size; auto aligned_alloc_ptr = __libc_memalign; // Constructor for per-thread allocation faulting check. YesIKnowImNotSupposedToUseTheGlibcAllocator::YesIKnowImNotSupposedToUseTheGlibcAllocator() { ++SkipEvalForThread; } // Destructor for per-thread allocation faulting check. 
YesIKnowImNotSupposedToUseTheGlibcAllocator::~YesIKnowImNotSupposedToUseTheGlibcAllocator() { --SkipEvalForThread; } // Hard disabling of per-thread allocation fault checking. // No coming back from this, used on thread destruction. FEX_DEFAULT_VISIBILITY void YesIKnowImNotSupposedToUseTheGlibcAllocator::HardDisable() { // Just set it to half of its maximum value so it never wraps back around. SkipEvalForThread = std::numeric_limits::max() / 2; } // Enable global fault checking. void SetupFaultEvaluate() { GlobalEvaluate = true; } // Disable global fault checking. void ClearFaultEvaluate() { GlobalEvaluate = false; } // Evaluate if a glibc hooked allocation should fault. void EvaluateReturnAddress(void* Return) { if (!GlobalEvaluate) { // Fault evaluation disabled globally. return; } if (SkipEvalForThread) { // Fault evaluation currently disabled for this thread. return; } // We don't know where we are when allocating. Make sure to be safe and generate the string on the stack. // Print an error message to let a developer know that an allocation faulted. char Tmp[512]; auto Res = fmt::format_to_n(Tmp, 512, "ERROR: Requested memory using non-FEX allocator at 0x{:x}\n", reinterpret_cast(Return)); Tmp[Res.size] = 0; write(STDERR_FILENO, Tmp, Res.size); // Trap the execution to stop FEX in its tracks. FEX_TRAP_EXECUTION; } } // namespace FEXCore::Allocator extern "C" { // These are the glibc allocator override symbols. // These will override the glibc allocators and then check if the allocation should fault. 
// Every override below follows the same two steps:
//  1) hand the caller's return address to EvaluateReturnAddress, which decides whether this call should fault;
//  2) forward the request through the matching FEXCore::Allocator::*_ptr function pointer.

// calloc override.
void* fault_calloc(size_t n, size_t size) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::calloc_ptr(n, size);
}

// free override.
void fault_free(void* ptr) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  FEXCore::Allocator::free_ptr(ptr);
}

// malloc override.
void* fault_malloc(size_t size) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::malloc_ptr(size);
}

// memalign override.
void* fault_memalign(size_t align, size_t s) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::memalign_ptr(align, s);
}

// realloc override.
void* fault_realloc(void* ptr, size_t size) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::realloc_ptr(ptr, size);
}

// valloc override.
void* fault_valloc(size_t size) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::valloc_ptr(size);
}

// posix_memalign override. The forwarded call's status code is returned unchanged.
int fault_posix_memalign(void** r, size_t a, size_t s) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::posix_memalign_ptr(r, a, s);
}

// malloc_usable_size override.
size_t fault_malloc_usable_size(void* ptr) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::malloc_usable_size_ptr(ptr);
}

// aligned_alloc override.
void* fault_aligned_alloc(size_t a, size_t s) {
  FEXCore::Allocator::EvaluateReturnAddress(__builtin_extract_return_addr(__builtin_return_address(0)));
  return FEXCore::Allocator::aligned_alloc_ptr(a, s);
}
}

================================================
FILE: FEXCore/Source/Utils/ArchHelpers/Arm64.cpp
================================================
// SPDX-License-Identifier: MIT
#include "Interface/Core/CPUBackend.h"
#include "Interface/Context/Context.h"
#include "Utils/SpinWaitLock.h"

// NOTE(review): the header names of the directives below are missing in this copy of the file.
// The same appears to apply to template argument lists further down (e.g. `std::atomic_ref(...)`,
// `reinterpret_cast(...)`, `static_cast(...)`). Presumably angle-bracket content was stripped;
// confirm against the repository before building from this text.
#include
#include
#include
#include
#include
#include
#include

namespace FEXCore::ArchHelpers::Arm64 {
// AArch64 instruction-word bit patterns.
// Elsewhere in this file they are used as `(Instr & X_MASK) == X_INST` (e.g. for CLREX, CMP, CCMP and STLXP).
// NOTE(review): *_INST values without a same-named mask presumably share the nearest preceding *_MASK;
// confirm at the use sites.
constexpr uint32_t CASPAL_MASK = 0xBF'E0'FC'00;
constexpr uint32_t CASPAL_INST = 0x08'60'FC'00;

constexpr uint32_t CASAL_MASK = 0x3F'E0'FC'00;
constexpr uint32_t CASAL_INST = 0x08'E0'FC'00;

constexpr uint32_t ATOMIC_MEM_MASK = 0x3B200C00;
constexpr uint32_t ATOMIC_MEM_INST = 0x38200000;

constexpr uint32_t RCPC2_MASK = 0x3F'E0'0C'00;
constexpr uint32_t LDAPUR_INST = 0x19'40'00'00;
constexpr uint32_t STLUR_INST = 0x19'00'00'00;

constexpr uint32_t LDAXP_MASK = 0xBF'FF'80'00;
constexpr uint32_t LDAXP_INST = 0x88'7F'80'00;

constexpr uint32_t STLXP_MASK = 0xBF'E0'80'00;
constexpr uint32_t STLXP_INST = 0x88'20'80'00;

constexpr uint32_t LDAXR_MASK = 0x3F'FF'FC'00;
constexpr uint32_t LDAXR_INST = 0x08'5F'FC'00;
constexpr uint32_t LDAR_INST = 0x08'DF'FC'00;
constexpr uint32_t LDAPR_INST = 0x38'BF'C0'00;
constexpr uint32_t STLR_INST = 0x08'9F'FC'00;

constexpr uint32_t STLXR_MASK = 0x3F'E0'FC'00;
constexpr uint32_t STLXR_INST = 0x08'00'FC'00;

// Load/store register (register offset) (Rm encoded as xzr)
constexpr uint32_t LDSTREGISTER_MASK = 0b0011'1111'1111'1111'1111'1100'0000'0000;
constexpr uint32_t LDR_INST = 0b0011'1000'0111'1111'0110'1000'0000'0000;
constexpr uint32_t STR_INST = 0b0011'1000'0011'1111'0110'1000'0000'0000;

constexpr uint32_t LDSTUNSCALED_MASK = 0b0011'1011'1110'0000'0000'1100'0000'0000;
constexpr uint32_t LDUR_INST = 0b0011'1000'0100'0000'0000'0000'0000'0000;
constexpr uint32_t STUR_INST = 0b0011'1000'0000'0000'0000'0000'0000'0000;

constexpr uint32_t LDSTP_MASK = 0b0011'1011'1000'0000'0000'0000'0000'0000;
constexpr uint32_t STP_INST = 0b0010'1001'0000'0000'0000'0000'0000'0000;

constexpr uint32_t CBNZ_MASK = 0x7F'00'00'00;
constexpr uint32_t CBNZ_INST = 0x35'00'00'00;

constexpr uint32_t ALU_OP_MASK = 0x7F'20'00'00;
constexpr uint32_t ADD_INST = 0x0B'00'00'00;
constexpr uint32_t SUB_INST = 0x4B'00'00'00;
constexpr uint32_t ADD_SHIFT_INST = 0x0B'20'00'00;
constexpr uint32_t SUB_SHIFT_INST = 0x4B'20'00'00;
constexpr uint32_t CMP_INST = 0x6B'00'00'00;
constexpr uint32_t CMP_SHIFT_INST = 0x6B'20'00'00;
constexpr uint32_t AND_INST = 0x0A'00'00'00;
constexpr uint32_t BIC_INST = 0x0A'20'00'00;
constexpr uint32_t OR_INST = 0x2A'00'00'00;
constexpr uint32_t ORN_INST = 0x2A'20'00'00;
constexpr uint32_t EOR_INST = 0x4A'00'00'00;
constexpr uint32_t EON_INST = 0x4A'20'00'00;

constexpr uint32_t CCMP_MASK = 0x7F'E0'0C'10;
constexpr uint32_t CCMP_INST = 0x7A'40'00'00;

constexpr uint32_t CLREX_MASK = 0xFF'FF'F0'FF;
constexpr uint32_t CLREX_INST = 0xD5'03'30'5F;

// Operation kinds for an exclusive load/store pair.
// NOTE(review): no use of this enum is visible in this part of the file; confirm its consumers.
enum ExclusiveAtomicPairType {
  TYPE_SWAP,
  TYPE_ADD,
  TYPE_SUB,
  TYPE_AND,
  TYPE_BIC,
  TYPE_OR,
  TYPE_ORN,
  TYPE_EOR,
  TYPE_EON,
  TYPE_NEG, // This is just a sub with zero. Need to know the differences
};

// Load ops are 4 bits
// Acquire and release bits are independent on the instruction
constexpr uint32_t ATOMIC_ADD_OP = 0b0000;
constexpr uint32_t ATOMIC_CLR_OP = 0b0001;
constexpr uint32_t ATOMIC_EOR_OP = 0b0010;
constexpr uint32_t ATOMIC_SET_OP = 0b0011;
constexpr uint32_t ATOMIC_SWAP_OP = 0b1000;

// Register fields are 5 bits wide; the offsets are the bit position of each field in the instruction word.
constexpr uint32_t REGISTER_MASK = 0b11111;
constexpr uint32_t RD_OFFSET = 0;
constexpr uint32_t RN_OFFSET = 5;
constexpr uint32_t RM_OFFSET = 16;

constexpr uint32_t DMB = 0b1101'0101'0000'0011'0011'0000'1011'1111 | 0b1011'0000'0000; // Inner shareable all

constexpr uint32_t DMB_LD = 0b1101'0101'0000'0011'0011'0000'1011'1111 | 0b1101'0000'0000; // Inner shareable load

// Returns the 5-bit register field at bits [4:0] of Instr.
static constexpr uint32_t GetRdReg(uint32_t Instr) {
  return (Instr >> RD_OFFSET) & REGISTER_MASK;
}

// Returns the 5-bit register field at bits [9:5] of Instr.
static constexpr uint32_t GetRnReg(uint32_t Instr) {
  return (Instr >> RN_OFFSET) & REGISTER_MASK;
}

// Returns the 5-bit register field at bits [20:16] of Instr.
static constexpr uint32_t GetRmReg(uint32_t Instr) {
  return (Instr >> RM_OFFSET) & REGISTER_MASK;
}

// Passes the range [Begin, Begin + Length) to the compiler's instruction-cache clearing builtin.
static void ClearICache(void* Begin, std::size_t Length) {
  __builtin___clear_cache(static_cast(Begin), static_cast(Begin) + Length);
}

// Loads 128 bits from Addr with a single ldaxp and immediately drops the exclusive monitor with clrex.
// The two 64-bit halves are recombined as (Upper << 64) | Lower.
// NOTE(review): callers in this part of the file always pass a 16-byte aligned Addr; presumably required.
static __uint128_t LoadAcquire128(uint64_t Addr) {
  __uint128_t Result {};
  uint64_t Lower;
  uint64_t Upper;
  // This specifically avoids using std::atomic<__uint128_t>
  // std::atomic helper does a ldaxp + stxp pair that crashes when the page is only mapped readable
  __asm volatile(
    R"( ldaxp %[ResultLower], %[ResultUpper], [%[Addr]]; clrex; )"
    : [ResultLower] "=r"(Lower), [ResultUpper] "=r"(Upper)
    : [Addr] "r"(Addr)
    : "memory");
  Result = Upper;
  Result <<= 64;
  Result |= Lower;
  return Result;
}

// Atomic load with acquire ordering from the object at Addr; the value is returned as uint64_t.
static uint64_t LoadAcquire64(uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.load(std::memory_order_acquire);
}

// Strong compare-exchange on the object at Addr using the default memory ordering.
// Returns true if Val was stored; on failure compare_exchange_strong writes the observed value into Expected.
static bool StoreCAS64(uint64_t& Expected, uint64_t Val, uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.compare_exchange_strong(Expected, Val);
}

// Atomic load with acquire ordering from the object at Addr; the value is returned as uint32_t.
static uint32_t LoadAcquire32(uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.load(std::memory_order_acquire);
}

// Same contract as StoreCAS64, with uint32_t operands.
static bool StoreCAS32(uint32_t& Expected, uint32_t Val, uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.compare_exchange_strong(Expected, Val);
}

// Atomic load with acquire ordering from the object at Addr; the value is returned as uint8_t.
static uint8_t LoadAcquire8(uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.load(std::memory_order_acquire);
}

// Same contract as StoreCAS64, with uint8_t operands.
static bool StoreCAS8(uint8_t& Expected, uint8_t Val, uint64_t Addr) {
  auto Atom = std::atomic_ref(*reinterpret_cast(Addr));
  return Atom.compare_exchange_strong(Expected, Val);
}

// Loads the 16-bit value at Addr, which may be misaligned.
// The access width is picked from where the two bytes sit relative to 4/8/16-byte boundaries:
// a single wider aligned load when both bytes fit in one 16-byte block, two byte loads otherwise.
// The wanted bytes are then shifted down from the wider value.
static uint16_t DoLoad16(uint64_t Addr) {
  uint64_t AlignmentMask = 0b1111;
  if ((Addr & AlignmentMask) == 15) {
    // Address crosses over 16byte or 64byte threshold
    // Needs two loads
    uint64_t AddrUpper = Addr + 1;

    uint8_t ActualUpper {};
    uint8_t ActualLower {};
    // Careful ordering here
    ActualUpper = LoadAcquire8(AddrUpper);
    ActualLower = LoadAcquire8(Addr);

    // Little-endian recombination: the byte at the higher address is the high byte.
    uint16_t Result = ActualUpper;
    Result <<= 8;
    Result |= ActualLower;
    return Result;
  } else {
    AlignmentMask = 0b111;
    if ((Addr & AlignmentMask) == 7) {
      // Crosses 8byte boundary
      // Needs 128bit load
      // Fits within a 16byte region
      uint64_t Alignment = Addr & 0b1111;
      Addr &= ~0b1111ULL;

      __uint128_t TmpResult = LoadAcquire128(Addr);

      // Zexts the result
      uint16_t Result = TmpResult >> (Alignment * 8);
      return Result;
    } else {
      AlignmentMask = 0b11;
      if ((Addr & AlignmentMask) == 3) {
        // Crosses 4byte boundary
        // Needs 64bit Load
        // (Addr & 7) == 7 was handled above, so (Addr & 7) == 3 here and clearing the low two bits
        // leaves an 8-byte aligned address.
        uint64_t Alignment = Addr & AlignmentMask;
        Addr &= ~AlignmentMask;

        auto Atomic = std::atomic_ref(*reinterpret_cast(Addr));
        uint64_t TmpResult = Atomic.load();

        // Zexts the result
        uint16_t Result = TmpResult >> (Alignment * 8);
        return Result;
      } else {
        // Fits within 4byte boundary
        // Only needs 32bit Load
        // Only alignment offset will be 1 here
        uint64_t Alignment = Addr & AlignmentMask;
        Addr &= ~AlignmentMask;

        auto Atomic = std::atomic_ref(*reinterpret_cast(Addr));
        uint32_t TmpResult = Atomic.load();

        // Zexts the result
        uint16_t Result = TmpResult >> (Alignment * 8);
        return Result;
      }
    }
  }
}

// Loads the 32-bit value at Addr, which may be misaligned.
// Same approach as DoLoad16: one aligned wider load when the four bytes fit in a 16-byte block,
// two 32-bit loads when they straddle one. The return type truncates the shifted value to 32 bits.
static uint32_t DoLoad32(uint64_t Addr) {
  uint64_t AlignmentMask = 0b1111;
  if ((Addr & AlignmentMask) > 12) {
    // Address crosses over 16byte threshold
    // Needs dual 32bit load
    // Offsets 13..15 within the 16-byte block reach here, so the offset within the 4-byte word is 1..3.
    uint64_t Alignment = Addr & 0b11;
    Addr &= ~0b11ULL;

    uint64_t AddrUpper = Addr + 4;

    // Careful ordering here
    uint32_t ActualUpper = LoadAcquire32(AddrUpper);
    uint32_t ActualLower = LoadAcquire32(Addr);

    uint64_t Result = ActualUpper;
    Result <<= 32;
    Result |= ActualLower;
    return Result >> (Alignment * 8);
  } else {
    AlignmentMask = 0b111;
    if ((Addr & AlignmentMask) >= 5) {
      // Crosses 8byte boundary
      // Needs 128bit load
      // Fits within a 16byte region
      uint64_t Alignment = Addr & 0b1111;
      Addr &= ~0b1111ULL;

      __uint128_t TmpResult = LoadAcquire128(Addr);

      return TmpResult >> (Alignment * 8);
    } else {
      // Fits within 8byte boundary
      // Only needs a 64bit load
      // Offsets within the 8-byte word are below 5 here
      uint64_t Alignment = Addr & AlignmentMask;
      Addr &= ~AlignmentMask;

      auto Atomic = std::atomic_ref(*reinterpret_cast(Addr));
      uint64_t TmpResult = Atomic.load();

      return TmpResult >> (Alignment * 8);
    }
  }
}
static uint64_t DoLoad64(uint64_t Addr) { uint64_t AlignmentMask = 0b1111; if ((Addr & AlignmentMask) > 8) { uint64_t Alignment = Addr & 0b111; Addr &= ~0b111ULL; uint64_t AddrUpper = Addr + 8; // Crosses a 16byte boundary // Needs two 8 byte loads uint64_t ActualUpper {}; uint64_t ActualLower {}; // Careful ordering here ActualUpper = LoadAcquire64(AddrUpper); ActualLower = LoadAcquire64(Addr); __uint128_t Result = ActualUpper; Result <<= 64; Result |= ActualLower; return Result >> (Alignment * 8); } else { // Fits within a 16byte region uint64_t Alignment = Addr & AlignmentMask; Addr &= ~AlignmentMask; __uint128_t TmpResult = LoadAcquire128(Addr); uint64_t Result = TmpResult >> (Alignment * 8); return Result; } } static __uint128_t DoLoad128(uint64_t Addr) { // Any misalignment here means we cross a 16byte boundary // So we need two 128bit loads uint64_t Alignment = Addr & 0b1111; Addr &= ~0b1111ULL; uint64_t AddrUpper = Addr + 16; union AlignedData { struct { __uint128_t Lower; __uint128_t Upper; } Large; struct { uint8_t Data[32]; } Bytes; }; AlignedData* Data = reinterpret_cast(alloca(sizeof(AlignedData))); Data->Large.Upper = LoadAcquire128(AddrUpper); Data->Large.Lower = LoadAcquire128(Addr); __uint128_t Result {}; memcpy(&Result, &Data->Bytes.Data[Alignment], sizeof(Result)); return Result; } static bool RunCASPAL(uint64_t* GPRs, uint32_t Size, uint32_t DesiredReg1, uint32_t DesiredReg2, uint32_t ExpectedReg1, uint32_t ExpectedReg2, uint32_t AddressReg, uint32_t* StrictSplitLockMutex) { std::optional> Lock {}; if (Size == 0) { // 32bit uint64_t Addr = GPRs[AddressReg]; uint32_t DesiredLower = GPRs[DesiredReg1]; uint32_t DesiredUpper = GPRs[DesiredReg2]; uint32_t ExpectedLower = GPRs[ExpectedReg1]; uint32_t ExpectedUpper = GPRs[ExpectedReg2]; // Cross-cacheline CAS doesn't work on ARM // It isn't even guaranteed to work on x86 // Intel will do a "split lock" which locks the full bus // AMD will tear instead // Both cross-cacheline and cross 16byte both need 
dual CAS loops that can tear // ARMv8.4 LSE2 solves all atomic issues except cross-cacheline // Check for Split lock across a cacheline if ((Addr & 63) > 56) { FEXCORE_TELEMETRY_SET(TYPE_HAS_SPLIT_LOCKS, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } } uint64_t AlignmentMask = 0b1111; if ((Addr & AlignmentMask) > 8) { FEXCORE_TELEMETRY_SET(TYPE_16BYTE_SPLIT, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } uint64_t Alignment = Addr & 0b111; Addr &= ~0b111ULL; uint64_t AddrUpper = Addr + 8; // Crosses a 16byte boundary // Need to do 256bit atomic, but since that doesn't exist we need to do a dual CAS loop __uint128_t Mask = ~0ULL; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; __uint128_t Desired = DesiredUpper; Desired <<= 32; Desired |= DesiredLower; Desired <<= Alignment * 8; __uint128_t Expected = ExpectedUpper; Expected <<= 32; Expected |= ExpectedLower; Expected <<= Alignment * 8; while (1) { __uint128_t LoadOrderUpper = LoadAcquire64(AddrUpper); LoadOrderUpper <<= 64; __uint128_t TmpActual = LoadOrderUpper | LoadAcquire64(Addr); // Set up expected TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected; // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired; uint64_t TmpExpectedLower = TmpExpected; uint64_t TmpExpectedUpper = TmpExpected >> 64; uint64_t TmpDesiredLower = TmpDesired; uint64_t TmpDesiredUpper = TmpDesired >> 64; if (TmpExpected == TmpActual) { if (StoreCAS64(TmpExpectedUpper, TmpDesiredUpper, AddrUpper)) { if (StoreCAS64(TmpExpectedLower, TmpDesiredLower, Addr)) { // Stored successfully return true; } else { // CAS managed to tear, we can't really solve this // Continue down the path to let the guest know values weren't expected FEXCORE_TELEMETRY_SET(TYPE_CAS_128BIT_TEAR, 1); } } TmpExpected = TmpExpectedUpper; TmpExpected <<= 64; TmpExpected |= 
TmpExpectedLower; } else { // Mismatch up front TmpExpected = TmpActual; } // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredOurBits = TmpDesired & Mask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } if ((FailedResultOurBits ^ FailedDesiredOurBits) != 0) { // If the bits changed that we were wanting to change then we have failed and can return // We need to extract the bits and return them in EXPECTED uint64_t FailedResult = FailedResultOurBits >> (Alignment * 8); GPRs[ExpectedReg1] = FailedResult & ~0U; GPRs[ExpectedReg2] = FailedResult >> 32; return true; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint64_t FailedResult = FailedResultOurBits >> (Alignment * 8); GPRs[ExpectedReg1] = FailedResult & ~0U; GPRs[ExpectedReg2] = FailedResult >> 32; return true; } } else { // Fits within a 16byte region uint64_t Alignment = Addr & 0b1111; Addr &= ~0b1111ULL; auto Atomic128 = std::atomic_ref<__uint128_t>(*reinterpret_cast<__uint128_t*>(Addr)); __uint128_t Mask = ~0ULL; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; __uint128_t Desired = (uint64_t)DesiredUpper << 32 | DesiredLower; Desired <<= Alignment * 8; __uint128_t Expected = (uint64_t)ExpectedUpper << 32 | ExpectedLower; Expected <<= Alignment * 8; while (1) { TmpExpected = Atomic128.load(); // Set up expected TmpExpected &= NegMask; TmpExpected |= Expected; // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired; bool CASResult = 
Atomic128.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Successful, so we are done return true; } else { // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint64_t FailedResult = FailedResultOurBits >> (Alignment * 8); GPRs[ExpectedReg1] = FailedResult & ~0U; GPRs[ExpectedReg2] = FailedResult >> 32; return true; } } } } return false; } static bool HandleCASPAL(uint32_t Instr, uint64_t* GPRs, uint32_t* StrictSplitLockMutex) { uint32_t Size = (Instr >> 30) & 1; uint32_t DesiredReg1 = Instr & 0b11111; uint32_t DesiredReg2 = DesiredReg1 + 1; uint32_t ExpectedReg1 = (Instr >> 16) & 0b11111; uint32_t ExpectedReg2 = ExpectedReg1 + 1; uint32_t AddressReg = (Instr >> 5) & 0b11111; return RunCASPAL(GPRs, Size, DesiredReg1, DesiredReg2, ExpectedReg1, ExpectedReg2, AddressReg, StrictSplitLockMutex); } static uint64_t HandleCASPAL_ARMv8(uint32_t Instr, uintptr_t ProgramCounter, uint64_t* GPRs, uint32_t* StrictSplitLockMutex) { // caspair // [1] ldaxp(TMP2.W(), TMP3.W(), MemOperand(MemSrc)); <-- DataReg & AddrReg // [2] cmp(TMP2.W(), Expected.first.W()); <-- ExpectedReg1 // [3] ccmp(TMP3.W(), Expected.second.W(), NoFlag, Condition::eq); <-- ExpectedREg2 // [4] b(&LoopNotExpected, Condition::ne); // [5] stlxp(TMP2.W(), Desired.first.W(), Desired.second.W(), MemOperand(MemSrc)); <-- DesiredReg // [6] cbnz(TMP2.W(), &LoopTop); // [7] mov(Dst.first.W(), Expected.first.W()); // [8] mov(Dst.second.W(), 
Expected.second.W()); // [9] b(&LoopExpected); // [10] mov(Dst.first.W(), TMP2.W()); // [11] mov(Dst.second.W(), TMP3.W()); // [12] clrex(); uint32_t* PC = (uint32_t*)ProgramCounter; uint32_t Size = (Instr >> 30) & 1; uint32_t AddrReg = (Instr >> 5) & 0x1F; uint32_t DataReg = Instr & 0x1F; uint32_t DataReg2 = (Instr >> 10) & 0x1F; uint32_t ExpectedReg1 {}; uint32_t ExpectedReg2 {}; uint32_t DesiredReg1 {}; uint32_t DesiredReg2 {}; if (Size == 1) { // 64-bit pair happens on paranoid vector loads // [1] ldaxp(TMP1, TMP2, MemSrc); // [2] clrex(); // // 64-bit pair happens on paranoid vector stores // [1] ldaxp(xzr, TMP3, MemSrc); // <- Can hit SIGBUS // [2] stlxp(TMP3, TMP1, TMP2, MemSrc); // <- Can also hit SIGBUS // [3] cbnz(TMP3, &B); // < Overwritten with DMB if (DataReg == 31) { } else { uint32_t NextInstr = PC[1]; if ((NextInstr & ArchHelpers::Arm64::CLREX_MASK) == ArchHelpers::Arm64::CLREX_INST) { uint64_t Addr = GPRs[AddrReg]; auto Res = DoLoad128(Addr); // We set the result register if it isn't a zero register if (DataReg != 31) { GPRs[DataReg] = Res; } if (DataReg2 != 31) { GPRs[DataReg2] = Res >> 64; } // Skip ldaxp and clrex return 2 * sizeof(uint32_t); } } return 0; } // Only 32-bit pairs for (int i = 1; i < 10; i++) { uint32_t NextInstr = PC[i]; if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_INST || (NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_SHIFT_INST) { ExpectedReg1 = GetRmReg(NextInstr); } else if ((NextInstr & ArchHelpers::Arm64::CCMP_MASK) == ArchHelpers::Arm64::CCMP_INST) { ExpectedReg2 = GetRmReg(NextInstr); } else if ((NextInstr & ArchHelpers::Arm64::STLXP_MASK) == ArchHelpers::Arm64::STLXP_INST) { DesiredReg1 = (NextInstr & 0x1F); DesiredReg2 = (NextInstr >> 10) & 0x1F; } } // mov expected into the temp registers used by JIT GPRs[DataReg] = GPRs[ExpectedReg1]; GPRs[DataReg2] = GPRs[ExpectedReg2]; if (RunCASPAL(GPRs, Size, DesiredReg1, DesiredReg2, DataReg, DataReg2, AddrReg, 
StrictSplitLockMutex)) { return 9 * sizeof(uint32_t); // skip to mov + clrex } else { return 0; } } template using CASExpectedFn = T (*)(T Src, T Expected); template using CASDesiredFn = T (*)(T Src, T Desired); template static uint16_t DoCAS16(uint16_t DesiredSrc, uint16_t ExpectedSrc, uint64_t Addr, CASExpectedFn ExpectedFunction, CASDesiredFn DesiredFunction, uint32_t* StrictSplitLockMutex) { std::optional> Lock {}; if ((Addr & 63) == 63) { FEXCORE_TELEMETRY_SET(TYPE_HAS_SPLIT_LOCKS, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } } // 16 bit uint64_t AlignmentMask = 0b1111; if ((Addr & AlignmentMask) == 15) { FEXCORE_TELEMETRY_SET(TYPE_16BYTE_SPLIT, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } // Address crosses over 16byte or 64byte threshold // Need a dual 8bit CAS loop uint64_t AddrUpper = Addr + 1; while (1) { uint8_t ActualUpper {}; uint8_t ActualLower {}; // Careful ordering here ActualUpper = LoadAcquire8(AddrUpper); ActualLower = LoadAcquire8(Addr); uint16_t Actual = ActualUpper; Actual <<= 8; Actual |= ActualLower; uint16_t Desired = DesiredFunction(Actual, DesiredSrc); uint8_t DesiredLower = Desired; uint8_t DesiredUpper = Desired >> 8; uint16_t Expected = ExpectedFunction(Actual, ExpectedSrc); uint8_t ExpectedLower = Expected; uint8_t ExpectedUpper = Expected >> 8; bool Tear = false; if (ActualUpper == ExpectedUpper && ActualLower == ExpectedLower) { if (StoreCAS8(ExpectedUpper, DesiredUpper, AddrUpper)) { if (StoreCAS8(ExpectedLower, DesiredLower, Addr)) { // Stored successfully return Expected; } else { // CAS managed to tear, we can't really solve this // Continue down the path to let the guest know values weren't expected Tear = true; FEXCORE_TELEMETRY_SET(TYPE_CAS_16BIT_TEAR, 1); } } ActualLower = ExpectedLower; } // If the bits changed that we were wanting to change then we have failed and can return // We need to extract the bits and return them in 
EXPECTED uint16_t FailedResult = ActualUpper; FailedResult <<= 8; FailedResult |= ActualLower; if constexpr (Retry) { if (Tear) { // If we are retrying and tearing then we can't do anything here // XXX: Resolve with TME return FailedResult; } else { // We can retry safely } } else { // Without Retry (CAS) then we have failed regardless of tear // CAS failed but handled successfully return FailedResult; } } } else { AlignmentMask = 0b111; if ((Addr & AlignmentMask) == 7) { // Crosses 8byte boundary // Needs 128bit CAS // Fits within a 16byte region uint64_t Alignment = Addr & 0b1111; Addr &= ~0b1111ULL; auto Atomic128 = std::atomic_ref<__uint128_t>(*reinterpret_cast<__uint128_t*>(Addr)); __uint128_t Mask = 0xFFFF; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; while (1) { TmpExpected = Atomic128.load(); __uint128_t Desired = DesiredFunction(TmpExpected >> (Alignment * 8), DesiredSrc); Desired <<= Alignment * 8; __uint128_t Expected = ExpectedFunction(TmpExpected >> (Alignment * 8), ExpectedSrc); Expected <<= Alignment * 8; // Set up expected TmpExpected &= NegMask; TmpExpected |= Expected; // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired; bool CASResult = Atomic128.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Successful, so we are done return Expected >> (Alignment * 8); } else { if constexpr (Retry) { // If we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then 
we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint16_t FailedResult = FailedResultOurBits >> (Alignment * 8); // CAS failed but handled successfully return FailedResult; } } } else { AlignmentMask = 0b11; if ((Addr & AlignmentMask) == 3) { // Crosses 4byte boundary // Needs 64bit CAS uint64_t Alignment = Addr & AlignmentMask; Addr &= ~AlignmentMask; uint64_t Mask = 0xFFFF; Mask <<= Alignment * 8; uint64_t NegMask = ~Mask; uint64_t TmpExpected {}; uint64_t TmpDesired {}; auto Atomic = std::atomic_ref(*reinterpret_cast(Addr)); while (1) { TmpExpected = Atomic.load(); uint64_t Desired = DesiredFunction(TmpExpected >> (Alignment * 8), DesiredSrc); Desired <<= Alignment * 8; uint64_t Expected = ExpectedFunction(TmpExpected >> (Alignment * 8), ExpectedSrc); Expected <<= Alignment * 8; // Set up expected TmpExpected &= NegMask; TmpExpected |= Expected; // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired; bool CASResult = Atomic.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Successful, so we are done return Expected >> (Alignment * 8); } else { if constexpr (Retry) { // If we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we can try again uint64_t FailedResultOurBits = TmpExpected & Mask; uint64_t FailedResultNotOurBits = TmpExpected & NegMask; uint64_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory 
location // This means our CAS fails because what we wanted to store was already stored uint16_t FailedResult = FailedResultOurBits >> (Alignment * 8); // CAS failed but handled successfully return FailedResult; } } } else { // Fits within 4byte boundary // Only needs 32bit CAS // Only alignment offset will be 1 here uint64_t Alignment = Addr & AlignmentMask; Addr &= ~AlignmentMask; uint32_t Mask = 0xFFFF; Mask <<= Alignment * 8; uint32_t NegMask = ~Mask; uint32_t TmpExpected {}; uint32_t TmpDesired {}; auto Atomic = std::atomic_ref(*reinterpret_cast(Addr)); while (1) { TmpExpected = Atomic.load(); uint32_t Desired = DesiredFunction(TmpExpected >> (Alignment * 8), DesiredSrc); Desired <<= Alignment * 8; uint32_t Expected = ExpectedFunction(TmpExpected >> (Alignment * 8), ExpectedSrc); Expected <<= Alignment * 8; // Set up expected TmpExpected &= NegMask; TmpExpected |= Expected; // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired; bool CASResult = Atomic.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Successful, so we are done return Expected >> (Alignment * 8); } else { if constexpr (Retry) { // If we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we can try again uint32_t FailedResultOurBits = TmpExpected & Mask; uint32_t FailedResultNotOurBits = TmpExpected & NegMask; uint32_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint16_t FailedResult = FailedResultOurBits >> (Alignment * 
8); // CAS failed but handled successfully return FailedResult; } } } } } } template static uint32_t DoCAS32(uint32_t DesiredSrc, uint32_t ExpectedSrc, uint64_t Addr, CASExpectedFn ExpectedFunction, CASDesiredFn DesiredFunction, uint32_t* StrictSplitLockMutex) { std::optional> Lock {}; if ((Addr & 63) > 60) { FEXCORE_TELEMETRY_SET(TYPE_HAS_SPLIT_LOCKS, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } } // 32 bit uint64_t AlignmentMask = 0b1111; if ((Addr & AlignmentMask) > 12) { FEXCORE_TELEMETRY_SET(TYPE_16BYTE_SPLIT, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } // Address crosses over 16byte threshold // Needs dual 4 byte CAS loop uint64_t Alignment = Addr & 0b11; Addr &= ~0b11; uint64_t AddrUpper = Addr + 4; uint64_t Mask = ~0U; Mask <<= Alignment * 8; uint64_t NegMask = ~Mask; // Careful ordering here while (1) { uint64_t LoadOrderUpper = LoadAcquire32(AddrUpper); LoadOrderUpper <<= 32; uint64_t TmpActual = LoadOrderUpper | LoadAcquire32(Addr); uint64_t Desired = DesiredFunction(TmpActual >> (Alignment * 8), DesiredSrc); uint64_t Expected = ExpectedFunction(TmpActual >> (Alignment * 8), ExpectedSrc); uint64_t TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected << (Alignment * 8); uint64_t TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired << (Alignment * 8); bool Tear = false; if (TmpExpected == TmpActual) { uint32_t TmpExpectedLower = TmpExpected; uint32_t TmpExpectedUpper = TmpExpected >> 32; uint32_t TmpDesiredLower = TmpDesired; uint32_t TmpDesiredUpper = TmpDesired >> 32; if (StoreCAS32(TmpExpectedUpper, TmpDesiredUpper, AddrUpper)) { if (StoreCAS32(TmpExpectedLower, TmpDesiredLower, Addr)) { // Stored successfully return Expected; } else { // CAS managed to tear, we can't really solve this // Continue down the path to let the guest know values weren't expected Tear = true; FEXCORE_TELEMETRY_SET(TYPE_CAS_32BIT_TEAR, 1); } } 
TmpExpected = TmpExpectedUpper; TmpExpected <<= 32; TmpExpected |= TmpExpectedLower; } else { // Mismatch up front TmpExpected = TmpActual; } // Not successful // Now we need to check the results to see if we need to try again uint64_t FailedResultOurBits = TmpExpected & Mask; uint64_t FailedResultNotOurBits = TmpExpected & NegMask; uint64_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint32_t FailedResult = FailedResultOurBits >> (Alignment * 8); if constexpr (Retry) { if (Tear) { // If we are retrying and tearing then we can't do anything here // XXX: Resolve with TME return FailedResult; } else { // We can retry safely } } else { // Without Retry (CAS) then we have failed regardless of tear // CAS failed but handled successfully return FailedResult; } } } else { AlignmentMask = 0b111; if ((Addr & AlignmentMask) >= 5) { // Crosses 8byte boundary // Needs 128bit CAS // Fits within a 16byte region uint64_t Alignment = Addr & 0b1111; Addr &= ~0b1111ULL; auto Atomic128 = std::atomic_ref<__uint128_t>(*reinterpret_cast<__uint128_t*>(Addr)); __uint128_t Mask = ~0U; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; while (1) { __uint128_t TmpActual = Atomic128.load(); __uint128_t Desired = DesiredFunction(TmpActual >> (Alignment * 8), DesiredSrc); __uint128_t Expected = ExpectedFunction(TmpActual >> (Alignment * 8), ExpectedSrc); // Set up expected TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected << (Alignment * 8); // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired << (Alignment * 8); bool 
CASResult = Atomic128.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Stored successfully return Expected; } else { if constexpr (Retry) { // If we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint32_t FailedResult = FailedResultOurBits >> (Alignment * 8); // CAS failed but handled successfully return FailedResult; } } } else { // Fits within 8byte boundary // Only needs 64bit CAS // Alignments can be [1,5) uint64_t Alignment = Addr & AlignmentMask; Addr &= ~AlignmentMask; uint64_t Mask = ~0U; Mask <<= Alignment * 8; uint64_t NegMask = ~Mask; uint64_t TmpExpected {}; uint64_t TmpDesired {}; auto Atomic = std::atomic_ref(*reinterpret_cast(Addr)); while (1) { uint64_t TmpActual = Atomic.load(); uint64_t Desired = DesiredFunction(TmpActual >> (Alignment * 8), DesiredSrc); uint64_t Expected = ExpectedFunction(TmpActual >> (Alignment * 8), ExpectedSrc); // Set up expected TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected << (Alignment * 8); // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired << (Alignment * 8); bool CASResult = Atomic.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Stored successfully return Expected; } else { if constexpr (Retry) { // If 
we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we can try again uint64_t FailedResultOurBits = TmpExpected & Mask; uint64_t FailedResultNotOurBits = TmpExpected & NegMask; uint64_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint32_t FailedResult = FailedResultOurBits >> (Alignment * 8); // CAS failed but handled successfully return FailedResult; } } } } } template static uint64_t DoCAS64(uint64_t DesiredSrc, uint64_t ExpectedSrc, uint64_t Addr, CASExpectedFn ExpectedFunction, CASDesiredFn DesiredFunction, uint32_t* StrictSplitLockMutex) { std::optional> Lock {}; if ((Addr & 63) > 56) { FEXCORE_TELEMETRY_SET(TYPE_HAS_SPLIT_LOCKS, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } } // 64bit uint64_t AlignmentMask = 0b1111; if ((Addr & AlignmentMask) > 8) { FEXCORE_TELEMETRY_SET(TYPE_16BYTE_SPLIT, 1); if (StrictSplitLockMutex && !Lock.has_value()) { Lock.emplace(StrictSplitLockMutex); } uint64_t Alignment = Addr & 0b111; Addr &= ~0b111ULL; uint64_t AddrUpper = Addr + 8; // Crosses a 16byte boundary // Need to do 256bit atomic, but since that doesn't exist we need to do a dual CAS loop __uint128_t Mask = ~0ULL; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; while (1) { __uint128_t LoadOrderUpper = LoadAcquire64(AddrUpper); LoadOrderUpper <<= 64; __uint128_t TmpActual = LoadOrderUpper | LoadAcquire64(Addr); __uint128_t Desired = 
DesiredFunction(TmpActual >> (Alignment * 8), DesiredSrc); __uint128_t Expected = ExpectedFunction(TmpActual >> (Alignment * 8), ExpectedSrc); // Set up expected TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected << (Alignment * 8); // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired << (Alignment * 8); uint64_t TmpExpectedLower = TmpExpected; uint64_t TmpExpectedUpper = TmpExpected >> 64; uint64_t TmpDesiredLower = TmpDesired; uint64_t TmpDesiredUpper = TmpDesired >> 64; bool Tear = false; if (TmpExpected == TmpActual) { if (StoreCAS64(TmpExpectedUpper, TmpDesiredUpper, AddrUpper)) { if (StoreCAS64(TmpExpectedLower, TmpDesiredLower, Addr)) { // Stored successfully return Expected; } else { // CAS managed to tear, we can't really solve this // Continue down the path to let the guest know values weren't expected Tear = true; FEXCORE_TELEMETRY_SET(TYPE_CAS_64BIT_TEAR, 1); } } TmpExpected = TmpExpectedUpper; TmpExpected <<= 64; TmpExpected |= TmpExpectedLower; } else { // Mismatch up front TmpExpected = TmpActual; } // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint64_t FailedResult = FailedResultOurBits >> (Alignment * 8); if constexpr (Retry) { if (Tear) { // If we are retrying and tearing then we can't do anything here // XXX: Resolve with TME return FailedResult; } else { // We can retry safely } } else { // Without Retry (CAS) then we have failed 
regardless of tear // CAS failed but handled successfully return FailedResult; } } } else { // Fits within a 16byte region uint64_t Alignment = Addr & AlignmentMask; Addr &= ~AlignmentMask; auto Atomic128 = std::atomic_ref<__uint128_t>(*reinterpret_cast<__uint128_t*>(Addr)); __uint128_t Mask = ~0ULL; Mask <<= Alignment * 8; __uint128_t NegMask = ~Mask; __uint128_t TmpExpected {}; __uint128_t TmpDesired {}; while (1) { __uint128_t TmpActual = Atomic128.load(); __uint128_t Desired = DesiredFunction(TmpActual >> (Alignment * 8), DesiredSrc); __uint128_t Expected = ExpectedFunction(TmpActual >> (Alignment * 8), ExpectedSrc); // Set up expected TmpExpected = TmpActual; TmpExpected &= NegMask; TmpExpected |= Expected << (Alignment * 8); // Set up desired TmpDesired = TmpExpected; TmpDesired &= NegMask; TmpDesired |= Desired << (Alignment * 8); bool CASResult = Atomic128.compare_exchange_strong(TmpExpected, TmpDesired); if (CASResult) { // Stored successfully return Expected; } else { if constexpr (Retry) { // If we failed but we have enabled retry then just retry without checking results // CAS can't retry but atomic memory ops need to retry until passing continue; } // Not successful // Now we need to check the results to see if we need to try again __uint128_t FailedResultOurBits = TmpExpected & Mask; __uint128_t FailedResultNotOurBits = TmpExpected & NegMask; __uint128_t FailedDesiredNotOurBits = TmpDesired & NegMask; if ((FailedResultNotOurBits ^ FailedDesiredNotOurBits) != 0) { // If the bits changed that weren't part of our regular CAS then we need to try again continue; } // This happens in the case that between Load and CAS that something has store our desired in to the memory location // This means our CAS fails because what we wanted to store was already stored uint64_t FailedResult = FailedResultOurBits >> (Alignment * 8); // CAS failed but handled successfully return FailedResult; } } } } static std::optional DoCAS(uint32_t Size, uint64_t Desired, uint64_t 
Expected, uint64_t Addr, uint32_t* StrictSplitLockMutex) { // Cross-cacheline CAS doesn't work on ARM // It isn't even guaranteed to work on x86 // Intel will do a "split lock" which locks the full bus // AMD will tear instead // Both cross-cacheline and cross 16byte both need dual CAS loops that can tear // ARMv8.4 LSE2 solves all atomic issues except cross-cacheline // ARM's TME extension solves the cross-cacheline problem // 8bit can't be unaligned // Only need to handle 16, 32, 64 if (Size == 2) { auto Res = DoCAS16( Desired, Expected, Addr, [](uint16_t, uint16_t Expected) -> uint16_t { // Expected is just Expected return Expected; }, [](uint16_t, uint16_t Desired) -> uint16_t { // Desired is just Desired return Desired; }, StrictSplitLockMutex); return Res; } else if (Size == 4) { auto Res = DoCAS32( Desired, Expected, Addr, [](uint32_t, uint32_t Expected) -> uint32_t { // Expected is just Expected return Expected; }, [](uint32_t, uint32_t Desired) -> uint32_t { // Desired is just Desired return Desired; }, StrictSplitLockMutex); return Res; } else if (Size == 8) { auto Res = DoCAS64( Desired, Expected, Addr, [](uint64_t, uint64_t Expected) -> uint64_t { // Expected is just Expected return Expected; }, [](uint64_t, uint64_t Desired) -> uint64_t { // Desired is just Desired return Desired; }, StrictSplitLockMutex); return Res; } return std::nullopt; } static bool RunCASAL(uint64_t* GPRs, uint32_t Size, uint32_t DesiredReg, uint32_t ExpectedReg, uint32_t AddressReg, uint32_t* StrictSplitLockMutex) { std::optional Res = DoCAS(Size, GPRs[DesiredReg], GPRs[ExpectedReg], GPRs[AddressReg], StrictSplitLockMutex); if (!Res.has_value()) { return false; } // Regardless of pass or fail // We set the result register if it isn't a zero register if (ExpectedReg != 31) { GPRs[ExpectedReg] = *Res; } return true; } static bool HandleCASAL(uint64_t* GPRs, uint32_t Instr, uint32_t* StrictSplitLockMutex) { uint32_t Size = 1 << (Instr >> 30); uint32_t DesiredReg = Instr & 0b11111; 
uint32_t ExpectedReg = (Instr >> 16) & 0b11111; uint32_t AddressReg = (Instr >> 5) & 0b11111; return RunCASAL(GPRs, Size, DesiredReg, ExpectedReg, AddressReg, StrictSplitLockMutex); } static bool HandleAtomicMemOp(uint32_t Instr, uint64_t* GPRs, uint32_t* StrictSplitLockMutex) { uint32_t Size = 1 << (Instr >> 30); uint32_t ResultReg = Instr & 0b11111; uint32_t SourceReg = (Instr >> 16) & 0b11111; uint32_t AddressReg = (Instr >> 5) & 0b11111; uint64_t Addr = GPRs[AddressReg]; uint8_t Op = (Instr >> 12) & 0xF; if (Size == 2) { auto NOPExpected = [](uint16_t SrcVal, uint16_t) -> uint16_t { return SrcVal; }; auto ADDDesired = [](uint16_t SrcVal, uint16_t Desired) -> uint16_t { return SrcVal + Desired; }; auto CLRDesired = [](uint16_t SrcVal, uint16_t Desired) -> uint16_t { return SrcVal & ~Desired; }; auto EORDesired = [](uint16_t SrcVal, uint16_t Desired) -> uint16_t { return SrcVal ^ Desired; }; auto SETDesired = [](uint16_t SrcVal, uint16_t Desired) -> uint16_t { return SrcVal | Desired; }; auto SWAPDesired = [](uint16_t SrcVal, uint16_t Desired) -> uint16_t { return Desired; }; CASDesiredFn DesiredFunction {}; switch (Op) { case ATOMIC_ADD_OP: DesiredFunction = ADDDesired; break; case ATOMIC_CLR_OP: DesiredFunction = CLRDesired; break; case ATOMIC_EOR_OP: DesiredFunction = EORDesired; break; case ATOMIC_SET_OP: DesiredFunction = SETDesired; break; case ATOMIC_SWAP_OP: DesiredFunction = SWAPDesired; break; default: LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", Op); return false; } auto Res = DoCAS16(GPRs[SourceReg], 0, // Unused Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex); // If we passed and our destination register is not zero // Then we need to update the result register with what was in memory if (ResultReg != 31) { GPRs[ResultReg] = Res; } return true; } else if (Size == 4) { auto NOPExpected = [](uint32_t SrcVal, uint32_t) -> uint32_t { return SrcVal; }; auto ADDDesired = [](uint32_t SrcVal, uint32_t Desired) -> uint32_t { 
return SrcVal + Desired; }; auto CLRDesired = [](uint32_t SrcVal, uint32_t Desired) -> uint32_t { return SrcVal & ~Desired; }; auto EORDesired = [](uint32_t SrcVal, uint32_t Desired) -> uint32_t { return SrcVal ^ Desired; }; auto SETDesired = [](uint32_t SrcVal, uint32_t Desired) -> uint32_t { return SrcVal | Desired; }; auto SWAPDesired = [](uint32_t SrcVal, uint32_t Desired) -> uint32_t { return Desired; }; CASDesiredFn DesiredFunction {}; switch (Op) { case ATOMIC_ADD_OP: DesiredFunction = ADDDesired; break; case ATOMIC_CLR_OP: DesiredFunction = CLRDesired; break; case ATOMIC_EOR_OP: DesiredFunction = EORDesired; break; case ATOMIC_SET_OP: DesiredFunction = SETDesired; break; case ATOMIC_SWAP_OP: DesiredFunction = SWAPDesired; break; default: LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", Op); return false; } auto Res = DoCAS32(GPRs[SourceReg], 0, // Unused Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex); // If we passed and our destination register is not zero // Then we need to update the result register with what was in memory if (ResultReg != 31) { GPRs[ResultReg] = Res; } return true; } else if (Size == 8) { auto NOPExpected = [](uint64_t SrcVal, uint64_t) -> uint64_t { return SrcVal; }; auto ADDDesired = [](uint64_t SrcVal, uint64_t Desired) -> uint64_t { return SrcVal + Desired; }; auto CLRDesired = [](uint64_t SrcVal, uint64_t Desired) -> uint64_t { return SrcVal & ~Desired; }; auto EORDesired = [](uint64_t SrcVal, uint64_t Desired) -> uint64_t { return SrcVal ^ Desired; }; auto SETDesired = [](uint64_t SrcVal, uint64_t Desired) -> uint64_t { return SrcVal | Desired; }; auto SWAPDesired = [](uint64_t SrcVal, uint64_t Desired) -> uint64_t { return Desired; }; CASDesiredFn DesiredFunction {}; switch (Op) { case ATOMIC_ADD_OP: DesiredFunction = ADDDesired; break; case ATOMIC_CLR_OP: DesiredFunction = CLRDesired; break; case ATOMIC_EOR_OP: DesiredFunction = EORDesired; break; case ATOMIC_SET_OP: DesiredFunction = SETDesired; 
break; case ATOMIC_SWAP_OP: DesiredFunction = SWAPDesired; break; default: LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", Op); return false; } auto Res = DoCAS64(GPRs[SourceReg], 0, // Unused Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex); // If we passed and our destination register is not zero // Then we need to update the result register with what was in memory if (ResultReg != 31) { GPRs[ResultReg] = Res; } return true; } return false; } static bool HandleAtomicLoad(uint32_t Instr, uint64_t* GPRs, int64_t Offset, Core::UnalignedExclusiveStore* Store = nullptr) { uint32_t Size = 1 << (Instr >> 30); uint32_t ResultReg = Instr & 0b11111; uint32_t AddressReg = (Instr >> 5) & 0b11111; uint64_t Addr = GPRs[AddressReg] + Offset; uint64_t Res; if (Size == 2) { Res = DoLoad16(Addr); // We set the result register if it isn't a zero register if (ResultReg != 31) { GPRs[ResultReg] = Res; } } else if (Size == 4) { Res = DoLoad32(Addr); // We set the result register if it isn't a zero register if (ResultReg != 31) { GPRs[ResultReg] = Res; } } else if (Size == 8) { Res = DoLoad64(Addr); // We set the result register if it isn't a zero register if (ResultReg != 31) { GPRs[ResultReg] = Res; } } else { return false; } if (Store) { Store->Addr = Addr; Store->Store = Res; Store->Size = Size; } return true; } static bool HandleAtomicStore(uint32_t Instr, uint64_t* GPRs, int64_t Offset, uint32_t* StrictSplitLockMutex) { uint32_t Size = 1 << (Instr >> 30); uint32_t DataReg = Instr & 0x1F; uint32_t AddressReg = (Instr >> 5) & 0b11111; uint64_t Addr = GPRs[AddressReg] + Offset; constexpr bool DoRetry = false; if (Size == 2) { DoCAS16( GPRs[DataReg], 0, // Unused Addr, [](uint16_t SrcVal, uint16_t) -> uint16_t { // Expected is just src return SrcVal; }, [](uint16_t, uint16_t Desired) -> uint16_t { // Desired is just Desired return Desired; }, StrictSplitLockMutex); return true; } else if (Size == 4) { DoCAS32( GPRs[DataReg], 0, // Unused Addr, [](uint32_t 
SrcVal, uint32_t) -> uint32_t {
        // Expected is just src
        return SrcVal;
      },
      [](uint32_t, uint32_t Desired) -> uint32_t {
        // Desired is just Desired
        return Desired;
      },
      StrictSplitLockMutex);
    return true;
  } else if (Size == 8) {
    DoCAS64(
      GPRs[DataReg],
      0, // Unused
      Addr,
      [](uint64_t SrcVal, uint64_t) -> uint64_t {
        // Expected is just src
        return SrcVal;
      },
      [](uint64_t, uint64_t Desired) -> uint64_t {
        // Desired is just Desired
        return Desired;
      },
      StrictSplitLockMutex);
    return true;
  }
  return false;
}

// Emulates an ARMv8.0 compare-and-swap sequence (load-exclusive / cmp / store-exclusive loop)
// whose load-exclusive at `ProgramCounter` faulted.
//
// The instructions following the load are scanned to recover which register holds the desired
// value (from the store-exclusive) and which holds the expected value (from the CMP). The expected
// value is then copied in to the load's destination register and the operation is delegated to
// RunCASAL.
//
// Returns the number of bytes the caller should advance the PC by (7 instructions) when RunCASAL
// succeeds, or 0 when it does not.
static uint64_t HandleCAS_NoAtomics(uintptr_t ProgramCounter, uint64_t* GPRs, uint32_t* StrictSplitLockMutex) {
  // ARMv8.0 CAS
  // [1] ldaxrb(TMP2.W(), MemOperand(MemSrc))
  // [2] cmp (TMP2.W(), Expected.W())
  // [3] b
  // [4] stlxrb(TMP3.W(), Desired.W(), MemOperand(MemSrc)
  // [5] cbnz
  // [6] mov
  // [7] b
  // [8] mov (.., TMP2.W());
  // [9] clrex
  uint32_t* PC = (uint32_t*)ProgramCounter;
  uint32_t Instr = PC[0];
  // Top two bits of the instruction encode log2 of the access size in bytes.
  uint32_t Size = 1 << (Instr >> 30);
  uint32_t AddressReg = GetRnReg(Instr);
  uint32_t ResultReg = GetRdReg(Instr); // TMP2
  uint32_t DesiredReg = 0;
  uint32_t ExpectedReg = 0;
  // Look at the next five instructions for the store-exclusive and the compare.
  for (size_t i = 1; i < 6; ++i) {
    uint32_t NextInstr = PC[i];
    if ((NextInstr & ArchHelpers::Arm64::STLXR_MASK) == ArchHelpers::Arm64::STLXR_INST) {
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
      // Just double check that the memory destination matches
      const uint32_t StoreAddressReg = GetRnReg(NextInstr);
      LOGMAN_THROW_A_FMT(StoreAddressReg == AddressReg, "StoreExclusive memory register didn't match the store exclusive register");
#endif
      DesiredReg = GetRdReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_INST ||
               (NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_SHIFT_INST) {
      ExpectedReg = GetRmReg(NextInstr);
    }
  }
  // set up CASAL by doing mov(TMP2, Expected)
  GPRs[ResultReg] = GPRs[ExpectedReg];
  if (RunCASAL(GPRs, Size, DesiredReg, ResultReg, AddressReg, StrictSplitLockMutex)) {
    return 7 * sizeof(uint32_t); // jump to mov to allocated register
  } else {
    return 0;
  }
}

// Emulates a faulting load-exclusive / [ALU op] / store-exclusive / cbnz loop as a single
// compare-and-swap based read-modify-write.
//
// The instructions after the load-exclusive at `ProgramCounter` are decoded to determine the
// operation (add, sub, neg, and, bic, or, orn, eor, eon or plain swap), the register supplying the
// operand, and whether the original loaded value must be written back (the "fetch" variants).
// A CMP in the sequence means this is an ARMv8.0 CAS and is forwarded to HandleCAS_NoAtomics.
//
// Returns the number of bytes to skip so execution resumes after the CBNZ. Returns 0 when no CBNZ
// was found or the decoded operation is not handled.
//
// NOTE(review): template argument lists appear to be missing from this copy of the source (the
// lambdas below name `AtomicType` before any visible declaration, `CASDesiredFn` has no arguments).
// The tokens are left exactly as found.
static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_t* GPRs, uint32_t* StrictSplitLockMutex) {
  uint32_t* PC = (uint32_t*)ProgramCounter;
  uint32_t Instr = PC[0];
  // Atomic Add
  // [1] ldaxrb(TMP2.W(), MemOperand(MemSrc));
  // [2] add(TMP2.W(), TMP2.W(), GetReg(Op->Header.Args[1].ID()));
  // [3] stlxrb(TMP2.W(), TMP2.W(), MemOperand(MemSrc));
  // [4] cbnz(TMP2.W(), &LoopTop);
  //
  // Atomic Fetch Add
  // [1] ldaxrb(TMP2.W(), MemOperand(MemSrc));
  // [2] add(TMP3.W(), TMP2.W(), GetReg(Op->Header.Args[1].ID()));
  // [3] stlxrb(TMP4.W(), TMP3.W(), MemOperand(MemSrc));
  // [4] cbnz(TMP4.W(), &LoopTop);
  // [5] mov(GetReg(Node), TMP2.W());
  //
  // Atomic Swap
  //
  // [1] ldaxrb(TMP2.W(), MemOperand(MemSrc));
  // [2] stlxrb(TMP4.W(), GetReg(Op->Header.Args[1].ID()), MemOperand(MemSrc));
  // [3] cbnz(TMP4.W(), &LoopTop);
  // [4] uxtb(GetReg(Node), TMP2.W());
  //
  // ASSUMPTIONS:
  // - Both cases:
  //   - The [2]ALU op: (Non NEG case)
  //     - First source is from [1]ldaxr
  //     - Second source is incoming value
  //   - The [2]ALU op: (NEG case)
  //     - First source is zero register
  //     - The second source is the from [1]ldaxr
  //   - No ALU op: (SWAP case)
  //     - No DataSourceRegister
  //
  // - In Atomic case (non-fetch)
  //   - The [3]stlxr instruction status + memory register are the SAME register
  //
  // - In Atomic FETCH case
  //   - The [3]stlxr instruction's status + memory register are never the same register
  //   - The [5]mov instruction source is always the destination register from [1] ldaxr*
  uint32_t ResultReg = GetRdReg(Instr);
  uint32_t AddressReg = GetRnReg(Instr);
  uint64_t Addr = GPRs[AddressReg];
  size_t NumInstructionsToSkip = 0;
  // Are we an Atomic op or AtomicFetch?
  bool AtomicFetch = false;
  // This is the register that is the incoming source to the ALU operation
  // =
  // NEG case is special
  // = Zero
  // DataSourceRegister must always be the Rm register
  uint32_t DataSourceReg {};
  // Stays TYPE_SWAP unless an ALU instruction is found before the store-exclusive.
  ExclusiveAtomicPairType AtomicOp {ExclusiveAtomicPairType::TYPE_SWAP};
  // Scan forward at most five instructions to find our instructions
  for (size_t i = 1; i < 6; ++i) {
    uint32_t NextInstr = PC[i];
    if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::ADD_INST ||
        (NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::ADD_SHIFT_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_ADD;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::SUB_INST ||
               (NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::SUB_SHIFT_INST) {
      uint32_t RnReg = GetRnReg(NextInstr);
      if (RnReg == REGISTER_MASK) {
        // Zero reg means neg
        AtomicOp = ExclusiveAtomicPairType::TYPE_NEG;
      } else {
        AtomicOp = ExclusiveAtomicPairType::TYPE_SUB;
      }
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_INST ||
               (NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::CMP_SHIFT_INST) {
      return HandleCAS_NoAtomics(ProgramCounter, GPRs, StrictSplitLockMutex); // ARMv8.0 CAS
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::AND_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_AND;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::BIC_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_BIC;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::OR_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_OR;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::ORN_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_ORN;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::EOR_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_EOR;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::ALU_OP_MASK) == ArchHelpers::Arm64::EON_INST) {
      AtomicOp = ExclusiveAtomicPairType::TYPE_EON;
      DataSourceReg = GetRmReg(NextInstr);
    } else if ((NextInstr & ArchHelpers::Arm64::STLXR_MASK) == ArchHelpers::Arm64::STLXR_INST) {
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
      // Just double check that the memory destination matches
      const uint32_t StoreAddressReg = GetRnReg(NextInstr);
      LOGMAN_THROW_A_FMT(StoreAddressReg == AddressReg, "StoreExclusive memory register didn't match the store exclusive register");
#endif
      uint32_t StatusReg = GetRmReg(NextInstr);
      uint32_t StoreResultReg = GetRdReg(NextInstr);
      // We are an atomic fetch instruction if the data register isn't the status register
      AtomicFetch = !(StatusReg == StoreResultReg);
      if (AtomicOp == ExclusiveAtomicPairType::TYPE_SWAP) {
        // In the case of swap we don't have an ALU op inbetween
        // Source is directly in STLXR
        DataSourceReg = StoreResultReg;
      }
    } else if ((NextInstr & ArchHelpers::Arm64::CBNZ_MASK) == ArchHelpers::Arm64::CBNZ_INST) {
      // Found the CBNZ, we want to skip to just after this instruction when done
      NumInstructionsToSkip = i + 1;
      // This is the last instruction we care about. Leave now
      break;
    } else {
      LogMan::Msg::AFmt("Unknown instruction 0x{:08x}", NextInstr);
    }
  }
  // Top two bits of the load-exclusive encode log2 of the access size in bytes.
  uint32_t Size = 1 << (Instr >> 30);
  constexpr bool DoRetry = true;
  // "Expected" callback: accept whatever value is currently in memory.
  auto NOPExpected = [](AtomicType SrcVal, AtomicType) -> AtomicType {
    return SrcVal;
  };
  // "Desired" callbacks: compute the value to store from the memory value (SrcVal) and the operand (Desired).
  auto ADDDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal + Desired;
  };
  auto SUBDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal - Desired;
  };
  auto ANDDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal & Desired;
  };
  auto BICDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal & ~Desired;
  };
  auto ORDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal | Desired;
  };
  auto ORNDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal | ~Desired;
  };
  auto EORDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal ^ Desired;
  };
  auto EONDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return SrcVal ^ ~Desired;
  };
  auto NEGDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return -SrcVal;
  };
  auto SWAPDesired = [](AtomicType SrcVal, AtomicType Desired) -> AtomicType {
    return Desired;
  };
  // The three size branches below are identical apart from the element type and DoCAS helper used.
  if (Size == 2) {
    using AtomicType = uint16_t;
    CASDesiredFn DesiredFunction {};
    switch (AtomicOp) {
    case ExclusiveAtomicPairType::TYPE_SWAP: DesiredFunction = SWAPDesired; break;
    case ExclusiveAtomicPairType::TYPE_ADD: DesiredFunction = ADDDesired; break;
    case ExclusiveAtomicPairType::TYPE_SUB: DesiredFunction = SUBDesired; break;
    case ExclusiveAtomicPairType::TYPE_AND: DesiredFunction = ANDDesired; break;
    case ExclusiveAtomicPairType::TYPE_BIC: DesiredFunction = BICDesired; break;
    case ExclusiveAtomicPairType::TYPE_OR: DesiredFunction = ORDesired; break;
    case ExclusiveAtomicPairType::TYPE_ORN: DesiredFunction = ORNDesired; break;
    case ExclusiveAtomicPairType::TYPE_EOR: DesiredFunction = EORDesired; break;
    case ExclusiveAtomicPairType::TYPE_EON: DesiredFunction = EONDesired; break;
    case ExclusiveAtomicPairType::TYPE_NEG: DesiredFunction = NEGDesired; break;
    default:
      LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", FEXCore::ToUnderlying(AtomicOp));
      // Converts to 0 in this uint64_t-returning function, i.e. "nothing skipped".
      return false;
    }
    auto Res = DoCAS16(GPRs[DataSourceReg],
                       0, // Unused
                       Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex);
    if (AtomicFetch && ResultReg != 31) {
      // On atomic fetch then we store the resulting value back in to the loadacquire destination register
      // We want the memory value BEFORE the ALU op
      GPRs[ResultReg] = Res;
    }
  } else if (Size == 4) {
    using AtomicType = uint32_t;
    CASDesiredFn DesiredFunction {};
    switch (AtomicOp) {
    case ExclusiveAtomicPairType::TYPE_SWAP: DesiredFunction = SWAPDesired; break;
    case ExclusiveAtomicPairType::TYPE_ADD: DesiredFunction = ADDDesired; break;
    case ExclusiveAtomicPairType::TYPE_SUB: DesiredFunction = SUBDesired; break;
    case ExclusiveAtomicPairType::TYPE_AND: DesiredFunction = ANDDesired; break;
    case ExclusiveAtomicPairType::TYPE_BIC: DesiredFunction = BICDesired; break;
    case ExclusiveAtomicPairType::TYPE_OR: DesiredFunction = ORDesired; break;
    case ExclusiveAtomicPairType::TYPE_ORN: DesiredFunction = ORNDesired; break;
    case ExclusiveAtomicPairType::TYPE_EOR: DesiredFunction = EORDesired; break;
    case ExclusiveAtomicPairType::TYPE_EON: DesiredFunction = EONDesired; break;
    case ExclusiveAtomicPairType::TYPE_NEG: DesiredFunction = NEGDesired; break;
    default:
      LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", FEXCore::ToUnderlying(AtomicOp));
      // Converts to 0 in this uint64_t-returning function, i.e. "nothing skipped".
      return false;
    }
    auto Res = DoCAS32(GPRs[DataSourceReg],
                       0, // Unused
                       Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex);
    if (AtomicFetch && ResultReg != 31) {
      // On atomic fetch then we store the resulting value back in to the loadacquire destination register
      // We want the memory value BEFORE the ALU op
      GPRs[ResultReg] = Res;
    }
  } else if (Size == 8) {
    using AtomicType = uint64_t;
    CASDesiredFn DesiredFunction
{}; switch (AtomicOp) { case ExclusiveAtomicPairType::TYPE_SWAP: DesiredFunction = SWAPDesired; break; case ExclusiveAtomicPairType::TYPE_ADD: DesiredFunction = ADDDesired; break; case ExclusiveAtomicPairType::TYPE_SUB: DesiredFunction = SUBDesired; break; case ExclusiveAtomicPairType::TYPE_AND: DesiredFunction = ANDDesired; break; case ExclusiveAtomicPairType::TYPE_BIC: DesiredFunction = BICDesired; break; case ExclusiveAtomicPairType::TYPE_OR: DesiredFunction = ORDesired; break; case ExclusiveAtomicPairType::TYPE_ORN: DesiredFunction = ORNDesired; break; case ExclusiveAtomicPairType::TYPE_EOR: DesiredFunction = EORDesired; break; case ExclusiveAtomicPairType::TYPE_EON: DesiredFunction = EONDesired; break; case ExclusiveAtomicPairType::TYPE_NEG: DesiredFunction = NEGDesired; break; default: LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}", FEXCore::ToUnderlying(AtomicOp)); return false; } auto Res = DoCAS64(GPRs[DataSourceReg], 0, // Unused Addr, NOPExpected, DesiredFunction, StrictSplitLockMutex); if (AtomicFetch && ResultReg != 31) { // On atomic fetch then we store the resulting value back in to the loadacquire destination register // We want the memory value BEFORE the ALU op GPRs[ResultReg] = Res; } } // Multiply by 4 for number of bytes to skip return NumInstructionsToSkip * 4; } [[nodiscard]] std::optional HandleUnalignedAccess(FEXCore::Core::InternalThreadState* Thread, UnalignedHandlerType HandleType, uintptr_t ProgramCounter, uint64_t* GPRs, bool IsJIT) { #ifdef ARCHITECTURE_arm64 constexpr bool is_arm64 = true; #else constexpr bool is_arm64 = false; #endif if constexpr (!is_arm64) { return std::nullopt; } uint32_t* PC = (uint32_t*)ProgramCounter; uint32_t Instr = PC[0]; // 1 = 16bit // 2 = 32bit // 3 = 64bit uint32_t Size = (Instr & 0xC000'0000) >> 30; uint32_t AddrReg = (Instr >> 5) & 0x1F; uint32_t DataReg = Instr & 0x1F; auto CTX = static_cast(Thread->CTX); uint32_t* StrictSplitLockMutex {CTX->Config.StrictInProcessSplitLocks ? 
&CTX->StrictSplitLockMutex : nullptr}; if (!IsJIT) [[unlikely]] { if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, 0)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS LDAR*: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & LDAXR_MASK) == STLR_INST) { // STLR* if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, 0, StrictSplitLockMutex)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS STLR*: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR* // Extract the 9-bit offset from the instruction int32_t Offset = static_cast(Instr) << 11 >> 23; if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, Offset)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS LDAPUR*: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR* // Extract the 9-bit offset from the instruction int32_t Offset = static_cast(Instr) << 11 >> 23; if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, Offset, StrictSplitLockMutex)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS LDLUR*: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & ArchHelpers::Arm64::LDAXR_MASK) == ArchHelpers::Arm64::LDAXR_INST) { // LDAXR* if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, 0, &Thread->ExclusiveStore)) { return 4; } } else if ((Instr & ArchHelpers::Arm64::STLXR_MASK) == ArchHelpers::Arm64::STLXR_INST) { // STLXR* uint32_t StatusReg = Instr << 11 >> 27; // // Emulate exclusive store by validating the address and value against the last unaligned LDAXR*. 
if (GPRs[AddrReg] != Thread->ExclusiveStore.Addr || Size > Thread->ExclusiveStore.Size) { if (StatusReg != 31) { GPRs[StatusReg] = 1; } return 4; } if (std::optional Prev = DoCAS(Size, DataReg == 31 ? 0 : GPRs[DataReg], Thread->ExclusiveStore.Store, GPRs[AddrReg], StrictSplitLockMutex)) { if (StatusReg != 31) { GPRs[StatusReg] = !!memcmp(&Thread->ExclusiveStore.Store, &*Prev, Size); } Thread->ExclusiveStore.Size = 0; return 4; } } return 0; } const auto Frame = Thread->CurrentFrame; const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; auto InlineHeader = reinterpret_cast(BlockBegin); auto InlineTail = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail); // Check some instructions first that don't do any backpatching. if ((Instr & ArchHelpers::Arm64::CASPAL_MASK) == ArchHelpers::Arm64::CASPAL_INST) { // CASPAL if (ArchHelpers::Arm64::HandleCASPAL(Instr, GPRs, StrictSplitLockMutex)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS CASPAL: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & ArchHelpers::Arm64::CASAL_MASK) == ArchHelpers::Arm64::CASAL_INST) { // CASAL if (ArchHelpers::Arm64::HandleCASAL(GPRs, Instr, StrictSplitLockMutex)) { // Skip this instruction now return 4; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS CASAL: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* (Instr & LDAXR_MASK) == LDAPR_INST || // LDAPR* (Instr & LDAXR_MASK) == STLR_INST) { // STLR* // This must fall through to the spin-lock implementation below. // This mask has a partial overlap with ATOMIC_MEM_INST so we need to check this here. 
} else if ((Instr & ArchHelpers::Arm64::ATOMIC_MEM_MASK) == ArchHelpers::Arm64::ATOMIC_MEM_INST) { // Atomic memory op if (ArchHelpers::Arm64::HandleAtomicMemOp(Instr, GPRs, StrictSplitLockMutex)) { // Skip this instruction now return 4; } else { uint8_t Op = (PC[0] >> 12) & 0xF; LogMan::Msg::EFmt("Unhandled JIT SIGBUS Atomic mem op 0x{:02x}: PC: 0x{:x} Instruction: 0x{:08x}\n", Op, ProgramCounter, PC[0]); return std::nullopt; } } else if ((Instr & ArchHelpers::Arm64::LDAXR_MASK) == ArchHelpers::Arm64::LDAXR_INST) { // LDAXR* uint64_t BytesToSkip = ArchHelpers::Arm64::HandleAtomicLoadstoreExclusive(ProgramCounter, GPRs, StrictSplitLockMutex); if (BytesToSkip) { // Skip this instruction now return BytesToSkip; } // Explicit fallthrough to the backpatch handler below! } else if ((Instr & ArchHelpers::Arm64::LDAXP_MASK) == ArchHelpers::Arm64::LDAXP_INST) { // LDAXP // Should be compare and swap pair only. LDAXP not used elsewhere uint64_t BytesToSkip = ArchHelpers::Arm64::HandleCASPAL_ARMv8(Instr, ProgramCounter, GPRs, StrictSplitLockMutex); if (BytesToSkip) { // Skip this instruction now return BytesToSkip; } else { LogMan::Msg::EFmt("Unhandled JIT SIGBUS CASPAL: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]); return std::nullopt; } } // Lock code mutex during any SIGBUS handling that potentially changes code. // Due to code buffer sharing between threads, code must be carefully backpatched from last to first. // Multiple threads can be attempting to handle the SIGBUS or even be executing the code being backpatched. FEXCore::Utils::SpinWaitLock::UniqueSpinMutex lk(&InlineTail->SpinLockFutex); if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* uint32_t LDR = LDR_INST; LDR |= Size << 30; LDR |= AddrReg << 5; LDR |= DataReg; if (HandleType != UnalignedHandlerType::NonAtomic) { // Ordering matters with cross-thread visibility! 
std::atomic_ref(PC[1]).store(DMB_LD, std::memory_order_release); // Back-patch the half-barrier. } std::atomic_ref(PC[0]).store(LDR, std::memory_order_release); ClearICache(&PC[0], 8); // With the instruction modified, now execute again. return 0; } else if ((Instr & LDAXR_MASK) == STLR_INST) { // STLR* uint32_t STR = STR_INST; STR |= Size << 30; STR |= AddrReg << 5; STR |= DataReg; if (HandleType != UnalignedHandlerType::NonAtomic) { std::atomic_ref(PC[-1]).store(DMB, std::memory_order_release); // Back-patch the half-barrier. } std::atomic_ref(PC[0]).store(STR, std::memory_order_release); ClearICache(&PC[-1], 8); // Back up one instruction and have another go return -4; } else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR* // Extract the 9-bit offset from the instruction uint32_t LDUR = LDUR_INST; LDUR |= Size << 30; LDUR |= AddrReg << 5; LDUR |= DataReg; LDUR |= Instr & (0b1'1111'1111 << 12); if (HandleType != UnalignedHandlerType::NonAtomic) { // Ordering matters with cross-thread visibility! std::atomic_ref(PC[1]).store(DMB_LD, std::memory_order_release); // Back-patch the half-barrier. } std::atomic_ref(PC[0]).store(LDUR, std::memory_order_release); ClearICache(&PC[0], 8); // With the instruction modified, now execute again. return 0; } else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR* uint32_t STUR = STUR_INST; STUR |= Size << 30; STUR |= AddrReg << 5; STUR |= DataReg; STUR |= Instr & (0b1'1111'1111 << 12); if (HandleType != UnalignedHandlerType::NonAtomic) { std::atomic_ref(PC[-1]).store(DMB, std::memory_order_release); // Back-patch the half-barrier. 
} std::atomic_ref(PC[0]).store(STUR, std::memory_order_release); ClearICache(&PC[-1], 8); // Back up one instruction and have another go return -4; } // Check if another thread backpatched this instruction before this thread got here // Since we got here, this can happen in a couple situations: // - Unhandled instruction (Shouldn't occur, FEX programmer error added a new unhandled atomic) // - Another thread backpatched an atomic access to be a non-atomic access auto AtomicInst = std::atomic_ref(PC[0]).load(std::memory_order_acquire); if ((AtomicInst & LDSTREGISTER_MASK) == LDR_INST || (AtomicInst & LDSTUNSCALED_MASK) == LDUR_INST) { // This atomic instruction was backpatched to a load. if (HandleType != UnalignedHandlerType::NonAtomic) { // Check if the next instruction is a DMB. auto DMBInst = std::atomic_ref(PC[1]).load(std::memory_order_acquire); if (DMBInst == DMB_LD) { return 0; } } else { // No DMB instruction with this HandleType. return 0; } } else if ((AtomicInst & LDSTREGISTER_MASK) == STR_INST || (AtomicInst & LDSTUNSCALED_MASK) == STUR_INST) { if (HandleType != UnalignedHandlerType::NonAtomic) { // Check if the previous instruction is a DMB. auto DMBInst = std::atomic_ref(PC[-1]).load(std::memory_order_acquire); if (DMBInst == DMB) { // Return handled, make sure to adjust PC so we run the DMB. return -4; } } else { // No DMB instruction with this HandleType. return 0; } } else if (AtomicInst == DMB) { // ARMv8.0-a LDAXP backpatch handling. Will have turned in to the following: // - PC[0] = DMB // - PC[1] = STP // - PC[2] = DMB auto STPInst = std::atomic_ref(PC[1]).load(std::memory_order_acquire); auto DMBInst = std::atomic_ref(PC[2]).load(std::memory_order_acquire); if ((STPInst & LDSTP_MASK) == STP_INST && DMBInst == DMB) { // Code that was backpatched is what was expected for ARMv8.0-a LDAXP. 
return 0;
    }
  }
  // Nothing above recognised the instruction, nor a completed backpatch from another thread.
  LogMan::Msg::EFmt("Unhandled JIT SIGBUS: PC: 0x{:x} Instruction: 0x{:08x}\n", ProgramCounter, PC[0]);
  return std::nullopt;
}
} // namespace FEXCore::ArchHelpers::Arm64

================================================
FILE: FEXCore/Source/Utils/ArchHelpers/Arm64_stubs.cpp
================================================
// SPDX-License-Identifier: MIT
#include
#include
#include
namespace FEXCore::ArchHelpers::Arm64 {
#ifndef ARCHITECTURE_arm64
// These are stub implementations that exist only to allow instantiating the arm64 jit
// on non arm platforms.
// Obviously such a configuration can't do the actual arm64-specific stuff
//
// NOTE(review): this stub takes four parameters and its message names HandleAtomicMemOp, while the
// arm64 implementation of HandleUnalignedAccess also takes `bool IsJIT` - confirm against the header.
std::optional HandleUnalignedAccess(FEXCore::Core::InternalThreadState* Thread, UnalignedHandlerType HandleType, uintptr_t ProgramCounter,
                                    uint64_t* GPRs) {
  ERROR_AND_DIE_FMT("HandleAtomicMemOp Not Implemented");
}
#endif
} // namespace FEXCore::ArchHelpers::Arm64

================================================
FILE: FEXCore/Source/Utils/BucketList.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include
#include
#include
namespace FEXCore {
// BucketList is an optimized container, it includes an inline array of Size
// and can overflow to a linked list of further buckets
//
// To optimize for best performance, Size should be big enough to allocate one or two
// buckets for the typical case
// Picking a Size so sizeof(Bucket<...>) is a power of two is also a small win
//
// A default-constructed T ("T {}") marks the end of the stored items, so that value itself can
// not be stored as an element.
template struct BucketList {
  static constexpr size_t Size = _Size;
  T Items[Size];
  // Overflow bucket; allocated by Append() once the last slot of this bucket is used.
  fextl::unique_ptr> Next;

  // Empties the list: writes the terminator in to slot 0 and releases every overflow bucket.
  void Clear() {
    Items[0] = T {};
#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
    // Poison the unused slots so stale reads are recognisable in assert builds.
    for (size_t i = 1; i < Size; i++) {
      Items[i] = T {0xDEADBEEF};
    }
#endif
    Next.reset();
  }

  BucketList() {
    Clear();
  }

  // Calls Enumerator(Item) for every stored item, in storage order, stopping at the terminator.
  template void Iterate(EnumeratorFn Enumerator) const {
    size_t i = 0;
    auto Bucket = this;
    while (true) {
      auto Item = Bucket->Items[i];
      if (Item == T {}) {
        break;
      }
      Enumerator(Item);
      if (++i == Size) {
        // A completely full bucket must always be followed by another bucket (see Append).
        LOGMAN_THROW_A_FMT(Bucket->Next != nullptr, "Interference bug");
        Bucket = Bucket->Next.get();
        i = 0;
      }
    }
  }

  // Calls Enumerator(Item) for each stored item until it returns true.
  // Returns true if some call returned true, false if the terminator was reached first.
  template bool Find(EnumeratorFn Enumerator) const {
    size_t i = 0;
    auto Bucket = this;
    while (true) {
      auto Item = Bucket->Items[i];
      if (Item == T {}) {
        break;
      }
      if (Enumerator(Item)) {
        return true;
      }
      if (++i == Size) {
        LOGMAN_THROW_A_FMT(Bucket->Next != nullptr, "Bucket in bad state");
        Bucket = Bucket->Next.get();
        i = 0;
      }
    }
    return false;
  }

  // Appends Val after the current last item.
  void Append(T Val) {
    // Only the final bucket in the chain can contain the terminator.
    auto that = this;
    while (that->Next) {
      that = that->Next.get();
    }
    size_t i;
    for (i = 0; i < Size; i++) {
      if (that->Items[i] == T {}) {
        that->Items[i] = Val;
        break;
      }
    }
    if (i < (Size - 1)) {
      // Re-establish the terminator directly after the new item.
      that->Items[i + 1] = T {};
    } else {
      // The bucket is now full; the terminator lives in slot 0 of a fresh bucket.
      that->Next = fextl::make_unique>();
    }
  }

  // Removes the first occurrence of Val by overwriting it with the last stored item, so the
  // relative order of the remaining items is not preserved. Val must be present.
  void Erase(T Val) {
    size_t i = 0;
    auto that = this;
    auto foundThat = this;
    size_t foundI = 0;
    // Phase 1: locate Val.
    while (true) {
      if (that->Items[i] == Val) {
        foundThat = that;
        foundI = i;
        break;
      } else if (++i == Size) {
        i = 0;
        LOGMAN_THROW_A_FMT(that->Next != nullptr, "Bucket::Erase but element not contained");
        that = that->Next.get();
      }
    }
    // Phase 2: continue to the terminator, then move the last item in to the freed slot.
    while (true) {
      if (that->Items[i] == T {}) {
        foundThat->Items[foundI] = that->Items[i - 1];
        that->Items[i - 1] = T {};
        break;
      } else if (++i == Size) {
        if (that->Next->Items[0] == T {}) {
          // The following bucket only holds the terminator: drop it and use this bucket's last slot.
          that->Next.reset();
          foundThat->Items[foundI] = that->Items[Size - 1];
          that->Items[Size - 1] = T {};
          break;
        }
        i = 0;
        that = that->Next.get();
      }
    }
  }
};
} // namespace FEXCore

================================================
FILE: FEXCore/Source/Utils/Config.h
================================================
// SPDX-License-Identifier: MIT
#include
namespace FEXCore::Config {
const fextl::string& GetTelemetryDirectory();
}

================================================
FILE: FEXCore/Source/Utils/FileLoading.cpp
================================================
// SPDX-License-Identifier: MIT
#include
#include
#include
#include
#include
#include
#include
#ifdef _WIN32
#include
#endif
namespace FEXCore::FileLoading {
#ifndef _WIN32
template
static bool LoadFileImpl(T& Data, const fextl::string& Filepath, size_t FixedSize) { int FD = open(Filepath.c_str(), O_RDONLY); if (FD == -1) { return false; } size_t FileSize {}; if (FixedSize == 0) { struct stat buf; if (fstat(FD, &buf) == 0) { FileSize = buf.st_size; } } else { FileSize = FixedSize; } ssize_t CurrentOffset = 0; ssize_t Read = -1; bool LoadedFile {}; if (FileSize) { // File size is known upfront Data.resize(FileSize); size_t Remaining = FileSize; while (CurrentOffset != FileSize && (Read = pread(FD, &Data.at(CurrentOffset), Remaining, CurrentOffset)) > 0) { CurrentOffset += Read; Remaining -= Read; } LoadedFile = CurrentOffset == FileSize && Read != -1; } else { // The file is either empty or its size is unknown (e.g. procfs data). // Try reading in chunks instead constexpr size_t READ_SIZE = 4096; Data.resize(READ_SIZE); while ((Read = pread(FD, &Data.at(CurrentOffset), READ_SIZE, CurrentOffset)) > 0) { CurrentOffset += Read; if ((CurrentOffset + READ_SIZE) > Data.size()) { Data.resize(CurrentOffset + READ_SIZE); } } if (Read == -1) { Data.clear(); close(FD); return false; } // Final resize to ensure there is no garbage data past the end. 
Data.resize(CurrentOffset + Read); LoadedFile = true; } close(FD); return LoadedFile; } ssize_t LoadFileToBuffer(const fextl::string& Filepath, std::span Buffer) { int FD = open(Filepath.c_str(), O_RDONLY); if (FD == -1) { return -1; } ssize_t Read = pread(FD, Buffer.data(), Buffer.size(), 0); close(FD); return Read; } #else template static bool LoadFileImpl(T& Data, const fextl::string& Filepath, size_t FixedSize) { std::ifstream f(Filepath.c_str(), std::ios::binary | std::ios::ate); if (f.fail()) { return false; } auto Size = f.tellg(); f.seekg(0, std::ios::beg); Data.resize(Size); f.read(Data.data(), Size); return !f.fail(); } ssize_t LoadFileToBuffer(const fextl::string& Filepath, std::span Buffer) { std::ifstream f(Filepath.c_str(), std::ios::binary | std::ios::ate); return f.readsome(Buffer.data(), Buffer.size()); } #endif bool LoadFile(fextl::vector& Data, const fextl::string& Filepath, size_t FixedSize) { return LoadFileImpl(Data, Filepath, FixedSize); } bool LoadFile(fextl::string& Data, const fextl::string& Filepath, size_t FixedSize) { return LoadFileImpl(Data, Filepath, FixedSize); } } // namespace FEXCore::FileLoading ================================================ FILE: FEXCore/Source/Utils/ForcedAssert.cpp ================================================ // SPDX-License-Identifier: MIT namespace FEXCore::Assert { // This function can not be inlined [[noreturn]] __attribute__((noinline, naked)) void ForcedAssert() { #ifdef ARCHITECTURE_x86_64 asm volatile("ud2"); #else asm volatile("hlt #1"); #endif } } // namespace FEXCore::Assert ================================================ FILE: FEXCore/Source/Utils/LogManager.cpp ================================================ // SPDX-License-Identifier: MIT /* $info$ tags: glue|log-manager $end_info$ */ #include #include #include namespace LogMan { namespace Throw { ThrowHandler Handler {}; void InstallHandler(ThrowHandler _Handler) { Handler = _Handler; } void UnInstallHandler() { Handler = nullptr; } void 
MFmt(const char* fmt, const fmt::format_args& args) {
  // Formats the message and passes it to the installed handler (if any), then always traps.
  if (Handler) {
    auto msg = fextl::fmt::vformat(fmt, args);
    Handler(msg.c_str());
  }
  FEX_TRAP_EXECUTION;
}
} // namespace Throw
namespace Msg {
// Currently installed handler for regular log messages; empty when none is installed.
MsgHandler Handler {};
void InstallHandler(MsgHandler _Handler) {
  Handler = _Handler;
}
void UnInstallHandler() {
  Handler = nullptr;
}
// Formats the message and passes it, with its level, to the installed handler.
// Without a handler the message is dropped and no formatting is done.
void MFmtImpl(DebugLevels level, const char* fmt, const fmt::format_args& args) {
  if (Handler) {
    const auto msg = fextl::fmt::vformat(fmt, args);
    Handler(level, msg.c_str());
  }
}
} // namespace Msg
} // namespace LogMan

================================================
FILE: FEXCore/Source/Utils/LongJump.cpp
================================================
// SPDX-License-Identifier: MIT
#include
#include
#include
namespace FEXCore::UncheckedLongJump {
#if defined(ARCHITECTURE_arm64)
// Saves x19-x30, d8-d15 and the stack pointer in to Buffer (21 64-bit slots) and returns 0.
// A later LongJump() on the same buffer makes this call appear to return a second time.
[[nodiscard]]
FEX_DEFAULT_VISIBILITY FEX_NAKED uint64_t SetJump(JumpBuf& Buffer) {
  __asm volatile(R"(
  // x0 contains the jumpbuffer
  stp x19, x20, [x0, #( 0 * 8)];
  stp x21, x22, [x0, #( 2 * 8)];
  stp x23, x24, [x0, #( 4 * 8)];
  stp x25, x26, [x0, #( 6 * 8)];
  stp x27, x28, [x0, #( 8 * 8)];
  stp x29, x30, [x0, #(10 * 8)];
  // FPRs
  stp d8, d9, [x0, #(12 * 8)];
  stp d10, d11, [x0, #(14 * 8)];
  stp d12, d13, [x0, #(16 * 8)];
  stp d14, d15, [x0, #(18 * 8)];
  // Move SP in to a temporary to store.
  mov x1, sp;
  str x1, [x0, #(20 * 8)];
  // Return zero to signify this is the SetJump.
  mov x0, #0;
  ret;
  )" ::
                   : "memory");
}

// Restores the registers and stack pointer saved by SetJump() from Buffer and returns to the saved
// link register with Value in x0.
[[noreturn]]
FEX_DEFAULT_VISIBILITY FEX_NAKED void LongJump(const JumpBuf& Buffer, uint64_t Value) {
  __asm volatile(R"(
  // x0 contains the jumpbuffer
  ldp x19, x20, [x0, #( 0 * 8)];
  ldp x21, x22, [x0, #( 2 * 8)];
  ldp x23, x24, [x0, #( 4 * 8)];
  ldp x25, x26, [x0, #( 6 * 8)];
  ldp x27, x28, [x0, #( 8 * 8)];
  ldp x29, x30, [x0, #(10 * 8)];
  // FPRs
  ldp d8, d9, [x0, #(12 * 8)];
  ldp d10, d11, [x0, #(14 * 8)];
  ldp d12, d13, [x0, #(16 * 8)];
  ldp d14, d15, [x0, #(18 * 8)];
  // Load SP in to temporary then move
  ldr x0, [x0, #(20 * 8)];
  mov sp, x0;
  // Move value in to result register
  mov x0, x1;
  ret;
  )" ::
                   : "memory");
}

// Applies the effect of LongJump(Buffer, Value) to a saved register context instead of the live CPU
// state: GPRs[19..30], the low 64 bits of FPRs[8..15], GPRs[31] (stack pointer), GPRs[0] and *PC
// are overwritten.
FEX_DEFAULT_VISIBILITY void ManuallyLoadJumpBuf(const JumpBuf& Buffer, uint64_t Value, uint64_t* GPRs, __uint128_t* FPRs, uint64_t* PC) {
  // First 12 values are registers [x19,x30].
  memcpy(&GPRs[19], &Buffer.Registers[0], sizeof(uint64_t) * 12);
  // Next 8 values are [D8,D15]
  // Retain upper 64-bits of the register, only modifying lower 64-bits.
  for (size_t i = 0; i < 8; ++i) {
    memcpy(&FPRs[8 + i], &Buffer.Registers[12 + i], sizeof(uint64_t));
  }
  // Last value is stack pointer
  memcpy(&GPRs[31], &Buffer.Registers[20], sizeof(uint64_t));
  // Load the expected value in to X0
  GPRs[0] = Value;
  // Load the PC with the current LR.
  *PC = GPRs[30];
}
#else
// Saves rbx, rsp, rbp, r12-r15 and the caller's return address in to Buffer (8 64-bit slots) and
// returns 0.
[[nodiscard]]
FEX_DEFAULT_VISIBILITY FEX_NAKED uint64_t SetJump(JumpBuf& Buffer) {
  __asm volatile(R"(
  .intel_syntax noprefix;
  // rdi contains the jumpbuffer
  mov [rdi + (0 * 8)], rbx;
  mov [rdi + (1 * 8)], rsp;
  mov [rdi + (2 * 8)], rbp;
  mov [rdi + (3 * 8)], r12;
  mov [rdi + (4 * 8)], r13;
  mov [rdi + (5 * 8)], r14;
  mov [rdi + (6 * 8)], r15;
  // Return address is on the stack, load it and store
  mov rsi, [rsp];
  mov [rdi + (7 * 8)], rsi;
  // Return zero to signify this is the SetJump.
mov rax, 0;
  ret;
  .att_syntax prefix;
  )" ::
                   : "memory");
}

// Restores the registers saved by SetJump() from Buffer and jumps to the saved return address with
// Value in rax.
[[noreturn]]
FEX_DEFAULT_VISIBILITY FEX_NAKED void LongJump(const JumpBuf& Buffer, uint64_t Value) {
  __asm volatile(R"(
  .intel_syntax noprefix;
  // rdi contains the jumpbuffer
  mov rbx, [rdi + (0 * 8)];
  mov rsp, [rdi + (1 * 8)];
  mov rbp, [rdi + (2 * 8)];
  mov r12, [rdi + (3 * 8)];
  mov r13, [rdi + (4 * 8)];
  mov r14, [rdi + (5 * 8)];
  mov r15, [rdi + (6 * 8)];
  // Move value in to result register
  mov rax, rsi;
  // Pop the dead return address off the stack
  pop rsi;
  // Load the original return address from the jumpbuffer
  mov rsi, [rdi + (7 * 8)];
  // Return using a jump
  jmp rsi;
  .att_syntax prefix;
  )" ::
                   : "memory");
}

// Not implemented for x86-64; only reports that fact through the assert-message macro.
// NOTE(review): Buffer is taken by non-const reference here but by const reference in the arm64
// variant - confirm which one the header declares.
FEX_DEFAULT_VISIBILITY void ManuallyLoadJumpBuf(JumpBuf& Buffer, uint64_t Value, uint64_t* GPRs, __uint128_t* FPRs, uint64_t* PC) {
  LOGMAN_MSG_A_FMT("This is unimplemented on x86-64");
}
#endif
} // namespace FEXCore::UncheckedLongJump

================================================
FILE: FEXCore/Source/Utils/MemberFunctionToPointer.h
================================================
// SPDX-License-Identifier: MIT
#pragma once
#include
#include
namespace FEXCore::Utils {
/**
 * @brief Casts a class's member function pointer to a raw pointer that we can JIT
 *
 * Has additional validation to ensure we aren't casting a class member that is invalid
 */
template class MemberFunctionToPointerCast final {
public:
  // Captures the raw two-word representation of the pointer-to-member.
  MemberFunctionToPointerCast(PointerToMemberType Function) {
    memcpy(&PMF, &Function, sizeof(PMF));
  }

  // Returns the code address of a non-virtual member function.
  // Asserts (in assert builds) if the representation looks like a virtual member.
  uintptr_t GetConvertedPointer() const {
#ifdef ARCHITECTURE_x86_64
    // Itanium C++ ABI (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#member-function-pointers)
    // Low bit of ptr specifies if this Member function pointer is virtual or not
    // Throw an assert if we were trying to cast a virtual member
    LOGMAN_THROW_A_FMT((PMF.ptr & 1) == 0, "C++ Pointer-To-Member representation didn't have low bit set to 0. Are you trying to cast a "
                                           "virtual member?");
#elif defined(ARCHITECTURE_arm64)
    // C++ ABI for the Arm 64-bit Architecture (IHI 0059E)
    // 4.2.1 Representation of pointer to member function
    // Differs from Itanium specification
    LOGMAN_THROW_A_FMT(PMF.adj == 0, "C++ Pointer-To-Member representation didn't have adj == 0. Are you trying to cast a virtual member?");
#else
#error "Don't know how to cast Member to function here. Likely just Itanium"
#endif
    return PMF.ptr;
  }

  // Gets the vtable entry position of a virtual member function.
  // The result is a byte offset in to the vtable (GetVTableEntry divides it by sizeof(void*)).
  size_t GetVTableOffset() const {
#ifdef ARCHITECTURE_x86_64
    // Itanium C++ ABI (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#member-function-pointers)
    // Low bit of ptr specifies if this Member function pointer is virtual or not
    // Throw an assert if we are not loading a virtual member.
    LOGMAN_THROW_A_FMT((PMF.ptr & 1) == 1, "C++ Pointer-To-Member representation didn't have low bit set to 1. This cast only works for "
                                           "virtual members.");
    // Strip the "is virtual" tag bit to leave the offset.
    return PMF.ptr & ~1ULL;
#elif defined(ARCHITECTURE_arm64)
    // C++ ABI for the Arm 64-bit Architecture (IHI 0059E)
    // 4.2.1 Representation of pointer to member function
    // Differs from Itanium specification
    LOGMAN_THROW_A_FMT((PMF.adj & 1) == 1, "C++ Pointer-To-Member representation didn't have adj == 1. This cast only works for virtual "
                                           "members.");
    return PMF.ptr;
#else
#error "Don't know how to cast Member to function here. Likely just Itanium"
#endif
  }

  // Gets the pointer to the vtable entry for the object passed it.
  template uintptr_t GetVTableEntry(Class* VirtualClass) const {
    // VTable is always stored at the beginning of a class object.
    uintptr_t* VTable = *reinterpret_cast(VirtualClass);
    size_t Offset = GetVTableOffset() / sizeof(void*);
    return VTable[Offset];
  }

private:
  // Two-word layout shared by the Itanium and AArch64 pointer-to-member-function representations.
  struct PointerToMember {
    uintptr_t ptr;
    uintptr_t adj;
  };
  PointerToMember PMF;
  // Ensure the representation of PointerToMember matches
  static_assert(sizeof(PMF) == sizeof(PointerToMemberType));
};
} // namespace FEXCore::Utils

================================================
FILE: FEXCore/Source/Utils/Profiler.cpp
================================================
// SPDX-License-Identifier: MIT
#include
#include
#ifndef _WIN32
#include
#include
#include
#endif
#include
#include
#include
#include
#include
#ifdef ENABLE_FEXCORE_PROFILER
#if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS
#include
#include
#include
#ifndef _WIN32
// Returns the CLOCK_MONOTONIC time in nanoseconds.
static inline uint64_t GetTime() {
  // We want the time in the least amount of overhead possible
  // clock_gettime will do a VDSO call with the least amount of overhead
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return ts.tv_sec * 1'000'000'000ULL + ts.tv_nsec;
}
#else
// Returns the performance counter scaled to nanoseconds.
static inline uint64_t GetTime() {
  // GetTime needs to return nanoseconds, query the interface.
  // The scale is computed on first use and cached for later calls.
  static uint64_t FrequencyScale = {};
  if (!FrequencyScale) [[unlikely]] {
    LARGE_INTEGER Frequency {};
    while (!QueryPerformanceFrequency(&Frequency))
      ;
    constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;
    // On WINE this will always result in a scale of 100.
    FrequencyScale = NanosecondsInSecond / Frequency.QuadPart;
  }
  LARGE_INTEGER ticks;
  while (!QueryPerformanceCounter(&ticks))
    ;
  return ticks.QuadPart * FrequencyScale;
}
#endif
namespace FEXCore::Profiler {
// Records the start time; the destructor emits the elapsed duration as a trace event.
ProfilerBlock::ProfilerBlock(const std::string_view Format)
  : DurationBegin {GetTime()}
  , Format {Format} {}

ProfilerBlock::~ProfilerBlock() {
  auto Duration = GetTime() - DurationBegin;
  TraceObject(Format, Duration);
}
} // namespace FEXCore::Profiler
namespace GPUVis {
// ftrace FD for writing trace data.
// Needs to be a raw FD since we hold this open for the entire application execution. static int TraceFD {-1}; // Need to search the paths to find the real trace path static std::array TraceFSDirectories { "/sys/kernel/tracing", "/sys/kernel/debug/tracing", }; void Init() { FEX_CONFIG_OPT(EnableGpuvisProfiling, ENABLEGPUVISPROFILING); if (!EnableGpuvisProfiling()) { return; } for (auto Path : TraceFSDirectories) { #ifdef _WIN32 constexpr auto flags = O_WRONLY; #else constexpr auto flags = O_WRONLY | O_CLOEXEC; #endif fextl::string FilePath = fextl::fmt::format("{}/trace_marker", Path); TraceFD = open(FilePath.c_str(), flags); if (TraceFD != -1) { // Opened TraceFD, early exit break; } } } void Shutdown() { if (TraceFD != -1) { close(TraceFD); TraceFD = -1; } } void TraceObject(const std::string_view Format, uint64_t Duration) { if (TraceFD != -1) { // Print the duration as something that began negative duration ago const auto StringSize = Format.size() + strlen(" (lduration=-)\n") + 22; auto Event = reinterpret_cast(alloca(StringSize)); auto Res = ::fmt::format_to_n(Event, StringSize, "{} (lduration=-{})\n", Format, Duration); write(TraceFD, Event, Res.size); } } void TraceObject(const std::string_view Format) { if (TraceFD != -1) { const auto StringSize = Format.size() + 1; auto Event = reinterpret_cast(alloca(StringSize)); auto Res = ::fmt::format_to_n(Event, StringSize, "{}\n", Format); write(TraceFD, Event, Res.size); } } } // namespace GPUVis #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY #include "tracy/Tracy.hpp" namespace Tracy { static int EnableAfterFork = 0; static bool Enable = false; void Init(std::string_view ProgramName, std::string_view ProgramPath) { const char* ProfileTargetName = getenv("FEX_PROFILE_TARGET_NAME"); // Match by application name const char* ProfileTargetPath = getenv("FEX_PROFILE_TARGET_PATH"); // Match by path suffix const char* WaitForFork = getenv("FEX_PROFILE_WAIT_FOR_FORK"); // Don't enable profiling until the 
process forks N times bool Matched = (ProfileTargetName && ProgramName == ProfileTargetName) || (ProfileTargetPath && ProgramPath.ends_with(ProfileTargetPath)); if (Matched && WaitForFork) { EnableAfterFork = std::atoi(WaitForFork); } Enable = Matched && !EnableAfterFork; if (Enable) { tracy::StartupProfiler(); LogMan::Msg::IFmt("Tracy profiling started"); } else if (EnableAfterFork) { LogMan::Msg::IFmt("Tracy profiling will start after fork"); } } void PostForkAction(bool IsChild) { if (Enable) { // Tracy does not support multiprocess profiling LogMan::Msg::EFmt("Warning: Profiling a process with forks is not supported. Set the environment variable " "FEX_PROFILE_WAIT_FOR_FORK= to start profiling after the n-th fork."); } if (IsChild) { Enable = false; return; } if (EnableAfterFork > 1) { --EnableAfterFork; LogMan::Msg::IFmt("Tracy profiling will start after {} forks", EnableAfterFork); } else if (EnableAfterFork == 1) { Enable = true; EnableAfterFork = 0; tracy::StartupProfiler(); LogMan::Msg::IFmt("Tracy profiling started"); } } void Shutdown() { if (Tracy::Enable) { LogMan::Msg::IFmt("Stopping Tracy profiling"); tracy::ShutdownProfiler(); } } void TraceObject(const std::string_view Format, uint64_t Duration) {} void TraceObject(const std::string_view Format) { if (Tracy::Enable) { TracyMessage(Format.data(), Format.size()); } } } // namespace Tracy #else #error Unknown profiler backend #endif #endif namespace FEXCore::Profiler { #ifdef ENABLE_FEXCORE_PROFILER void Init(std::string_view ProgramName, std::string_view ProgramPath) { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS GPUVis::Init(); #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY Tracy::Init(ProgramName, ProgramPath); #endif } void PostForkAction(bool IsChild) { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY Tracy::PostForkAction(IsChild); #endif } bool IsActive() { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS // Always active return 
true; #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY // Active if previously enabled return Tracy::Enable; #endif } void Shutdown() { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS GPUVis::Shutdown(); #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY Tracy::Shutdown(); #endif } void TraceObject(const std::string_view Format, uint64_t Duration) { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS GPUVis::TraceObject(Format, Duration); #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY Tracy::TraceObject(Format, Duration); #endif } void TraceObject(const std::string_view Format) { #if FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_GPUVIS GPUVis::TraceObject(Format); #elif FEXCORE_PROFILER_BACKEND == FEXCORE_PROFILER_BACKEND_TRACY Tracy::TraceObject(Format); #endif } #endif } // namespace FEXCore::Profiler ================================================ FILE: FEXCore/Source/Utils/SpinWaitLock.cpp ================================================ // SPDX-License-Identifier: MIT #include "Utils/SpinWaitLock.h" namespace FEXCore::Utils::SpinWaitLock { #ifdef ARCHITECTURE_arm64 constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL; static uint64_t GetCycleCounterFrequency() { uint64_t Result {}; __asm("mrs %[Res], CNTFRQ_EL0" : [Res] "=r"(Result)); return Result; } static uint64_t CalculateCyclesPerNanosecond() { // Snapdragon devices historically use a 19.2Mhz cycle counter frequency // This means that the number of cycles per nanosecond ends up being 52.0833... // // ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz. // This means the number of cycles per nanosecond ends up being 1. 
uint64_t CounterFrequency = GetCycleCounterFrequency(); return NanosecondsInSecond / CounterFrequency; } uint64_t CycleCounterFrequency = GetCycleCounterFrequency(); uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond(); #endif } // namespace FEXCore::Utils::SpinWaitLock ================================================ FILE: FEXCore/Source/Utils/SpinWaitLock.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include namespace FEXCore::Utils::SpinWaitLock { /** * @brief This provides routines to implement implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces. * * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact * by putting the CPU in to a lower-power state using WFE. * On platforms tested, WFE will put the CPU in to a lower power state for upwards of 0.11ms(!) per WFE. Which isn't a significant amount of * time but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the * added benefit that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits. * * This short timeout is because the Linux kernel has a 100 microsecond architecture timer which wakes up WFE and WFI. Nothing can be * improved beyond that period. * * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using. * * It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this * implementation is slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient. * * On non-ARM platforms it is truly a spin-loop, which is okay for debugging only. 
///< Reads the raw virtual counter (CNTVCT_EL0), preceded by an `isb` so the read is synchronizing.
/// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
static inline uint64_t GetCycleCounter() {
  uint64_t Counter {};
  __asm volatile("isb;\n"
                 "mrs %[Res], CNTVCT_EL0;\n"
                 : [Res] "=r"(Counter));
  return Counter;
}
///< Converts a nanosecond duration into a number of counter cycles.
/// The count is divided by the global `CyclesPerNanosecond` scale; with a scale of 1 this is a direct 1:1 map.
static inline uint64_t ConvertNanosecondsToCycles(const std::chrono::nanoseconds& Nanoseconds) {
  return Nanoseconds.count() / CyclesPerNanosecond;
}
// Makes a single attempt at waiting for `(*Futex & Mask) == Comp` and returns the last value read.
//
// Steps:
// 1. Plain atomic load; return immediately if the masked value already matches.
// 2. LoadExclusive() re-reads the value; return if it matches now.
// 3. WFELoadAtomic() is called exactly once, and whatever it reads is returned.
//
// The returned value is NOT guaranteed to satisfy the comparison; callers must re-check and loop if needed.
// NOTE(review): the template parameter list is not visible in this copy of the file - presumably a single type `T`; confirm.
template static inline T OneShotWFEBitComparison(T* Futex, T Mask, T Comp) {
  auto AtomicFutex = std::atomic_ref(*Futex);
  T Result = AtomicFutex.load();

  // Early exit if possible.
  if ((Result & Mask) == Comp) {
    return Result;
  }

  // Second look through the load-exclusive helper before committing to a wait.
  Result = LoadExclusive(Futex);
  if ((Result & Mask) == Comp) {
    return Result;
  }

  // Waits for write and returns result.
  Result = WFELoadAtomic(Futex);
  return Result;
}
// Acquires the spin-lock stored in `*Futex`, blocking until it is obtained.
//
// Lock states as used here: 0 means unlocked, 1 means locked.
// - Fast path: a single compare-exchange of 0 -> 1.
// - Slow path: wait for the value to read as 0, then retry the compare-exchange until it succeeds.
// NOTE(review): the template parameter list is not visible in this copy of the file - presumably a single type `T`; confirm.
template static inline void lock(T* Futex) {
  auto AtomicFutex = std::atomic_ref(*Futex);
  T Expected {};
  T Desired {1};

  // Try to CAS immediately.
  if (AtomicFutex.compare_exchange_strong(Expected, Desired)) {
    return;
  }

  do {
    // Wait until the futex is unlocked.
    Wait(Futex, 0);
    // A failed compare-exchange overwrote Expected with the observed value, so reset it before retrying.
    Expected = 0;
  } while (!AtomicFutex.compare_exchange_strong(Expected, Desired));
}
if (AtomicFutex.compare_exchange_strong(Expected, Desired)) { return true; } return false; } template static inline void unlock(T* Futex) { auto AtomicFutex = std::atomic_ref(*Futex); AtomicFutex.store(0); } #undef SPINLOOP_8BIT #undef SPINLOOP_16BIT #undef SPINLOOP_32BIT #undef SPINLOOP_64BIT template class UniqueSpinMutex final { public: // Move-only type UniqueSpinMutex(const UniqueSpinMutex&) = delete; UniqueSpinMutex& operator=(const UniqueSpinMutex&) = delete; UniqueSpinMutex(UniqueSpinMutex&& rhs) = default; UniqueSpinMutex& operator=(UniqueSpinMutex&&) = default; UniqueSpinMutex(T* Futex) : Futex {Futex} { FEXCore::Utils::SpinWaitLock::lock(Futex); } ~UniqueSpinMutex() { FEXCore::Utils::SpinWaitLock::unlock(Futex); } private: T* Futex; }; } // namespace FEXCore::Utils::SpinWaitLock ================================================ FILE: FEXCore/Source/Utils/Telemetry.cpp ================================================ // SPDX-License-Identifier: MIT #include #include #include #include #include #include #include #include "Utils/Config.h" #include #include #include #include namespace FEXCore::Telemetry { #ifndef FEX_DISABLE_TELEMETRY std::array TelemetryValues = {{}}; const std::array TelemetryNames { "64byte Split Locks", "16byte Split atomics", "EVEX instructions (AVX512)", "16bit CAS Tear", "32bit CAS Tear", "64bit CAS Tear", "128bit CAS Tear", "Crash mask", "Write 32-bit Segment ES", "Write 32-bit Segment SS", "Write 32-bit Segment CS", "Write 32-bit Segment DS", "Uses 32-bit Segment ES", "Uses 32-bit Segment SS", "Uses 32-bit Segment CS", "Uses 32-bit Segment DS", "Non-Canonical 64-bit address access", }; static bool Enabled {true}; void Initialize() { FEX_CONFIG_OPT(DisableTelemetry, DISABLETELEMETRY); if (DisableTelemetry) { Enabled = false; return; } const auto& DataDirectory = Config::GetTelemetryDirectory(); // Ensure the folder structure is created for our configuration if (!FHU::Filesystem::Exists(DataDirectory) && 
!FHU::Filesystem::CreateDirectories(DataDirectory)) { LogMan::Msg::IFmt("Couldn't create telemetry Folder"); } } void Shutdown(const fextl::string& ApplicationName) { if (!Enabled) { return; } auto DataDirectory = Config::GetTelemetryDirectory() + ApplicationName + ".telem"; // Retain a single backup if the telemetry already existed. auto Backup = DataDirectory + ".bck"; // Failure on rename is okay. (void)FHU::Filesystem::RenameFile(DataDirectory, Backup); auto File = FEXCore::File::File(DataDirectory.c_str(), FEXCore::File::FileModes::WRITE | FEXCore::File::FileModes::CREATE | FEXCore::File::FileModes::TRUNCATE); if (File.IsValid()) { for (size_t i = 0; i < TelemetryType::TYPE_LAST; ++i) { auto& Name = TelemetryNames.at(i); auto& Data = TelemetryValues.at(i); fextl::fmt::print(File, "{}: {}\n", Name, Data.load()); } File.Flush(); } } #endif } // namespace FEXCore::Telemetry ================================================ FILE: FEXCore/Source/Utils/Threads.cpp ================================================ // SPDX-License-Identifier: MIT #include #include #include #include #include namespace FEXCore::Threads { static fextl::unique_ptr CreateThread_Default(ThreadFunc Func, void* Arg) { ERROR_AND_DIE_FMT("Frontend didn't setup thread creation!"); } static void CleanupAfterFork_Default() { ERROR_AND_DIE_FMT("Frontend didn't setup thread creation!"); } static FEXCore::Threads::Pointers Ptrs = { .CreateThread = CreateThread_Default, .CleanupAfterFork = CleanupAfterFork_Default, }; fextl::unique_ptr FEXCore::Threads::Thread::Create(ThreadFunc Func, void* Arg) { return Ptrs.CreateThread(Func, Arg); } void FEXCore::Threads::Thread::CleanupAfterFork() { return Ptrs.CleanupAfterFork(); } void FEXCore::Threads::Thread::SetInternalPointers(const Pointers& _Ptrs) { Ptrs = _Ptrs; } } // namespace FEXCore::Threads ================================================ FILE: FEXCore/Source/Utils/WritePriorityMutex.h ================================================ // 
// Acquires the mutex exclusively, blocking until it is obtained.
//
// Order of attempts:
// 1. try_lock(): a single compare-exchange.
// 2. Attempt_WFE_WriteLock(): a short bounded spin (only does work on arm64).
// 3. Register as a write-waiter, then loop: try to convert "waiter" into "owner" with a CAS,
//    otherwise sleep in FutexWaitForWriteAvailable() until the futex word changes.
//
// While this thread is registered as a write-waiter, lock_shared()/try_lock_shared() refuse new readers,
// which is what gives writers priority.
void lock() {
  // Try a non-blocking lock first.
  if (try_lock()) {
    return;
  }

  // Try a quick WFE write-lock.
  if (Attempt_WFE_WriteLock()) {
    return;
  }

  // Still couldn't get it. Start waiting.
  auto AtomicFutex = std::atomic_ref(Futex);
  uint32_t Expected {};
  uint32_t Desired {};

#if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED
  // Assertion builds use a CAS loop so the waiter-count overflow can be checked before it is published.
  Expected = AtomicFutex.load(std::memory_order_relaxed);
  do {
    // Increment the number of write waiters.
    Desired = Expected + WRITE_WAITER_INCREMENT;
    LOGMAN_THROW_A_FMT((Desired & WRITE_WAITER_COUNT_MASK) != 0, "Overflow in write-waiters!");
  } while (AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire) == false);
#else
  // Increment the number of writers waiting. The following loop will attempt to acquire the write-lock while decrementing the waiter count.
  Expected = AtomicFutex.fetch_add(WRITE_WAITER_INCREMENT);
  Desired = Expected + WRITE_WAITER_INCREMENT;
#endif

  // Thread added to waiter list.
  // Desired holds the value this thread just published; use it as the starting point of the acquire loop.
  Expected = Desired;

  while (true) {
    bool Sleep = false;
    do {
      if ((Expected & WRITE_OWNED_BIT) == 0 && (Expected & READ_OWNER_COUNT_MASK) == 0) {
        // If not write-owned, and no read-owners, try to acquire.
        LOGMAN_THROW_A_FMT((Expected & WRITE_WAITER_COUNT_MASK) != 0, "Underflow in write-waiters!");

        // Add write-owned bit.
        Desired = Expected | WRITE_OWNED_BIT;

        // Remove ourselves from the wait list.
        Desired -= WRITE_WAITER_INCREMENT;
        Sleep = false;
      } else {
        // Already write-owned or read-locked. Go to sleep.
        // Desired is set to the observed value so the futex wait below compares against it.
        Desired = Expected;
        Sleep = true;
        break;
      }
      // A failed CAS refreshes Expected, so the ownership check is redone with the new value.
    } while (AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire) == false);

    if (!Sleep) {
      // Acquired early.
      LOGMAN_THROW_A_FMT((Desired & WRITE_OWNED_BIT) == WRITE_OWNED_BIT, "Somehow acquired a write-lock without it being set!");
      return;
    }

    // Two paths to get here.
    // Desired[31] = 1 (WRITE_OWNED_BIT)
    // OR
    // Desired[15:0] != 0 (READ_OWNER_COUNT_MASK)
    // Meaning that there was already a writer that owned the lock, or reads were owning it.
    // This thread already incremented `WRITE_WAITER_INCREMENT` before this loop.
    // - Linux waits for the full 32-bits to change (With bitset wakeup).
    // - Win32 also waits for the full 32-bits to change (with offset addr on the reader side to reduce stampeding).
    FutexWaitForWriteAvailable(Desired);

    // Woken (or the value had already changed): reload and retry the acquire.
    Expected = AtomicFutex.load(std::memory_order_relaxed);
  }
}
if (Attempt_WFE_ReadLock()) { return; } auto AtomicFutex = std::atomic_ref(Futex); uint32_t Expected = AtomicFutex.load(std::memory_order_relaxed); uint32_t Desired {}; while (true) { bool Sleep = false; do { if ((Expected & WRITE_OWNED_BIT) == 0 && (Expected & WRITE_WAITER_COUNT_MASK) == 0) { // If no write-owner and no write-waiting, try and acquire. Desired = Expected + READ_OWNER_INCREMENT; LOGMAN_THROW_A_FMT((Desired & READ_OWNER_COUNT_MASK) != 0, "Overflow in read-owners!"); Sleep = false; } else { // Waiting for lock to become available. Add to waiters. Desired = Expected | READ_WAITER_BIT; Sleep = true; } } while (AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire) == false); if (!Sleep) { // Acquired early. LOGMAN_THROW_A_FMT((Desired & WRITE_OWNED_BIT) != WRITE_OWNED_BIT, "Somehow read-locked and got a write lock!"); return; } // Only one path to get here. // Desired[31][29:16] != 0 (Either writer-owned, or writer-waiting) // Desired[30][15:0] == READ_WAIT_BIT and number of read-owners (draining to zero as write-side is set) // - Linux waits for full 32-bit futex. // - Win32 waits for upper 16-bits to not match (Either zero writer owned, writer-wait is draining, and `READ_WAITER_BIT` changed). // Can get some spurious wake-ups which will `or` the `READ_WAITER_BIT` again, which does nothing. FutexWaitForReadAvailable(Desired); Expected = AtomicFutex.load(std::memory_order_relaxed); } } void unlock() { auto AtomicFutex = std::atomic_ref(Futex); uint32_t Expected = AtomicFutex.load(std::memory_order_relaxed); uint32_t Desired {}; do { LOGMAN_THROW_A_FMT((Expected & WRITE_OWNED_BIT) == WRITE_OWNED_BIT, "Trying to write-unlock something not write-locked!"); // Remove the exclusive lock bit. Desired = Expected & ~WRITE_OWNED_BIT; // If no more writers, then make sure to clear the read-waiters bit as well. 
if ((Desired & WRITE_WAITER_COUNT_MASK) == 0) { Desired &= ~READ_WAITER_BIT; } } while (AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire) == false); // `Expected` has old value. Containing `READ_WAITER_BIT` which was just masked off, and also `WRITE_WAITER_COUNT_MASK`. // // Two paths here to be careful about dead-locking other waiters: // - If there are any writers waiting, those get priority to wake. // - If there are zero writers waiting, and there are read waiters then make sure to wake them all. // Failure to send wake events can cause readers to "infinitely" hang! (ignoring spurious wake-up). if ((Expected & WRITE_WAITER_COUNT_MASK)) { // Handle write-write handoff. FutexWakeWriter(); } else if ((Expected & READ_WAITER_BIT)) { // Handle write-reader handoff. FutexWakeReaders(); } } void unlock_shared() { auto AtomicFutex = std::atomic_ref(Futex); uint32_t Desired {}; #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED uint32_t Expected = AtomicFutex.load(std::memory_order_relaxed); do { LOGMAN_THROW_A_FMT((Expected & WRITE_OWNED_BIT) != WRITE_OWNED_BIT, "Trying to read-unlock something write-locked!"); LOGMAN_THROW_A_FMT((Expected & READ_OWNER_COUNT_MASK) != 0, "Trying to read-unlock something not read-locked!"); // Decrement the shared counter. Desired = Expected - READ_OWNER_INCREMENT; } while (AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire) == false); #else Desired = AtomicFutex.fetch_sub(READ_OWNER_INCREMENT) - READ_OWNER_INCREMENT; #endif // Handle read->write handoff if there are any waiting writers, and no readers left. // Only one path here but still need to be careful to not dead-lock waiting writers. // - If there are waiters /but/ this is not the final unlock_shared, then don't wake writer. // - Writer would wake and immediately sleep again if we woke on every unlock_shared. 
// - If there are waiters and this is the final unlock_shared, then wake a /single/ writer. // - We ignore any reader-waiters here as they must wait their turn for writers that are waiting. if ((Desired & WRITE_WAITER_COUNT_MASK) && (Desired & READ_OWNER_COUNT_MASK) == 0) { FutexWakeWriter(); } } bool try_lock() { auto AtomicFutex = std::atomic_ref(Futex); uint32_t Expected = 0; // Try and grab the owned bit. uint32_t Desired = WRITE_OWNED_BIT; // try to CAS immediately. return AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire); } // Can race with other threads trying to lock shared! bool try_lock_shared() { auto AtomicFutex = std::atomic_ref(Futex); uint32_t Expected = AtomicFutex.load(std::memory_order_relaxed); // Exclusively owned or has a list of waiting owners. Can't pass. if ((Expected & WRITE_OWNED_BIT) || (Expected & WRITE_WAITER_COUNT_MASK)) { return false; } // Try to add reader. uint32_t Desired = Expected + READ_OWNER_INCREMENT; LOGMAN_THROW_A_FMT((Desired & READ_OWNER_COUNT_MASK) != 0, "Overflow in read-owners!"); // Uncontended mutex check return AtomicFutex.compare_exchange_strong(Expected, Desired, std::memory_order_acq_rel, std::memory_order_acquire); } #if !defined(_WIN32) // Initialize the internal mutex object to its default initializer state. // Should only ever be used in the child process when a Linux fork() has occured. void StealAndDropActiveLocks() { Futex = 0; } #endif private: #if !defined(_WIN32) void FutexWaitForWriteAvailable(uint32_t Expected) { ::syscall(SYS_futex, &Futex, FUTEX_PRIVATE_FLAG | FUTEX_WAIT_BITSET, Expected, nullptr, nullptr, FUTEX_BITSET_WAIT_WRITERS); } // Read-lock waiting for writers to drain out. void FutexWaitForReadAvailable(uint32_t Expected) { ::syscall(SYS_futex, &Futex, FUTEX_PRIVATE_FLAG | FUTEX_WAIT_BITSET, Expected, nullptr, nullptr, FUTEX_BITSET_WAIT_READERS); } // Read-Lock or Write-lock unlocked, wake one writer. // - Read->Write handoff. 
// Spins for a bounded number of counter cycles trying to take the write-lock while the futex reads as zero.
// On non-arm64 builds no attempt is made.
// Return: true if the lock was acquired.
bool Attempt_WFE_WriteLock() {
#ifdef ARCHITECTURE_arm64
  namespace SpinWait = FEXCore::Utils::SpinWaitLock;

  const auto Start = SpinWait::GetCycleCounter();
  const auto SpinBudget = SpinWait::CycleCounterFrequency / CYCLECOUNT_DIVISOR;

  auto AtomicFutex = std::atomic_ref(Futex);
  uint32_t Observed = AtomicFutex.load(std::memory_order_relaxed);

  for (auto Current = Start; (Current - Start) < SpinBudget; Current = SpinWait::GetCycleCounter()) {
    // Completely idle mutex: try and grab the owned bit.
    if (Observed == 0 &&
        AtomicFutex.compare_exchange_strong(Observed, WRITE_OWNED_BIT, std::memory_order_acq_rel, std::memory_order_acquire)) {
      return true;
    }

    // One-shot attempt to wait for mask to be zero.
    Observed = SpinWait::OneShotWFEBitComparison(&Futex, ~0U, 0U);
  }
#endif

  return false;
}
// Bit 31: set while a writer owns the mutex.
constexpr static uint32_t WRITE_OWNED_BIT = 1U << 31;
// Bit 30: set by lock_shared() when at least one reader went to sleep waiting for writers.
constexpr static uint32_t READ_WAITER_BIT = 1U << 30;
// The write-waiter counter starts at bit 16.
constexpr static uint32_t WRITE_WAITER_OFFSET = 16;
// Value added/subtracted to change the write-waiter counter by one.
constexpr static uint32_t WRITE_WAITER_INCREMENT = 1U << WRITE_WAITER_OFFSET;
// Value added/subtracted to change the read-owner counter by one.
constexpr static uint32_t READ_OWNER_INCREMENT = 1;

// Count masks
// Write-waiter counter is 14 bits wide (bits [29:16]), so it can count up to 16383 waiting writers.
constexpr static uint32_t WRITE_WAITER_COUNT_MASK = 0x3FFFU << WRITE_WAITER_OFFSET;
// Read-owner counter is 16 bits wide (bits [15:0]), so it can count up to 65535 read owners.
// NOTE(review): the class-level comment quotes 32767 for both limits, which doesn't match these masks - confirm which is intended.
constexpr static uint32_t READ_OWNER_COUNT_MASK = 0xFFFFU;

// Independent futex bit-set masks.
// Bit-set used by sleeping readers (FutexWaitForReadAvailable) and by FutexWakeReaders.
constexpr static uint32_t FUTEX_BITSET_WAIT_READERS = 1U << 0;
// Bit-set used by sleeping writers (FutexWaitForWriteAvailable) and by FutexWakeWriter.
constexpr static uint32_t FUTEX_BITSET_WAIT_WRITERS = 1U << 1;

// Only spin on WFE for 0.01ms (10k ns).
// The WFE attempts divide the counter frequency by this (100'000) to get their spin budget in counter cycles.
constexpr static uint64_t CYCLECOUNT_DIVISOR = 1'000'000'000ULL / 10'000U;
// Variable length signed integer
// The most common encoded size is 8-bit positive, but other values can occur
//
// The first byte selects the encoding; Decode() tests the encodings in the order listed here.
//
// 8-bit:
// bit[7] = 0 - 8-bit
// bit[6:0] = 7-bit signed value
//
// 16-bit:
// byte1[7:6] = 0b10 - 16-bit
// byte1[5:0] = top 6 bits (signed)
// byte2[7:0] = bottom 8 bits
//
// 32-bit
// byte1[7:5] = 0b110 - 32-bit
// byte1[4:0] = 0 (Decode() compares the whole byte against the header value)
// word[31:0] = signed word
//
// 64-bit
// byte1[7:5] = 0b111 - 64-bit
// byte1[4:0] = 0 (written by Encode(); Decode() falls through to this encoding without checking)
// dword[63:0] = signed dword
//
// NOTE(review): the template arguments of the casts and of std::numeric_limits are not visible in this copy of the file.
struct vl64 final {
  // Returns how many bytes Encode() would write for `Data`: 1, 2, 5 or 9.
  static size_t EncodedSize(int64_t Data) {
    if (Data >= vl8_min && Data <= vl8_max) {
      return sizeof(vl8_enc);
    } else if (Data >= vl16_min && Data <= vl16_max) {
      return sizeof(vl16_enc);
    } else if (Data >= vl32_min && Data <= vl32_max) {
      return sizeof(vl32_enc);
    }
    return sizeof(vl64_enc);
  }

  // Result of Decode(): the decoded value and the number of bytes that were consumed.
  struct Decoded {
    int64_t Integer;
    size_t Size;
  };

  // Decodes one integer starting at `data`.
  // The caller must guarantee that enough bytes are readable for whichever encoding is stored there.
  static Decoded Decode(const uint8_t* data) {
    // View the same bytes through each encoding; the header bits decide which view is valid.
    auto vl8_type = reinterpret_cast(data);
    auto vl16_type = reinterpret_cast(data);
    auto vl32_type = reinterpret_cast(data);
    auto vl64_type = reinterpret_cast(data);

    if (vl8_type->Type == vl8_type_header) {
      return {vl8_type->Integer, sizeof(vl8_enc)};
    } else if (vl16_type->HighBits.Type == vl16_type_header) {
      return {vl16_type->Integer(), sizeof(vl16_enc)};
    } else if (vl32_type->Type == vl32_type_header) {
      return {vl32_type->Integer, sizeof(vl32_enc)};
    }
    // Anything else is treated as the 64-bit encoding.
    return {vl64_type->Integer, sizeof(vl64_enc)};
  }

  // Encodes `Data` at `dst` using the smallest encoding whose range contains it.
  // Returns the number of bytes written; `dst` must have room for them (see EncodedSize()).
  static size_t Encode(uint8_t* dst, int64_t Data) {
    auto vl8_type = reinterpret_cast(dst);
    auto vl16_type = reinterpret_cast(dst);
    auto vl32_type = reinterpret_cast(dst);
    auto vl64_type = reinterpret_cast(dst);

    if (Data >= vl8_min && Data <= vl8_max) {
      *vl8_type = {
        .Integer = static_cast(Data),
        .Type = vl8_type_header,
      };
      return sizeof(vl8_enc);
    } else if (Data >= vl16_min && Data <= vl16_max) {
      // Split into the top 6 bits (kept signed) and the low byte.
      *vl16_type = {
        .HighBits {
          .Top = static_cast((Data >> 8) & 0xFF),
          .Type = vl16_type_header,
        },
        .LowBits = static_cast(Data & 0xFF),
      };
      return sizeof(vl16_enc);
    } else if (Data >= vl32_min && Data <= vl32_max) {
      *vl32_type = {
        .Type = vl32_type_header,
        .Integer = static_cast(Data),
      };
      return sizeof(vl32_enc);
    }

    *vl64_type = {
      .Type = vl64_type_header,
      .Integer = Data,
    };
    return sizeof(vl64_enc);
  }

private:
  // One byte: 7-bit signed payload plus a 1-bit header.
  struct vl8_enc {
    int8_t Integer : 7;
    uint8_t Type : 1;
  };
  static_assert(sizeof(vl8_enc) == 1);

  // Two bytes: first byte carries the 2-bit header and the signed top 6 bits, second byte the low 8 bits.
  struct vl16_enc {
    struct {
      int8_t Top : 6;
      uint8_t Type : 2;
    } HighBits;
    uint8_t LowBits;

    // Reassembles the 14-bit signed value from the two bytes.
    int64_t Integer() const {
      int16_t Value {};
      Value |= (HighBits.Top << 8);
      Value |= LowBits;
      // NOTE(review): `Value` is promoted to int before shifting, so this shift pair looks like a no-op;
      // the sign appears to come from the signed `Top` bit-field instead - confirm.
      return (Value << 2) >> 2;
    }
  };
  static_assert(sizeof(vl16_enc) == 2);

  // Five bytes: header byte followed by a packed 32-bit payload.
  struct FEX_PACKED vl32_enc {
    uint8_t Type;
    int32_t Integer;
  };
  static_assert(sizeof(vl32_enc) == 5);

  // Nine bytes: header byte followed by a packed 64-bit payload.
  struct FEX_PACKED vl64_enc {
    uint8_t Type;
    int64_t Integer;
  };
  static_assert(sizeof(vl64_enc) == 9);

  // Maximum ranges for encodings.
  // vl8 can hold a signed 7-bit integer.
  // Encoded in one 8-bit value.
  constexpr static int64_t vl8_encoded_bits = 7;
  constexpr static int64_t vl8_type_header = 0;
  constexpr static int64_t vl8_min = std::numeric_limits::min() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);
  constexpr static int64_t vl8_max = std::numeric_limits::max() >> ((sizeof(int64_t) * 8) - vl8_encoded_bits);

  // vl16 can hold a signed 14-bit integer.
  // Encoded in one 16-bit value.
  constexpr static int64_t vl16_encoded_bits = 14;
  constexpr static int64_t vl16_type_header = 0b10;
  constexpr static int64_t vl16_min = std::numeric_limits::min() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);
  constexpr static int64_t vl16_max = std::numeric_limits::max() >> ((sizeof(int64_t) * 8) - vl16_encoded_bits);

  // vl32 can hold a signed 32-bit integer.
  // Encoded in 8-bit and 32-bit value;
  constexpr static int64_t vl32_encoded_bits = 32;
  constexpr static int64_t vl32_type_header = 0b1100'0000;
  constexpr static int64_t vl32_min = std::numeric_limits::min();
  constexpr static int64_t vl32_max = std::numeric_limits::max();

  // vl64 can hold a signed 64-bit integer.
  // Encoded in 8-bit and 64-bit value.
  constexpr static int64_t vl64_encoded_bits = 64;
  constexpr static int64_t vl64_type_header = 0b1110'0000;
  constexpr static int64_t vl64_min = std::numeric_limits::min();
  constexpr static int64_t vl64_max = std::numeric_limits::max();
};
// 32-bit // byte1[7:5] = 0b110 - 32-bit // byte1[4:0] = // word1[31:0] = signed word // word2[31:0] = signed word // // 64-bit // byte1[7:5] = 0b111 - 64-bit // byte1[4:0] = // dword1[63:0] = signed dword // dword2[63:0] = signed dword struct vl64pair final { public: /* Returns the number of bytes Encode() writes for the pair: the size of the first encoding (vl8, vl16, vl32, then vl64) whose range check accepts both values. The vl16/vl32 checks take int64_t, so the unsigned inputs are range-checked as signed values there. */ static size_t EncodedSize(uint64_t data_arm, uint64_t data_rip) { if (can_encode_vl8(data_arm, data_rip)) { return sizeof(vl8_enc); } else if (can_encode_vl16(data_arm, data_rip)) { return sizeof(vl16_enc); } else if (can_encode_vl32(data_arm, data_rip)) { return sizeof(vl32_enc); } return sizeof(vl64_enc); } /* A decoded pair together with the number of encoded bytes it occupied. */ struct Decoded { uint64_t IntegerARMPC; uint64_t IntegerX86RIP; size_t Size; }; /* Decodes one pair starting at data. The encoding is selected from the type header stored at data; anything that is not a vl8, vl16 or vl32 header is read as the 64-bit form. */ static Decoded Decode(const uint8_t* data) { auto vl8_type = reinterpret_cast(data); auto vl16_type = reinterpret_cast(data); auto vl32_type = reinterpret_cast(data); auto vl64_type = reinterpret_cast(data); if (vl8_type->Type == vl8_type_header) { return Decode(vl8_type); } else if (vl16_type->HighBits.Type == vl16_type_header) { return Decode(vl16_type); } else if (vl32_type->Type == vl32_type_header) { return Decode(vl32_type); } return {vl64_type->IntegerARMPC, vl64_type->IntegerX86RIP, sizeof(vl64_enc)}; } /* Encodes the pair at dst using the first encoding that fits and returns the number of bytes written (at most sizeof(vl64_enc), i.e. 17). */ static size_t Encode(uint8_t* dst, uint64_t data_arm, uint64_t data_rip) { auto vl8_type = reinterpret_cast(dst); auto vl16_type = reinterpret_cast(dst); auto vl32_type = reinterpret_cast(dst); auto vl64_type = reinterpret_cast(dst); if (can_encode_vl8(data_arm, data_rip)) { /* can_encode_vl8 guarantees data_arm is a non-zero multiple of 4, so (data_arm - 1) >> 2 equals data_arm / 4 - 1. */ *vl8_type = { .IntegerARMPC = static_cast((data_arm - 1) >> vl8_arm_align_bits), .IntegerX86RIP = static_cast(data_rip - 1), .Type = vl8_type_header, }; return sizeof(vl8_enc); } else if (can_encode_vl16(data_arm, data_rip)) { /* NOTE(review): this path (and the matching vl16 Decode) shifts by vl8_arm_align_bits rather than vl16_arm_align_bits; both constants are 2, so the result is the same. */ *vl16_type = { .HighBits { .IntegerX86RIP = static_cast(static_cast(data_rip)), .Type = vl16_type_header, }, .IntegerARMPC = static_cast(static_cast(data_arm) >> vl8_arm_align_bits), }; return sizeof(vl16_enc); } else if (can_encode_vl32(data_arm, data_rip)) { *vl32_type = { .Type = vl32_type_header, .IntegerARMPC = 
static_cast(data_arm), .IntegerX86RIP = static_cast(data_rip), }; return sizeof(vl32_enc); } *vl64_type = { .Type = vl64_type_header, .IntegerARMPC = data_arm, .IntegerX86RIP = data_rip, }; return sizeof(vl64_enc); } private: struct vl8_enc { uint8_t IntegerARMPC : 4; uint8_t IntegerX86RIP : 3; uint8_t Type : 1; }; static_assert(sizeof(vl8_enc) == 1); /* Undoes the vl8 bias: both fields are stored minus one, and the ARM value is additionally stored divided by 4. */ static inline Decoded Decode(const vl8_enc* enc) { const uint64_t data_arm = enc->IntegerARMPC; const uint64_t data_rip = enc->IntegerX86RIP; return {(data_arm + 1) << vl8_arm_align_bits, data_rip + 1, sizeof(vl8_enc)}; } struct vl16_enc { struct { int8_t IntegerX86RIP : 6; uint8_t Type : 2; } HighBits; int8_t IntegerARMPC; }; static_assert(sizeof(vl16_enc) == 2); /* Widens both signed fields to 64 bits; the ARM value is stored divided by 4. */ static inline Decoded Decode(const vl16_enc* enc) { int64_t arm_pc = enc->IntegerARMPC << vl8_arm_align_bits; int64_t x86_rip = enc->HighBits.IntegerX86RIP; return {static_cast(arm_pc), static_cast(x86_rip), sizeof(vl16_enc)}; } struct FEX_PACKED vl32_enc { uint8_t Type; int32_t IntegerARMPC; int32_t IntegerX86RIP; }; static_assert(sizeof(vl32_enc) == 9); /* Sign-extends both 32-bit fields to 64 bits. */ static inline Decoded Decode(const vl32_enc* enc) { int64_t arm_pc = enc->IntegerARMPC; int64_t x86_rip = enc->IntegerX86RIP; return {static_cast(arm_pc), static_cast(x86_rip), sizeof(vl32_enc)}; } struct FEX_PACKED vl64_enc { uint8_t Type; uint64_t IntegerARMPC; uint64_t IntegerX86RIP; }; static_assert(sizeof(vl64_enc) == 17); // vl8 can hold two small unsigned integers. // Encoded in 8-bit. constexpr static int64_t vl8_type_header = 0; constexpr static int64_t vl8_arm_min = 1; constexpr static int64_t vl8_arm_max = 16; constexpr static int64_t vl8_arm_align_bits = 2; constexpr static int64_t vl8_arm_shift_mask = (1U << vl8_arm_align_bits) - 1; constexpr static int64_t vl8_pc_min = 1; constexpr static int64_t vl8_pc_max = 8; static bool can_encode_vl8(uint64_t data_arm, uint64_t data_rip) { // GuestPC can only be [1,8] bytes. 
if (data_rip < vl8_pc_min || data_rip > vl8_pc_max) { return false; } // Unaligned doesn't fit at all. if (data_arm & vl8_arm_shift_mask) { return false; } // HostPC can only be [1,16] instructions. int64_t ShiftedHostPC = data_arm >> vl8_arm_align_bits; if (ShiftedHostPC < vl8_arm_min || ShiftedHostPC > vl8_arm_max) { return false; } return true; } // vl16 can hold two small signed integers. // Encoded in one 16-bit value. constexpr static int64_t vl16_type_header = 0b10; constexpr static int64_t vl16_arm_min = -128; constexpr static int64_t vl16_arm_max = 127; constexpr static int64_t vl16_arm_align_bits = 2; constexpr static int64_t vl16_arm_shift_mask = (1U << vl16_arm_align_bits) - 1; constexpr static int64_t vl16_pc_min = -32; constexpr static int64_t vl16_pc_max = 31; static bool can_encode_vl16(int64_t data_arm, int64_t data_rip) { // GuestPC can only be [-32,31] bytes. if (data_rip < vl16_pc_min || data_rip > vl16_pc_max) { return false; } // Unaligned doesn't fit at all. if (data_arm & vl16_arm_shift_mask) { return false; } // HostPC can only be [-128,127] instructions. int64_t ShiftedHostPC = data_arm >> vl16_arm_align_bits; if (ShiftedHostPC < vl16_arm_min || ShiftedHostPC > vl16_arm_max) { return false; } return true; } // vl32 can hold two 32-bit integers. // Encoded in 8-bit and two 32-bit values. constexpr static int64_t vl32_type_header = 0b1100'0000; constexpr static int64_t vl32_min = std::numeric_limits::min(); constexpr static int64_t vl32_max = std::numeric_limits::max(); static bool can_encode_vl32(int64_t data_arm, int64_t data_rip) { if (data_rip < vl32_min || data_rip > vl32_max) { return false; } if (data_arm < vl32_min || data_arm > vl32_max) { return false; } return true; } // vl64 can hold two 64-bit integers. // Encoded in 8-bit and two 64-bit values. 
constexpr static int64_t vl64_type_header = 0b1110'0000; }; } // namespace FEXCore::Utils ================================================ FILE: FEXCore/docs/CPUBackends.md ================================================ # FEXCore CPU Backends --- FEXCore supports multiple CPU emulation backends, all of which ingest the IR that we have been generating. ## IR Interpreter The first one is the easiest. This just walks the IR list and interprets the IR as it goes through it. It isn't meant to be fast and is for debugging purposes. This is used to easily inspect what is going on with the code generation and to make sure the logic is sound. It will most likely last in perpetuity since it isn't exactly difficult to maintain and it is useful to have around. ## IR JIT **Not yet implemented** This is meant to be our first port of call as a JIT and will serve multiple purposes. It'll be the JIT that is used for our runtime compilation of code. This means it needs to be fast during compilation and have decent runtime performance. There is a good chance that we will need to implement multiple of these depending on host architecture with some code reuse between them. This JIT will also be what we use for gathering sampling data for passing off to another JIT for tiered recompilation and offline compilation later. It should use xbyak for our x86-64 host and Vixl for our AArch64 host. For other targets in the future we will see what is available. # Future ideas --- * Create an inline ASM or JIT'd dispatcher loop. Will allow our JITs to be more optimal by reserving more registers for guest state. * WebAssembly or other browser language? * Might allow decent runtime performance of things emulated in a browser. Could be interesting. 
================================================ FILE: FEXCore/docs/CustomCPUBackend.md ================================================ # FEXCore custom CPU backends --- Custom CPU backends can be useful for testing purposes or wanting to support situations that FEXCore doesn't currently understand. The FEXCore::Context namespace provides a `SetCustomCPUBackendFactory` function for providing a factory function pointer to the core. This function will be used if the `DEFAULTCORE` configuration option is set to `CUSTOM`. If the guest code creates more threads then the CPU factory function will be invoked for creating a CPUBackend per thread. If you don't want a unique CPUBackend object per thread then that needs to be handled by the user. It's recommended to store the pointers provided to the factory function for later use. `FEXCore::Context::Context*` - Is a pointer to previously generated context object `FEXCore::Core::ThreadState*` - Is a pointer to a thread's state. Lives for as long as the guest thread is alive. To use this factory, one must override the provided `FEXCore::CPU::CPUBackend` class with a custom one. This factory function then should return a newly allocated class. `FEXCore::CPU::CPUBackend::GetName` - Returns an `std::string` for the name of this core `FEXCore::CPU::CPUBackend::CompileCode` - Provides the CPUBackend with potentially an IR and DebugData for compiling code. Returns a pointer that needs to be long lasting to a piece of code that will be executed for the particular RIP. `FEXCore::CPU::CPUBackend::Initialize` - Called after the guest memory is initialized and all state is ready for the code to start initializing. Gets called just before the CPUBackend starts executing code for the first time. 
================================================ FILE: FEXCore/docs/Frontend.md ================================================ # FEXCore Frontend --- The FEXCore frontend's job is to translate an incoming x86-64 instruction stream in to a more easily digested version of x86. This effectively expands x86-64 instruction encodings to be more easily ingested later on in the process. This ends up being essential to allowing our IR translation step to be less strenuous. It can decode a "common" expanded instruction format rather than various things that x86-supports. For a simple example, x86-64's primary op table has ALU ops that duplicate themselves at least six times with minor differences between each. The frontend is able to decode a large amount of these ops to the "same" op that the IR translation understands more readily. This works for most instructions that follow a common decoding scheme, although there are instructions that don't follow the rules and must be handled explicitly elsewhere. An example of decoded instructions: ``` 00 C0: add al,al 04 01: add al, 0x1 ``` These two instructions have a different encoding scheme but they are just an add. They end up decoding to a generic format with the same destination operand but different sources. May look subtle but there end up being far more complex cases and we don't want to handle hundreds of instructions differently. After the frontend is done decoding the instruction stream, it passes the output over to the OpDispatcher for translating to our IR. ## Multiblock --- The Frontend has an additional duty. Since it is the main piece of code that understands the guest x86-64 code; It is also what does analysis of control flow to determine if we can end up compiling multiple blocks of guest code. The Frontend already has to determine if it has hit a block ending instruction. This is anything that changes control flow. 
This feeds in to the analysis system to look at conditional branches to see if we can keep compiling code at the target location in the same functional unit. Short example: ``` test eax, eax jne .Continue ret <--- We can continue past this instruction, which is an unconditional block ender .Continue: ``` These sorts of patterns crop up extensively in compiled code. A large amount of traditional JITs will end up ending the block at any sort of conditional branch instruction. If the analysis can determine the target conditional branch location, we can then know that the code can keep compiling past an unconditional block ender instruction. This works for both backwards branches and forward branches. ### Additional reading --- There are other emulators out there that implement multiblock JIT compilation with some success. The best example of this that I know of is the [Dolphin GameCube and Wii Emulator](https://github.com/dolphin-emu/dolphin) Where I implemented the initial multiblock implementation. One of the major limitations with a console emulator is that you can run in to infinite loops on backedges when using multiblock compilation. This is due to console emulation being able to run an infinite loop and let Interrupts or some other state cause it to break out. Luckily since we are a userspace emulator we don't have to deal with this problem. If an application has written an infinite loop, then without another thread running, it'll be a true infinite loop. Additionally luckily is that we are going to emulate the strong memory model of x86-64 and also support true threads, this will mean that we don't need to do any manual thread scheduling in our emulator and switch between virtual threads. ================================================ FILE: FEXCore/docs/IR.md ================================================ # FEXCore IR --- The IR for the FEXCore is an SSA based IR that is generated from the incoming x86-64 assembly. 
SSA is quite nice to work with when translating the x86-64 code to the IR, when optimizing that code with custom optimization passes, and also passing that IR to our CPU backends. ## Emulation IR considerations * We have explicitly sized IR variables * Supports traditional element sizes of 1,2,4,8 bytes and some 16byte ops * Supports arbitrary number of vector elements * The op determines if something is float or integer based. * Clear separation of scalar IR ops and vector IR ops * ex, MUL versus VMUL * We have explicit Load/Store context IR ops * This allows us to have a clear separation between guest memory and tracked x86-64 state * We have an explicit CPUID IR op * This allows us to return fairly complex data (4 registers of data) and also having an easier optimization for constant CPUID functions * So if we const-prop the CPUID function then it'll just const-prop further along * We have an explicit syscall op * The syscall op is fairly complex as well, same with CPUID that if the syscall function is const-prop then we can directly call the syscall handler * Can save overhead by removing call overheads * The IR supports branching from one block to another * Has a conditional branch instruction that either branches to the target branch or falls through to the next block * Has an unconditional branch to explicitly jump to a block instead of falling through * **There is a desire to follow LLVM semantics around block limitations but it isn't currently strictly enforced** * Supports a debug `Print` Op for printing out values for debug viewing * Supports explicit Load/Store memory IR ops * This is for accessing guest memory and will do the memory offset translation in to the VM's memory space * This is done by just adding the VM memory base to the 64bit address passed in * This is done in a manner that the application **can** escape from the VM and isn't meant to be safe * There is an option for JITs to validate the memory region prior to accessing for ensuring 
correctness * IR is generated from a JSON file, fairly straightforward to extend. * Read the python generation file to determine the extent of what it can do ## IR function considerations The first SSA node is a special case node that is considered invalid. This means %0 will always be invalid for "null" node checks. The first real SSA node also has to be an IRHeader node. This means it is safe to assume that %1 will always be an IRHeader. ```(%%1) IRHeader 0x41a9a0, %%2, 5``` The header provides information about that function like the entry point address. Additionally it also points to the first `CodeBlock` IROp ```(%%2) CodeBlock %%7, %%168, %%3``` * The `CodeBlock` Op is a jump target and must be treated as if it'll be jumped to from other blocks * It contains pointers to the starting op and ending op and they are inclusive * It also contains a pointer to the next CodeBlock in a singly linked list * The last CodeBlock will point to the InvalidNode as the next block ### Example code block ``` (%%3) CodeBlock %%169, %%173, %%4 (%%169) BeginBlock %3 %170 i64 = Constant 0x41a9e1 (%%171) StoreContext %170 i64, 0x8, 0x0 (%%172) ExitFunction (%%173) EndBlock %3 ``` * BeginBlock points back to the CodeBlock SSA which helps with iterating across multiple blocks * EndBlock is the ending op of a CodeBlock and also points back to the CodeBlock SSA. * ExitFunction will leave the function immediately and return back to the dispatcher * Every IR Op has an SSA value associated with it used for tracking the op itself * If the IROp doesn't have a real destination then it is invalid to use it as an argument in most other ops ## In-memory representation The in-memory representation of the IR may be a bit confusing when initially viewed and once dealing with optimizations then it may be confusing as well. Currently the IR Generation is tied to the `OpDispatchBuilder` class. This class handles translating decoded x86 to our IR representation. 
When generating IR inside of the `OpDispatchBuilder` it is straightforward, just call the IR generation ops. ### FEXCore::IR::IntrusiveAllocator This is an intrusive allocator that is used by the `OpDispatchBuilder` for storing IR data. It is a simple linear arena allocator without resizing capabilities. ### OpDispatchBuilder OpDispatchBuilder provides `IRListView ViewIR()` for handling the IR outside of the class: * Returns a wrapper container class that allows you to view the IR. This doesn't take ownership of the IR data. * If the OpDispatchBuilder changes its IR then changes are also visible to this class This class uses two IntrusiveAllocator objects for tracking IR data. `ListData` and `Data` are the object names. * `ListData` is for tracking the doubly linked list of nodes * This ONLY allocates `FEXCore::IR::OrderedNode` objects * When an OrderedNode is allocated its allocation location (NodeOffset) is just the offset from the base pointer * This allows us to only use uint32_t memory offsets to compact the IR * Additionally using offsets allows us the freedom to freely move our IR in memory without costly pointer adjustment * This means everything is fixed size allocated (SSA Node number calculation is just `AllocationOffset / sizeof(OrderedNode)`) * OrderedNodes are what the SSA arguments are pointing to in the end ### OrderedNode This is a doubly linked list of all of our IR nodes. This allows us to walk forward or backward over the IR and they must be ordered correctly to ensure dominance of SSA values. 
* Contains `OrderedNodeHeader` * Contains `OpNodeWrapper Value` * Points to the `IROp_Header` backing op for this SSA node * Contains `OrderedNodeWrapper Next` * Points to the next `OrderedNode` * Contains `OrderedNodeWrapper Previous` * Points to the previous `OrderedNode` * Contains the NumUses * This allows us to easily walk to the list backwards and DCE the ops that have NumUses == 0 * `IROp_Header *Op(uintptr_t Base)` * Allows you to get the backing IR data for this SSA value ### NodeWrapperBase - Type for `OrderedNodeHeader` and `OpNodeWrapper` * `using OpNodeWrapper = NodeWrapperBase` * `using OrderedNodeWrapper = NodeWrapperBase` * This is a class to let you more easily convert NodeOffsets in to their real backing pointer * `GetNode(uintptr_t Base)` allows you to pass in the base pointer from the backing Intrusive allocator and get the object * **This can be confusing** * A good rule of thumb is to only ever use `GetNode(ListDataBegin)` with OrderedNodeWrapper * Then once you have the `OrderedNode*` from GetNode, Use the `Op(IRDataBegin)` function to get the IR data. * I do **NOT** recommend using `GetNode` directly from `OpNodeWrapper` as it is VERY easy to mess it up ### NodeIterator Provides a fairly straightforward interface that allows easily walking the IR nodes with C++ increment and decrement operations. 
Only iterates over a single block #### Example usage ```cpp IR::NodeIterator After = ...; IR::NodeIterator End = ...; while (After != End) { // NodeIterator() returns a pair of pointers to the OrderedNode and IROp data // You can unpack the result with structured bindings auto [CodeNode, IROp] = After(); // IROp_Header contains a bunch of information about the IR object // We can convert it with the object's C or CW functions switch(IROp->Op) { case IR::OP_ADD: { FEXCore::IR::IROp_Add const *Op = IROp->C(); /* We can now access members inside of IROp_Add that were previously unavailable You can still access the header definitions from Op->Header */ break; } /* ... */ } // Go to the next IR Op ++After; } ``` ### AllNodesIterator This is like NodeIterator, except that it will cross block boundaries. ### IRListView.GetBlocks() Provides a range for easy iterating over all the blocks in a multi-block with NodeIterator #### Example usage ```c++ for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { // Do stuff for each block } ``` ### IRListView.GetCode(BlockNode) Provides a range for easy iterating over all the code in a block #### Example usage ```c++ for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { // Do stuff for each op switch(IROp->Op) { case IR::OP_ADD: { FEXCore::IR::IROp_Add const *Op = IROp->C(); // Do stuff for each Add op. break; } } } ``` ### IRListView.GetAllCode() Like GetCode, except it uses AllNodesIterator to allow easy iterating over every single op in the entire Multiblock #### Example usage ```c++ for (auto [CodeNode, IROp] : CurrentIR.GetAllCode()) { // Do stuff for each op } ``` ## JSON file An example of what the IR json looks like ``` "StoreContext": { "SSAArgs": "1", "Args": [ "uint8_t", "Size", "uint32_t", "Offset" ] }, ``` The json entry name will be the name of the IR op and the dispatcher function. 
This means you'll get a `_Add(...)` dispatcher function generated ### JSON IR element options * `HasDest` * This is used on ops that return a value. Used for tracking whether ops return data * `SSAArgs` * This is the number of arguments that the op consumes that are SSA based * Needs to come from previous ops that had a destination * `SSANames` * Allows you to name the SSA arguments in an op * Otherwise the Op names will only be able to be accessed from the Header of the IR through its arguments array * `Args` * These are defined arguments that are stored in the IR encoding that aren't SSA based * Useful for things that are constant encoded and won't change after the fact * `FixedDestSize` * This allows you to override the op's destination size in bytes * Most ops will implicitly calculate their destination size through the maximum sizes of the IR arguments passed in * `DestSize` * This allows an IR size override that isn't just a size in bytes * This can let the size of the op be another argument or something more extensive * `RAOverride` * This allows an op to take regular SSA arguments (So optimization passes will still be aware of them) but also not have them be register allocated * Useful for block handling ops, where blocks aren't something that get register allocated but still need to have their uses tracked * `HelperGen` * If there is a complex IR Op that needs to be defined but you don't want an automatic dispatcher generated then this disables the generation of the dispatcher * `Last` * This is a special element only used for the last element in the list ================================================ FILE: FEXCore/docs/MemoryModelEmulation.md ================================================ # What is x86-TSO and what is different compared to ARM's weak memory model? x86's memory model is a very strictly coherent memory model that effectively mandates that all memory accesses are "atomic". 
While atomicity is actually a bit more strict than that, we need to emulate it on ARM using atomic instructions. We are also required to emulate this strictness with unaligned accesses, which is due to x86 CPUs allowing unaligned atomics for "free" within a cacheline. Intel also takes this a step further by allowing full atomics with a feature called "split-locks"; AMD gains this same feature in Zen 5. # Emulating loads Due to x86 SIB addressing, this can happen on most instructions. FEX emulates these in a variety of ways depending on features. Most instructions are emulated with an atomic instruction but we also implement a feature called "half-barrier" atomics for unaligned atomics. ## Base ARMv8.0 - Addressing limitations - Register only This is emulated using an atomic load instruction plus a nop. - On unaligned access the code gets backpatched to a non-atomic load plus a memory barrier ## FEAT_LRCPC - Addressing limitations - Register only This matches the base ARMv8.0 implementation but adds new instructions that match x86-TSO behaviour, making the emulation slightly quicker. - On unaligned access it still gets backpatched to non-atomic load plus a memory barrier. ## FEAT_LRCPC2 - Addressing limitations - Register plus 9-bit signed immediate (-256, 255) Adds some new instructions that allow immediate encoding inside of the previous LRCPC instructions ## FEAT_LRCPC3 Adds a handful of GPR instructions that aren't super interesting. FEX doesn't currently implement these since no hardware supports them. - ldapr - Post-index load for stack - ldiapp - Post-index load pair for stack - stilp - pre-index store pair for stack - stlr - pre-index store for stack # Emulating stores Again due to x86 SIB addressing, this can also happen on most instructions. 
There are fewer options for FEX with this extension, so in most cases this just turns into an atomic store with half-barrier backpatching for unaligned accesses. ## FEAT_LRCPC, FEAT_LRCPC2 Adds nothing for emulating stores ## FEAT_LRCPC3 # Emulating atomic instructions x86 has atomic memory operations that can do a variety of operations. If an atomic operation happens to be unaligned, FEX will emulate the operation inside the signal handler. ## CASPair - cmpxchg8b/cmpxchg16b ## Base ARMv8.0 - Addressing limitations - Register only This is emulated with a ldaxp+stlxp pair of instructions. ## FEAT_LSE - Addressing limitations - Register only Adds a new caspal instruction that does the operation almost exactly like x86. ## CAS - cmpxchg ## Base ARMv8.0 - Addressing limitations - Register only Similar to CASPair but now only uses a ldaxr+stlxr pair ## FEAT_LSE - Addressing limitations - Register only Similar to CASPair adds a new casal instruction that operates basically like x86 # AtomicFetch ## Op from the following list - Add - Sub - And - CLR - Or - Xor - Neg - Swap ## Base ARMv8.0 - Addressing limitations - Register only All operations get emulated with an ldaxr+stlxr+`<op>` instruction sequence ## FEAT_LSE - Addressing limitations - Register only Almost all operations now have a native atomic memory operation instruction. The only outlier is atomicNeg which doesn't have an LSE equivalent and uses the ARMv8.0 implementation. # Vector loads Since almost all memory accesses on x86 are TSO, this includes vector operations. ## Base ARMv8.0 - Addressing limitations - Register plus 9-bit signed immediate (-256, 255) - Register plus 12-bit unsigned scaled immediate (Scaled by access size) Emulated using half-barriers, which means a load+dmb ## FEAT_LRCPC3 - LDAP1 added for element loads. 
Register only address encoding - LDAPUR added for vector register loads, supports 9-bit simm offset # Vector stores Just like loads, these are emulated using half-barriers ## Base ARMv8.0 - Addressing limitations - Register plus 9-bit signed immediate (-256, 255) - Register plus 12-bit unsigned scaled immediate (Scaled by access size) Emulated using half-barriers, which means a dmb+str ## FEAT_LRCPC3 - STL1 added for element stores. Register only address encoding - STLUR added for vector register stores, supports 9-bit simm offset # Addressing limitations depending on operating mode ## GPR loadstores ### TSO Emulation disabled - Register only (ldr/str) - Register + Register + scale (ldr/str) - Register + 9-bit simm (ldur/stur) - Register + 12-bit unsigned scaled imm (ldr/str) ### TSO Emulation enabled - Register only (ldar/stlr) - Register only (ldapr/stlr) - FEAT_LRCPC - Register + 9-bit simm (ldapur/stlur) - FEAT_LRCPC2 ## Vector loadstores ### TSO Emulation disabled - Register only (ldr/str) - Register + Register + scale (ldr/str) - Register + 9-bit simm (ldur/stur) - Register + 12-bit unsigned scaled imm (ldr/str) ### TSO Emulation enabled - Same as TSO emulation disabled due to half-barrier implementation ### TSO Emulation enabled (FEAT_LRCPC3) - Register only (ldap1/stl1) - Element loadstore - Register + 9-bit simm (ldapur/stlur) ## Atomic memory operations Always TSO emulation enabled, always register only. ================================================ FILE: FEXCore/docs/OpDispatcher.md ================================================ # FEXCore OpDispatcher --- The OpDispatcher is the step of the recompiler that takes the output from the Frontend and translates it to our IR. Since the x86-64 instruction set is so large (>1000 instructions in the current FEXCore tables) we need to reduce this down to something more manageable. We will ingest our decoded x86-64 instructions and translate them down to more basic IR operations. 
The number of IR ops are currently in the dozens which is a lot easier to handle. Once we have translated to the IR then we need to pass the IR over to optimization passes or our JIT cores. Ex: ``` mov rax,0x1 mov rdi,0x1 mov rsi,0x20 mov rdx,0x1 syscall hlt ``` Translates to the IR of: ``` BeginBlock %8 i32 = Constant 0x1 StoreContext 0x8, 0x8, %8 %64 i32 = Constant 0x1 StoreContext 0x8, 0x30, %64 %120 i32 = Constant 0x1f StoreContext 0x8, 0x28, %120 %176 i32 = Constant 0x1 StoreContext 0x8, 0x20, %176 %232 i64 = LoadContext 0x8, 0x8 %264 i64 = LoadContext 0x8, 0x30 %296 i64 = LoadContext 0x8, 0x28 %328 i64 = LoadContext 0x8, 0x20 %360 i64 = LoadContext 0x8, 0x58 %392 i64 = LoadContext 0x8, 0x48 %424 i64 = LoadContext 0x8, 0x50 %456 i64 = Syscall %232, %264, %296, %328, %360, %392, %424 StoreContext 0x8, 0x8, %456 BeginBlock EndBlock 0x1e ExitFunction ``` ### Multiblock --- An additional duty of the OpDispatcher is to handle the metadata that the Frontend provides for supporting multiblock. The IR provides most of the functionality required for supporting robust branching and function creation required for generating large blocks of code translated from x86-64 emulation. This is required since in the ideal situation we will be doing function level translation of x86-64 guest code to our IR. The IR is currently lacking any idea of flags or PHI nodes, which can be problematic when optimizing branch heavy code. The good thing is that the LLVM JIT can use a mem to reg pass to minimize a large number of this code. 
It **will** be required to improve the IR further once the runtime JIT becomes a higher priority ================================================ FILE: FEXCore/include/FEXCore/Config/Config.h ================================================ // SPDX-License-Identifier: MIT #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace FEXCore::Config { namespace Handler { static inline std::optional SMCCheckHandler(std::string_view Value) { if (Value == "none") { return "0"; } else if (Value == "mtrack") { return "1"; } else if (Value == "full") { return "2"; } return "0"; } } // namespace Handler enum ConfigOption { #define OPT_BASE(type, group, enum, json, default) CONFIG_##enum, #include }; #define ENUMDEFINES #include enum ConfigSMCChecks { CONFIG_SMC_NONE, CONFIG_SMC_MTRACK, CONFIG_SMC_FULL, }; enum class LayerType { LAYER_GLOBAL_MAIN, ///< /usr/share/fex-emu/Config.json by default LAYER_MAIN, LAYER_ARGUMENTS, LAYER_GLOBAL_STEAM_APP, LAYER_GLOBAL_APP, LAYER_LOCAL_STEAM_APP, LAYER_LOCAL_APP, LAYER_USER_OVERRIDE, LAYER_ENVIRONMENT, LAYER_TOP, }; template static inline std::optional EnumParser(const ArrayPairType& EnumPairs, const std::string_view View) { uint64_t EnumMask {}; auto Results = std::from_chars(View.data(), View.data() + View.size(), EnumMask); if (Results.ec == std::errc()) { // If the data is a valid number, just pass it through. 
return std::nullopt; } /* Not numeric: treat View as a comma-separated list of option names. Begin is 0 here, so passing End as the substr count selects exactly the first name (or the whole view when there is no comma). */ auto Begin = 0; auto End = View.find_first_of(','); std::string_view Option = View.substr(Begin, End); /* The loop ends on an empty name, so an empty element (for example two consecutive commas) stops parsing and any later names are ignored. */ while (Option.size() != 0) { auto EnumValue = std::find_if(EnumPairs.begin(), EnumPairs.end(), [Option](const PairTypes& Value) -> bool { return Value.first == Option; }); if (EnumValue == EnumPairs.end()) { LogMan::Msg::IFmt("Skipping Unknown option: {}", Option); } else { EnumMask |= FEXCore::ToUnderlying(EnumValue->second); } if (End == std::string::npos) { /* No further comma was found, so the name just handled was the last one. */ break; } Begin = End + 1; End = View.find_first_of(',', Begin); Option = View.substr(Begin, End - Begin); } /* Emit the accumulated mask as text. */ return fextl::fmt::format("{}", EnumMask); } /* StringArrayType: list type used for multi-valued options; the element type is not visible in this view (presumably fextl::string - TODO confirm). */ using StringArrayType = fextl::list; /* detail::ConfigOptionInfo: specialised once per option by DEFINE_METAINFO. Each specialisation exposes Type and a static Default() that declares an extern object named after the option and returns it. OPT_BASE uses the option's own type, OPT_STR uses fextl::string and OPT_STRARRAY uses StringArrayType; for the latter two the extern default is declared as a std::string_view. */ namespace detail { template struct ConfigOptionInfo; #define DEFINE_METAINFO(type, enum, default) \ template<> \ struct ConfigOptionInfo { \ using Type = type; \ static auto Default() { \ extern default; \ return enum; \ } \ }; #define OPT_BASE(type, group, enum, json, default) DEFINE_METAINFO(type, enum, const type enum) #define OPT_STR(group, enum, json, default) DEFINE_METAINFO(fextl::string, enum, const std::string_view enum) #define OPT_STRARRAY(group, enum, json, default) DEFINE_METAINFO(StringArrayType, enum, const std::string_view enum) #include } // namespace detail /* Directory and config-file path accessors. Only declarations are visible here. GetDataDirectory and GetConfigFileLocation default Global to false, whereas GetConfigDirectory and GetApplicationConfig require the caller to pass it explicitly. */ FEX_DEFAULT_VISIBILITY void SetDataDirectory(std::string_view Path, bool Global); FEX_DEFAULT_VISIBILITY void SetConfigDirectory(const std::string_view Path, bool Global); FEX_DEFAULT_VISIBILITY void SetConfigFileLocation(std::string_view Path, bool Global); FEX_DEFAULT_VISIBILITY const fextl::string& GetDataDirectory(bool Global = false); FEX_DEFAULT_VISIBILITY const fextl::string& GetConfigDirectory(bool Global); FEX_DEFAULT_VISIBILITY const fextl::string& GetConfigFileLocation(bool Global = false); FEX_DEFAULT_VISIBILITY fextl::string GetApplicationConfig(const std::string_view Program, bool Global); /* LayerValue: variant over every type a single stored option value may hold - a string, a string array, the fixed-width integer types, or bool. */ using LayerValue = std::variant< fextl::string, StringArrayType, uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t,
uint64_t, int64_t, bool >; using LayerOptions = fextl::unordered_map; /* Layer: abstract base class for one configuration layer. Load() is pure virtual, so derived classes supply it. Options live in OptionMap, which is looked up by ConfigOption; the accessors below check the stored alternative with std::holds_alternative before using std::get. */ class FEX_DEFAULT_VISIBILITY Layer { public: explicit Layer(const LayerType _Type); virtual ~Layer(); virtual void Load() = 0; bool OptionExists(ConfigOption Option) const { return OptionMap.find(Option) != OptionMap.end(); } /* All: returns std::nullopt when Option is absent; otherwise asserts on the stored alternative and returns the address of the value held inside OptionMap. The template arguments are not visible in this view - presumably this overload targets the StringArrayType alternative; TODO confirm. */ std::optional All(ConfigOption Option) { const auto it = OptionMap.find(Option); if (it == OptionMap.end()) { return std::nullopt; } auto& Value = it->second; LOGMAN_THROW_A_FMT(std::holds_alternative(Value), "Tried to get config of invalid type!"); return &std::get(Value); } /* Get: same shape as All() - std::nullopt when absent, otherwise an asserted pointer into OptionMap. Presumably targets the fextl::string alternative; TODO confirm. */ std::optional Get(ConfigOption Option) { const auto it = OptionMap.find(Option); if (it == OptionMap.end()) { return std::nullopt; } auto& Value = it->second; LOGMAN_THROW_A_FMT(std::holds_alternative(Value), "Tried to get config of invalid type!"); return &std::get(Value); } // Set will overwrite the object with a fextl::string without tests. /* operator[] creates the entry when it is missing, so every Set overload both inserts and overwrites. */ void Set(ConfigOption Option, const char* Data) { LOGMAN_THROW_A_FMT(Data != nullptr, "Data can't be null"); OptionMap[Option].emplace(fextl::string(Data)); } void Set(ConfigOption Option, std::string_view Data) { OptionMap[Option].emplace(fextl::string(Data)); } void Set(ConfigOption Option, fextl::string Data) { OptionMap[Option].emplace(std::move(Data)); } /* The optional overload leaves OptionMap untouched when Data holds no value. */ void Set(ConfigOption Option, std::optional Data) { if (Data) { OptionMap[Option].emplace(std::move(*Data)); } } // AppendStrArrayValue will append strings to its StringArrayType. // If the value was previously a different type, then throw an assert. void AppendStrArrayValue(ConfigOption Option, std::string_view Data) { auto it = OptionMap.find(Option); if (it == OptionMap.end()) { // If the option didn't exist as a StringArrayType yet, emplace it.
it = OptionMap.emplace(Option, StringArrayType {}).first; } /* Whether the entry already existed or was just created, assert on its stored alternative before appending Data to it. */ auto& Value = it->second; LOGMAN_THROW_A_FMT(std::holds_alternative(Value), "Tried to get config of invalid type!"); std::get(Value).emplace_back(Data); } /* Erase removes the option from this layer only. */ void Erase(ConfigOption Option) { OptionMap.erase(Option); } LayerType GetLayerType() const { return Type; } const LayerOptions& GetOptionMap() const { return OptionMap; } protected: /* Type is const, so it is fixed for the lifetime of the layer. */ const LayerType Type; LayerOptions OptionMap; }; /* Namespace-level configuration API. Only declarations are visible here, so how these functions combine the registered layers cannot be stated from this view. */ FEX_DEFAULT_VISIBILITY void Initialize(); FEX_DEFAULT_VISIBILITY void Shutdown(); FEX_DEFAULT_VISIBILITY void Load(); FEX_DEFAULT_VISIBILITY void ReloadMetaLayer(); FEX_DEFAULT_VISIBILITY fextl::string FindContainer(); FEX_DEFAULT_VISIBILITY fextl::string FindContainerPrefix(); FEX_DEFAULT_VISIBILITY void AddLayer(fextl::unique_ptr _Layer); FEX_DEFAULT_VISIBILITY bool Exists(ConfigOption Option); FEX_DEFAULT_VISIBILITY std::optional All(ConfigOption Option); template FEX_DEFAULT_VISIBILITY std::optional GetConv(ConfigOption Option); FEX_DEFAULT_VISIBILITY std::optional Get(ConfigOption Option); FEX_DEFAULT_VISIBILITY void Set(ConfigOption Option, std::string_view Data); FEX_DEFAULT_VISIBILITY void Erase(ConfigOption Option); /* Value: holds a single option value of type T in ValueData. The option-taking constructors fill ValueData once, at construction, from the static GetIfExists helper; nothing in this class re-reads the configuration afterwards. */ template class FEX_DEFAULT_VISIBILITY Value { public: // Single value type. template requires (std::is_fundamental_v || std::is_same_v) Value(FEXCore::Config::ConfigOption Option, TT Default) { ValueData = GetIfExists(Option, Default); } template requires (std::is_fundamental_v || std::is_same_v) Value(FEXCore::Config::ConfigOption Option, std::string_view Default) { ValueData = GetIfExists(Option, Default); } /* Both the conversion operator and the fundamental-type call operator return ValueData by value; the string call operator returns a const reference instead. */ operator T() const { return ValueData; } T operator()() const requires (std::is_fundamental_v) { return ValueData; } const fextl::string& operator()() const requires (std::is_same_v) { return ValueData; } /* Direct-value constructor: stores the given value without consulting any ConfigOption. */ Value(T Value) requires (!std::is_same_v) { ValueData = std::move(Value); } // Array value types.
Value(FEXCore::Config::ConfigOption Option, std::string_view) requires (std::is_same_v) { /* The std::string_view parameter is unnamed and unused; ValueData is filled in place by GetListIfExists. */ GetListIfExists(Option, &ValueData); } /* All: returns a mutable reference to the stored list, so callers can modify ValueData through it. */ StringArrayType& All() requires (std::is_same_v) { return ValueData; } private: /* Value-initialised, so it has a defined state before a constructor assigns to it. */ T ValueData {}; /* Lookup helpers are declared only; their definitions are not visible in this view. */ static T GetIfExists(FEXCore::Config::ConfigOption Option, T Default); static T GetIfExists(FEXCore::Config::ConfigOption Option, std::string_view Default); static void GetListIfExists(FEXCore::Config::ConfigOption Option, StringArrayType* List); }; /** * Wrapper around Value that automatically picks the default for the given ConfigOption */ template struct FEX_DEFAULT_VISIBILITY Getter : public Value::Type> { using OptionInfo = detail::ConfigOptionInfo