From da08eda5065f859989703294ffe027641fc6f62b Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Mon, 7 Nov 2022 03:23:46 +0100 Subject: [PATCH] PPCRec: Emit x86 movd for non-AVX + more restructuring --- src/Cafe/CMakeLists.txt | 1 + src/Cafe/HW/Espresso/Recompiler/IML/IML.h | 16 +- .../HW/Espresso/Recompiler/IML/IMLDebug.cpp | 1 + .../Recompiler/IML/IMLRegisterAllocator.cpp | 22 +- .../Recompiler/IML/IMLRegisterAllocator.h | 0 .../HW/Espresso/Recompiler/IML/IMLSegment.h | 83 ++- .../HW/Espresso/Recompiler/PPCRecompiler.cpp | 128 ++-- .../HW/Espresso/Recompiler/PPCRecompiler.h | 95 +-- .../HW/Espresso/Recompiler/PPCRecompilerIml.h | 5 +- .../Recompiler/PPCRecompilerImlGen.cpp | 109 +-- .../Espresso/Recompiler/PPCRecompilerX64.cpp | 633 ++++++++++-------- .../HW/Espresso/Recompiler/PPCRecompilerX64.h | 6 + .../Recompiler/PPCRecompilerX64FPU.cpp | 74 +- 13 files changed, 589 insertions(+), 584 deletions(-) create mode 100644 src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 02f5c7c9..ec36e147 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -76,6 +76,7 @@ add_library(CemuCafe HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp HW/Espresso/Recompiler/IML/IMLOptimizer.cpp HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp + HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IML.h b/src/Cafe/HW/Espresso/Recompiler/IML/IML.h index 4bee5c5d..06f39815 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IML.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IML.h @@ -16,16 +16,16 @@ void IMLAnalyzer_GetCRTracking(IMLInstruction* imlInstruction, PPCRecCRTracking_ // optimizer passes // todo - rename -bool PPCRecompiler_reduceNumberOfFPRRegisters(ppcImlGenContext_t* ppcImlGenContext); -bool PPCRecompiler_manageFPRRegisters(ppcImlGenContext_t* ppcImlGenContext); -void PPCRecompiler_removeRedundantCRUpdates(ppcImlGenContext_t* ppcImlGenContext); -void PPCRecompiler_optimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext); -void PPCRecompiler_optimizeDirectIntegerCopies(ppcImlGenContext_t* ppcImlGenContext); -void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext); -void PPCRecompiler_reorderConditionModifyInstructions(ppcImlGenContext_t* ppcImlGenContext); +bool PPCRecompiler_reduceNumberOfFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext); +bool PPCRecompiler_manageFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext); +void PPCRecompiler_removeRedundantCRUpdates(struct ppcImlGenContext_t* ppcImlGenContext); +void PPCRecompiler_optimizeDirectFloatCopies(struct ppcImlGenContext_t* ppcImlGenContext); +void PPCRecompiler_optimizeDirectIntegerCopies(struct ppcImlGenContext_t* ppcImlGenContext); +void PPCRecompiler_optimizePSQLoadAndStore(struct ppcImlGenContext_t* ppcImlGenContext); +void PPCRecompiler_reorderConditionModifyInstructions(struct ppcImlGenContext_t* ppcImlGenContext); // register allocator -void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext); +void IMLRegisterAllocator_AllocateRegisters(struct ppcImlGenContext_t* ppcImlGenContext); // debug void IMLDebug_DumpSegment(struct IMLSegment* imlSegment, sint32 segmentIndex, bool printLivenessRangeInfo = false); diff --git 
a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp index c64fc513..69d8e1b7 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp @@ -4,6 +4,7 @@ #include "IMLRegisterAllocatorRanges.h" #include "util/helpers/StringBuf.h" +#include "../PPCRecompiler.h" const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml) { diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp index cf309c4e..a75f634f 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp @@ -750,8 +750,8 @@ void _analyzeRangeDataFlow(raLivenessSubrange_t* subrange) void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { - sint16 virtualReg2PhysReg[PPC_REC_MAX_VIRTUAL_GPR]; - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) + sint16 virtualReg2PhysReg[IML_RA_VIRT_REG_COUNT_MAX]; + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) virtualReg2PhysReg[i] = -1; raLiveRangeInfo_t liveInfo; @@ -848,7 +848,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, replaceGpr[f] = -1; continue; } - if (virtualRegister >= PPC_REC_MAX_VIRTUAL_GPR) + if (virtualRegister >= IML_RA_VIRT_REG_COUNT_MAX) assert_dbg(); replaceGpr[f] = virtualReg2PhysReg[virtualRegister]; cemu_assert_debug(replaceGpr[f] >= 0); @@ -860,7 +860,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, } // expire infinite subranges (subranges that cross the segment border) sint32 storeLoadListLength = 0; - raLoadStoreInfo_t loadStoreList[PPC_REC_MAX_VIRTUAL_GPR]; + raLoadStoreInfo_t loadStoreList[IML_RA_VIRT_REG_COUNT_MAX]; for (sint32 f = 0; f < liveInfo.liveRangesCount; f++) { raLivenessSubrange_t* liverange = liveInfo.liveRangeList[f]; @@ -1007,7 +1007,7 @@ bool _isRangeDefined(IMLSegment* imlSegment, sint32 vGPR) void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) { imlSegment->raDistances.reg[i].usageStart = INT_MAX; imlSegment->raDistances.reg[i].usageEnd = INT_MIN; @@ -1027,7 +1027,7 @@ void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, sint32 virtualRegister = gprTracking.gpr[t]; if (virtualRegister < 0) continue; - cemu_assert_debug(virtualRegister < PPC_REC_MAX_VIRTUAL_GPR); + cemu_assert_debug(virtualRegister < IML_RA_VIRT_REG_COUNT_MAX); imlSegment->raDistances.reg[virtualRegister].usageStart = std::min(imlSegment->raDistances.reg[virtualRegister].usageStart, index); // index before/at instruction imlSegment->raDistances.reg[virtualRegister].usageEnd = std::max(imlSegment->raDistances.reg[virtualRegister].usageEnd, index + 1); // index after instruction } @@ -1086,7 +1086,7 @@ raLivenessSubrange_t* PPCRecRA_convertToMappedRanges(ppcImlGenContext_t* ppcImlG void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) { if (_isRangeDefined(imlSegment, i) == false) continue; @@ -1096,8 +1096,8 @@ void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, PPCRecRA_convertToMappedRanges(ppcImlGenContext, 
imlSegment, i, range); } // create lookup table of ranges - raLivenessSubrange_t* vGPR2Subrange[PPC_REC_MAX_VIRTUAL_GPR]; - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) + raLivenessSubrange_t* vGPR2Subrange[IML_RA_VIRT_REG_COUNT_MAX]; + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) { vGPR2Subrange[i] = imlSegment->raInfo.linkedList_perVirtualGPR[i]; #ifdef CEMU_DEBUG_ASSERT @@ -1257,7 +1257,7 @@ void PPCRecRA_checkAndTryExtendRange(ppcImlGenContext_t* ppcImlGenContext, IMLSe void PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries { if (imlSegment->raDistances.reg[i].usageStart == INT_MAX) continue; // not used @@ -1334,7 +1334,7 @@ void PPCRecRA_extendRangesOutOfLoopsV2(ppcImlGenContext_t* ppcImlGenContext) continue; // extend looping ranges into all exits (this allows the data flow analyzer to move stores out of the loop) - for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries + for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries { if (imlSegment->raDistances.reg[i].usageEnd != RA_INTER_RANGE_END) continue; // range not set or does not reach end of segment diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h new file mode 100644 index 00000000..e69de29b diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h index f9fccb0e..1e27d303 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h @@ -1,7 +1,84 @@ #pragma once #include "IMLInstruction.h" -#include "Cafe/HW/Espresso/Recompiler/PPCRecompiler.h" // remove once dependency is gone +#define IML_RA_VIRT_REG_COUNT_MAX 40 // should match PPC_REC_MAX_VIRTUAL_GPR -> todo: Make this dynamic + +struct IMLSegmentPoint +{ + sint32 index; + struct IMLSegment* imlSegment; + IMLSegmentPoint* next; + IMLSegmentPoint* prev; +}; + +struct raLivenessLocation_t +{ + sint32 index; + bool isRead; + bool isWrite; + + raLivenessLocation_t() = default; + + raLivenessLocation_t(sint32 index, bool isRead, bool isWrite) + : index(index), isRead(isRead), isWrite(isWrite) {}; +}; + +struct raLivenessSubrangeLink_t +{ + struct raLivenessSubrange_t* prev; + struct raLivenessSubrange_t* next; +}; + +struct raLivenessSubrange_t +{ + struct raLivenessRange_t* range; + IMLSegment* imlSegment; + IMLSegmentPoint start; + IMLSegmentPoint end; + // dirty state tracking + bool _noLoad; + bool hasStore; + bool hasStoreDelayed; + // next + raLivenessSubrange_t* subrangeBranchTaken; + raLivenessSubrange_t* subrangeBranchNotTaken; + // processing + uint32 lastIterationIndex; + // instruction locations + std::vector list_locations; + // linked list (subranges with same GPR virtual register) + raLivenessSubrangeLink_t link_sameVirtualRegisterGPR; + // linked list (all subranges for this segment) + raLivenessSubrangeLink_t link_segmentSubrangesGPR; +}; + +struct raLivenessRange_t +{ + sint32 virtualRegister; + sint32 physicalRegister; + sint32 name; + 
std::vector list_subranges; +}; + +struct PPCSegmentRegisterAllocatorInfo_t +{ + // analyzer stage + bool isPartOfProcessedLoop{}; // used during loop detection + sint32 lastIterationIndex{}; + // linked lists + raLivenessSubrange_t* linkedList_allSubranges{}; + raLivenessSubrange_t* linkedList_perVirtualGPR[IML_RA_VIRT_REG_COUNT_MAX]{}; +}; + +struct PPCRecVGPRDistances_t +{ + struct _RegArrayEntry + { + sint32 usageStart{}; + sint32 usageEnd{}; + }reg[IML_RA_VIRT_REG_COUNT_MAX]; + bool isProcessed[IML_RA_VIRT_REG_COUNT_MAX]{}; +}; struct IMLSegment { @@ -39,11 +116,9 @@ struct IMLSegment PPCRecVGPRDistances_t raDistances{}; bool raRangeExtendProcessed{}; // segment points - ppcRecompilerSegmentPoint_t* segmentPointList{}; - + IMLSegmentPoint* segmentPointList{}; bool HasSuffixInstruction() const; IMLInstruction* GetLastInstruction(); - }; diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp index 09f10956..6c3cbde3 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp @@ -14,6 +14,8 @@ #include "util/helpers/helpers.h" #include "util/MemMapper/MemMapper.h" +#include "Cafe/HW/Espresso/Recompiler/IML/IML.h" + struct PPCInvalidationRange { MPTR startAddress; @@ -127,6 +129,7 @@ void PPCRecompiler_attemptEnter(PPCInterpreter_t* hCPU, uint32 enterAddress) PPCRecompiler_enter(hCPU, funcPtr); } } +bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext); PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set& entryAddresses, std::vector>& entryPointsOut) { @@ -153,21 +156,27 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t(); ppcRecFunc->ppcAddress = range.startAddress; ppcRecFunc->ppcSize = range.length; + // generate intermediate code ppcImlGenContext_t ppcImlGenContext = { 0 }; bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses); if (compiledSuccessfully == false) { - // todo: Free everything - PPCRecompiler_freeContext(&ppcImlGenContext); delete ppcRecFunc; - return NULL; + return nullptr; } + + // apply passes + if (!PPCRecompiler_ApplyIMLPasses(ppcImlGenContext)) + { + delete ppcRecFunc; + return nullptr; + } + // emit x64 code bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext); if (x64GenerationSuccess == false) { - PPCRecompiler_freeContext(&ppcImlGenContext); return nullptr; } @@ -183,11 +192,82 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP entryPointsOut.emplace_back(ppcEnterOffset, x64Offset); } - - PPCRecompiler_freeContext(&ppcImlGenContext); return ppcRecFunc; } +void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext); + +bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext) +{ + PPCRecompiler_FixLoops(ppcImlGenContext); + + // isolate entry points from function flow (enterable segments must not be the target of any other segment) + // this simplifies logic during register allocation + PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext); + + // if GQRs can be predicted, optimize PSQ load/stores + PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext); + + // count number of used registers + uint32 numLoadedFPRRegisters = 0; + for (uint32 i = 0; i < 255; i++) + { + if (ppcImlGenContext.mappedFPRRegister[i]) + numLoadedFPRRegisters++; + 
} + + // insert name store instructions at the end of each segment but before branch instructions + for (IMLSegment* segIt : ppcImlGenContext.segmentList2) + { + if (segIt->imlList.size() == 0) + continue; // ignore empty segments + // analyze segment for register usage + IMLUsedRegisters registersUsed; + for (sint32 i = 0; i < segIt->imlList.size(); i++) + { + segIt->imlList[i].CheckRegisterUsage(®istersUsed); + sint32 accessedTempReg[5]; + // intermediate FPRs + accessedTempReg[0] = registersUsed.readFPR1; + accessedTempReg[1] = registersUsed.readFPR2; + accessedTempReg[2] = registersUsed.readFPR3; + accessedTempReg[3] = registersUsed.readFPR4; + accessedTempReg[4] = registersUsed.writtenFPR1; + for (sint32 f = 0; f < 5; f++) + { + if (accessedTempReg[f] == -1) + continue; + uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]]; + if (regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0 + 32) + { + segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true; + } + } + } + } + + // merge certain float load+store patterns (must happen before FPR register remapping) + PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext); + // delay byte swapping for certain load+store patterns + PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext); + + if (numLoadedFPRRegisters > 0) + { + if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false) + { + return false; + } + } + + IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext); + + // remove redundant name load and store instructions + PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext); + PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext); + + return true; +} + bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector>& entryPoints) { // update jump table @@ -511,42 +591,6 @@ void PPCRecompiler_init() PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize()); PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize()); - // init x64 recompiler instance data - ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL; - ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL; - ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL; - ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL; - ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL; - ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL; - ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL); - ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL; - ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL); - ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL); - ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31); - ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF; - ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF; - ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF; - ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL; - ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL; - ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0; - ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0; - ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0; - 
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0; - ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f; - ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f; - ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f; - ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f; - *(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000; - *(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000; - ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000; - // setup GQR scale tables for (uint32 i = 0; i < 32; i++) diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h index 88bd1d94..e943d8d3 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h @@ -25,84 +25,6 @@ struct PPCRecFunction_t }; #include "Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h" - -typedef struct _ppcRecompilerSegmentPoint_t -{ - sint32 index; - struct IMLSegment* imlSegment; - _ppcRecompilerSegmentPoint_t* next; - _ppcRecompilerSegmentPoint_t* prev; -}ppcRecompilerSegmentPoint_t; - -struct raLivenessLocation_t -{ - sint32 index; - bool isRead; - bool isWrite; - - raLivenessLocation_t() = default; - - raLivenessLocation_t(sint32 index, bool isRead, bool isWrite) - : index(index), isRead(isRead), isWrite(isWrite) {}; -}; - -struct raLivenessSubrangeLink_t -{ - struct raLivenessSubrange_t* prev; - struct raLivenessSubrange_t* next; -}; - -struct raLivenessSubrange_t -{ - struct raLivenessRange_t* range; - IMLSegment* imlSegment; - ppcRecompilerSegmentPoint_t start; - ppcRecompilerSegmentPoint_t end; - // dirty state tracking - bool _noLoad; - bool hasStore; - bool hasStoreDelayed; - // next - raLivenessSubrange_t* subrangeBranchTaken; - raLivenessSubrange_t* subrangeBranchNotTaken; - // processing - uint32 lastIterationIndex; - // instruction locations - std::vector list_locations; - // linked list (subranges with same GPR virtual register) - raLivenessSubrangeLink_t link_sameVirtualRegisterGPR; - // linked list (all subranges for this segment) - raLivenessSubrangeLink_t link_segmentSubrangesGPR; -}; - -struct raLivenessRange_t -{ - sint32 virtualRegister; - sint32 physicalRegister; - sint32 name; - std::vector list_subranges; -}; - -struct PPCSegmentRegisterAllocatorInfo_t -{ - // analyzer stage - bool isPartOfProcessedLoop{}; // used during loop detection - sint32 lastIterationIndex{}; - // linked lists - raLivenessSubrange_t* linkedList_allSubranges{}; - raLivenessSubrange_t* linkedList_perVirtualGPR[PPC_REC_MAX_VIRTUAL_GPR]{}; -}; - -struct PPCRecVGPRDistances_t -{ - struct _RegArrayEntry - { - sint32 usageStart{}; - sint32 usageEnd{}; - }reg[PPC_REC_MAX_VIRTUAL_GPR]; - bool isProcessed[PPC_REC_MAX_VIRTUAL_GPR]{}; -}; - #include "Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h" struct IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(struct ppcImlGenContext_t* 
ppcImlGenContext); @@ -140,6 +62,21 @@ struct ppcImlGenContext_t bool modifiesGQR[8]; }tracking; + ~ppcImlGenContext_t() + { + if (imlList) + { + free(imlList); + imlList = nullptr; + } + + for (IMLSegment* imlSegment : segmentList2) + { + delete imlSegment; + } + segmentList2.clear(); + } + // append raw instruction IMLInstruction& emitInst() { @@ -194,8 +131,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)(); #define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1) -// todo - move some of the stuff above into PPCRecompilerInternal.h - // recompiler interface void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress); diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerIml.h b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerIml.h index 1db1963f..3b8783f5 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerIml.h +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerIml.h @@ -2,7 +2,6 @@ #define PPCREC_CR_REG_TEMP 8 // there are only 8 cr registers (0-7) we use the 8th as temporary cr register that is never stored (BDNZ instruction for example) bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext, PPCRecFunction_t* PPCRecFunction, std::set& entryAddresses); -void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext); // todo - move to destructor IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, sint32 shiftBackCount); @@ -10,8 +9,8 @@ IMLInstruction* PPCRecompiler_insertInstruction(IMLSegment* imlSegment, sint32 i void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint32 index, sint32 count); -void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index); -void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint); +void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index); +void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint); // GPR register management uint32 PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false); diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp index b5897032..a1cb6f2e 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp @@ -2933,7 +2933,7 @@ uint32 PPCRecompiler_getPreviousInstruction(ppcImlGenContext_t* ppcImlGenContext return v; } -void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index) +void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index) { segmentPoint->imlSegment = imlSegment; segmentPoint->index = index; @@ -2944,7 +2944,7 @@ void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, imlSegment->segmentPointList = segmentPoint; } -void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint) +void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint) { if (segmentPoint->prev) segmentPoint->prev->next = segmentPoint->next; @@ -2975,7 +2975,7 @@ void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, // update position of segment points if (imlSegment->segmentPointList) { - 
ppcRecompilerSegmentPoint_t* segmentPoint = imlSegment->segmentPointList; + IMLSegmentPoint* segmentPoint = imlSegment->segmentPointList; while (segmentPoint) { if (segmentPoint->index != RA_INTER_RANGE_START && segmentPoint->index != RA_INTER_RANGE_END) @@ -3017,21 +3017,6 @@ void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint3 ppcImlGenContext->segmentList2[index + i] = new IMLSegment(); } -void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext) -{ - if (ppcImlGenContext->imlList) - { - free(ppcImlGenContext->imlList); - ppcImlGenContext->imlList = nullptr; - } - - for (IMLSegment* imlSegment : ppcImlGenContext->segmentList2) - { - delete imlSegment; - } - ppcImlGenContext->segmentList2.clear(); -} - bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext) { bool unsupportedInstructionFound = false; @@ -3953,9 +3938,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext ppcImlGenContext.ppcAddressOfCurrentInstruction = 0; // reset current instruction offset (any future generated IML instruction will be assigned to ppc address 0) if( unsupportedInstructionCount > 0 || unsupportedInstructionFound ) { - // could not compile function debug_printf("Failed recompile due to unknown instruction at 0x%08x\n", unsupportedInstructionLastOffset); - PPCRecompiler_freeContext(&ppcImlGenContext); return false; } // optimize unused jumpmarks away @@ -4260,16 +4243,20 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext segIt->imlList[0].op_macro.param = cycleCount; } } + return true; +} +void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext) +{ // find segments that have a (conditional) jump instruction that points in reverse direction of code flow // for these segments there is a risk that the recompiler could get trapped in an infinite busy loop. // todo: We should do a loop-detection prepass where we flag segments that are actually in a loop. We can then use this information below to avoid generating the scheduler-exit code for segments that aren't actually in a loop despite them referencing an earlier segment (which could be an exit segment for example) uint32 currentLoopEscapeJumpMarker = 0xFF000000; // start in an area where no valid code can be located - for(size_t s=0; sppcAddrMin which isn't really reliable. (We already had a problem where function inlining would generate falsified segment ranges by omitting the branch instruction). Find a better solution (use jumpmark/enterable offsets?) 
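	// sketch of the transformation below, inferred from the relinking calls
	// (annotation for clarity, not present in the original sources):
	//   P0 - new head segment; all former predecessors of the original segment
	//        are relinked to it and it performs the remaining-cycles check
	//   P1 - branch-not-taken target of P0; escape segment that leaves the
	//        recompiler through the loop-escape jumpmark so the scheduler can run
	//   P2 - the original segment body
	// a separate enterable segment is appended afterwards, since enterable
	// segments must not be branch targets (see PPCRecompilerIML_isolateEnterableSegments)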
IMLSegment* imlSegment = ppcImlGenContext.segmentList2[s]; - if( imlSegment->imlList.empty() ) + if (imlSegment->imlList.empty()) continue; if (imlSegment->imlList[imlSegment->imlList.size() - 1].type != PPCREC_IML_TYPE_CJUMP || imlSegment->imlList[imlSegment->imlList.size() - 1].op_conditionalJump.jumpmarkAddress > imlSegment->ppcAddrMin) continue; @@ -4289,12 +4276,12 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext PPCRecompilerIml_insertSegments(&ppcImlGenContext, s, 2); imlSegment = NULL; - IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s+0]; - IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s+1]; - IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s+2]; + IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s + 0]; + IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s + 1]; + IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s + 2]; // create entry point segment PPCRecompilerIml_insertSegments(&ppcImlGenContext, ppcImlGenContext.segmentList2.size(), 1); - IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size()-1]; + IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size() - 1]; // relink segments IMLSegment_RelinkInputSegment(imlSegmentP2, imlSegmentP0); IMLSegment_SetLinkBranchNotTaken(imlSegmentP0, imlSegmentP1); @@ -4322,7 +4309,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext imlSegmentP2->ppcAddrMin = 0; imlSegmentP2->ppcAddrMax = 0; // setup enterable segment - if( enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF ) + if (enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF) { imlSegmentPEntry->isEnterable = true; imlSegmentPEntry->ppcAddress = enterPPCAddress; @@ -4353,70 +4340,4 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext // skip the newly created segments s += 2; } - - // isolate entry points from function flow (enterable segments must not be the target of any other segment) - // this simplifies logic during register allocation - PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext); - - // if GQRs can be predicted, optimize PSQ load/stores - PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext); - - // count number of used registers - uint32 numLoadedFPRRegisters = 0; - for(uint32 i=0; i<255; i++) - { - if( ppcImlGenContext.mappedFPRRegister[i] ) - numLoadedFPRRegisters++; - } - - // insert name store instructions at the end of each segment but before branch instructions - for (IMLSegment* segIt : ppcImlGenContext.segmentList2) - { - if(segIt->imlList.size() == 0 ) - continue; // ignore empty segments - // analyze segment for register usage - IMLUsedRegisters registersUsed; - for(sint32 i=0; iimlList.size(); i++) - { - segIt->imlList[i].CheckRegisterUsage(®istersUsed); - sint32 accessedTempReg[5]; - // intermediate FPRs - accessedTempReg[0] = registersUsed.readFPR1; - accessedTempReg[1] = registersUsed.readFPR2; - accessedTempReg[2] = registersUsed.readFPR3; - accessedTempReg[3] = registersUsed.readFPR4; - accessedTempReg[4] = registersUsed.writtenFPR1; - for(sint32 f=0; f<5; f++) - { - if( accessedTempReg[f] == -1 ) - continue; - uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]]; - if( regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0+32 ) - { - segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true; - } - } - } - } - - // merge certain float load+store patterns (must happen before FPR register 
remapping) - PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext); - // delay byte swapping for certain load+store patterns - PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext); - - if (numLoadedFPRRegisters > 0) - { - if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false) - { - PPCRecompiler_freeContext(&ppcImlGenContext); - return false; - } - } - - IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext); - - // remove redundant name load and store instructions - PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext); - PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext); - return true; -} +} \ No newline at end of file diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp index 7d14dbf7..58def064 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.cpp @@ -8,6 +8,11 @@ #include "util/MemMapper/MemMapper.h" #include "Common/cpu_features.h" +bool s_hasLZCNTSupport = false; +bool s_hasMOVBESupport = false; +bool s_hasBMI2Support = false; +bool s_hasAVXSupport = false; + sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping { REG_RAX, REG_RDX, REG_RBX, REG_RBP, REG_RSI, REG_RDI, REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_RCX @@ -351,152 +356,143 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p sint32 realRegisterMem2 = PPC_REC_INVALID_REGISTER; if( indexed ) realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2); - if( false )//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS ) + if( indexed && realRegisterMem == realRegisterMem2 ) { - // load u8/u16/u32 via direct memory access + optional sign extend - assert_dbg(); // todo + return false; } - else + if( indexed && realRegisterData == realRegisterMem2 ) { - if( indexed && realRegisterMem == realRegisterMem2 ) - { - return false; - } - if( indexed && realRegisterData == realRegisterMem2 ) - { - // for indexed memory access realRegisterData must not be the same register as the second memory register, - // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 - sint32 temp = realRegisterMem; - realRegisterMem = realRegisterMem2; - realRegisterMem2 = temp; - } + // for indexed memory access realRegisterData must not be the same register as the second memory register, + // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 + sint32 temp = realRegisterMem; + realRegisterMem = realRegisterMem2; + realRegisterMem2 = temp; + } - bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; - bool switchEndian = imlInstruction->op_storeLoad.flags2.swapEndian; - if( imlInstruction->op_storeLoad.copyWidth == 32 ) + bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; + bool switchEndian = imlInstruction->op_storeLoad.flags2.swapEndian; + if( imlInstruction->op_storeLoad.copyWidth == 32 ) + { + //if( indexed ) + // PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + if (indexed) + { + x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); + } + if( IMLBackendX64_HasExtensionMOVBE() && switchEndian ) { - //if( indexed ) - // PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); if (indexed) { - 
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); - } - if( g_CPUFeatures.x86.movbe && switchEndian ) - { - if (indexed) - { - x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); - //if (indexed && realRegisterMem != realRegisterData) - // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else - { - x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - } + x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); + //if (indexed && realRegisterMem != realRegisterData) + // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else - { - if (indexed) - { - x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); - //if (realRegisterMem != realRegisterData) - // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - if (switchEndian) - x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); - } - else - { - x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if (switchEndian) - x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); - } - } - } - else if( imlInstruction->op_storeLoad.copyWidth == 16 ) - { - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); // todo: We can avoid this if MOVBE is available - if (indexed) - { - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - if( g_CPUFeatures.x86.movbe && switchEndian ) - { - x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else - { - x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - if( switchEndian ) - x64Gen_rol_reg64Low16_imm8(x64GenContext, realRegisterData, 8); - } - if( signExtend ) - x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData); - else - x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData); - } - else if( imlInstruction->op_storeLoad.copyWidth == 8 ) - { - if( indexed ) - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - // todo: Optimize by using only MOVZX/MOVSX - if( indexed ) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - // todo: Use sign extend move from memory instead of separate sign-extend? 
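	// (note: x86 MOVSX/MOVZX can fold the load and the extension into a single
	// instruction, e.g. `movsx eax, byte ptr [r13+rcx+disp]`; the operands in
	// this example are purely illustrative)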
- if( signExtend ) - x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - else - x64Emit_movZX_reg32_mem8(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_LOAD_LWARX_MARKER ) - { - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - if( imlInstruction->op_storeLoad.immS32 != 0 ) - assert_dbg(); // not supported - if( indexed ) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), realRegisterMem); // remember EA for reservation - x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - if( switchEndian ) - x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); - x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), realRegisterData); // remember value for reservation - // LWARX instruction costs extra cycles (this speeds up busy loops) - x64Gen_sub_mem32reg64_imm32(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 20); - } - else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_LSWI_3 ) - { - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - if( switchEndian == false ) - assert_dbg(); - if( indexed ) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move) - if( g_CPUFeatures.x86.movbe ) { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + } + } + else + { + if (indexed) + { + x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); + //if (realRegisterMem != realRegisterData) + // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + if (switchEndian) + x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); } else { x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); - if( indexed && realRegisterMem != realRegisterData ) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); + if (switchEndian) + x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); } - x64Gen_and_reg64Low32_imm32(x64GenContext, realRegisterData, 0xFFFFFF00); + } + } + else if( imlInstruction->op_storeLoad.copyWidth == 16 ) + { + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); // todo: We can avoid this if MOVBE is available + if (indexed) + { + 
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + } + if(IMLBackendX64_HasExtensionMOVBE() && switchEndian ) + { + x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else - return false; - return true; + { + x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + if( switchEndian ) + x64Gen_rol_reg64Low16_imm8(x64GenContext, realRegisterData, 8); + } + if( signExtend ) + x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData); + else + x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData); } - return false; + else if( imlInstruction->op_storeLoad.copyWidth == 8 ) + { + if( indexed ) + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + // todo: Optimize by using only MOVZX/MOVSX + if( indexed ) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + // todo: Use sign extend move from memory instead of separate sign-extend? + if( signExtend ) + x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + else + x64Emit_movZX_reg32_mem8(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + } + else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_LOAD_LWARX_MARKER ) + { + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + if( imlInstruction->op_storeLoad.immS32 != 0 ) + assert_dbg(); // not supported + if( indexed ) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), realRegisterMem); // remember EA for reservation + x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + if( switchEndian ) + x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); + x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), realRegisterData); // remember value for reservation + // LWARX instruction costs extra cycles (this speeds up busy loops) + x64Gen_sub_mem32reg64_imm32(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 20); + } + else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_LSWI_3 ) + { + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + if( switchEndian == false ) + assert_dbg(); + if( indexed ) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the 
SUB instruction after the move) + if(IMLBackendX64_HasExtensionMOVBE()) + { + x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + } + else + { + x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); + if( indexed && realRegisterMem != realRegisterData ) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); + } + x64Gen_and_reg64Low32_imm32(x64GenContext, realRegisterData, 0xFFFFFF00); + } + else + return false; + return true; } /* @@ -510,169 +506,160 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction, if (indexed) realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2); - if (false)//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS ) + if (indexed && realRegisterMem == realRegisterMem2) { - // load u8/u16/u32 via direct memory access + optional sign extend - assert_dbg(); // todo + return false; } - else + if (indexed && realRegisterData == realRegisterMem2) { - if (indexed && realRegisterMem == realRegisterMem2) - { - return false; - } - if (indexed && realRegisterData == realRegisterMem2) - { - // for indexed memory access realRegisterData must not be the same register as the second memory register, - // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 - sint32 temp = realRegisterMem; - realRegisterMem = realRegisterMem2; - realRegisterMem2 = temp; - } + // for indexed memory access realRegisterData must not be the same register as the second memory register, + // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 + sint32 temp = realRegisterMem; + realRegisterMem = realRegisterMem2; + realRegisterMem2 = temp; + } - bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; - bool swapEndian = imlInstruction->op_storeLoad.flags2.swapEndian; - if (imlInstruction->op_storeLoad.copyWidth == 32) - { - if (indexed) - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - uint32 valueRegister; - if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData) - { - valueRegister = realRegisterData; - } - else - { - x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - valueRegister = REG_RESV_TEMP; - } - if (g_CPUFeatures.x86.movbe == false && swapEndian) - x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister); - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - if (g_CPUFeatures.x86.movbe && swapEndian) - x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); - else - x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); - if (indexed) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else if (imlInstruction->op_storeLoad.copyWidth == 16) - { - if (indexed || swapEndian) - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - 
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - if (swapEndian) - x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); - if (indexed) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - // todo: Optimize this, e.g. by using MOVBE - } - else if (imlInstruction->op_storeLoad.copyWidth == 8) - { - if (indexed) - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - if (indexed && realRegisterMem == realRegisterData) - { - x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - realRegisterData = REG_RESV_TEMP; - } - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, realRegisterData); - if (indexed) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STWCX_MARKER) - { + bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; + bool swapEndian = imlInstruction->op_storeLoad.flags2.swapEndian; + if (imlInstruction->op_storeLoad.copyWidth == 32) + { + if (indexed) PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - if (imlInstruction->op_storeLoad.immS32 != 0) - assert_dbg(); // todo - // reset cr0 LT, GT and EQ - sint32 crRegister = 0; - x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_LT), 0); - x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_GT), 0); - x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ), 0); - // calculate effective address - x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - if (swapEndian) - x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - // realRegisterMem now holds EA - x64Gen_cmp_reg64Low32_mem32reg64(x64GenContext, realRegisterMem, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemAddr)); - sint32 jumpInstructionOffsetJumpToEnd = x64GenContext->codeBufferIndex; - x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NOT_EQUAL, 0); - // EA matches reservation - // backup EAX (since it's an explicit operand of CMPXCHG and will be overwritten) - x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), REG_EAX); - // backup REG_RESV_MEMBASE - x64Emit_mov_mem64_reg64(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]), REG_RESV_MEMBASE); - // add mem register to REG_RESV_MEMBASE - x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem); - // load reserved value in EAX - x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemValue)); - // bswap EAX - x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_EAX); - - //x64Gen_lock_cmpxchg_mem32Reg64PlusReg64_reg64(x64GenContext, 
REG_RESV_MEMBASE, realRegisterMem, 0, REG_RESV_TEMP); - x64Gen_lock_cmpxchg_mem32Reg64_reg64(x64GenContext, REG_RESV_MEMBASE, 0, REG_RESV_TEMP); - - x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_EQUAL, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ)); - - // reset reservation - x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), 0); - x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), 0); - - // restore EAX - x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0])); - // restore REG_RESV_MEMBASE - x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_MEMBASE, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2])); - - // copy XER SO to CR0 SO - x64Gen_bt_mem8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, spr.XER), 31); - x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_CARRY, REG_RESV_HCPU, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_SO)); - // end - PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffsetJumpToEnd, x64GenContext->codeBufferIndex); - } - else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_2) + uint32 valueRegister; + if ((swapEndian == false || IMLBackendX64_HasExtensionMOVBE()) && realRegisterMem != realRegisterData) { - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16); // store upper 2 bytes .. - x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // .. 
as big-endian - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - - x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); - if (indexed) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - } - else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_3) - { - PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); - x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); - if (indexed) - x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); - - x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8); - x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 2, REG_RESV_TEMP); - x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8); - x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 1, REG_RESV_TEMP); - x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8); - x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 0, REG_RESV_TEMP); - - if (indexed) - x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + valueRegister = realRegisterData; } else - return false; - return true; + { + x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); + valueRegister = REG_RESV_TEMP; + } + if (!IMLBackendX64_HasExtensionMOVBE() && swapEndian) + x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister); + if (indexed) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + if (IMLBackendX64_HasExtensionMOVBE() && swapEndian) + x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); + else + x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); + if (indexed) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } - return false; + else if (imlInstruction->op_storeLoad.copyWidth == 16) + { + if (indexed || swapEndian) + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); + if (swapEndian) + x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); + if (indexed) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); + if (indexed) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + // todo: Optimize this, e.g. 
by using MOVBE + } + else if (imlInstruction->op_storeLoad.copyWidth == 8) + { + if (indexed) + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + if (indexed && realRegisterMem == realRegisterData) + { + x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); + realRegisterData = REG_RESV_TEMP; + } + if (indexed) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, realRegisterData); + if (indexed) + x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + } + else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STWCX_MARKER) + { + PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); + if (imlInstruction->op_storeLoad.immS32 != 0) + assert_dbg(); // todo + // reset cr0 LT, GT and EQ + sint32 crRegister = 0; + x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_LT), 0); + x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_GT), 0); + x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ), 0); + // calculate effective address + x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); + if (swapEndian) + x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); + if (indexed) + x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); + // realRegisterMem now holds EA + x64Gen_cmp_reg64Low32_mem32reg64(x64GenContext, realRegisterMem, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemAddr)); + sint32 jumpInstructionOffsetJumpToEnd = x64GenContext->codeBufferIndex; + x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NOT_EQUAL, 0); + // EA matches reservation + // backup EAX (since it's an explicit operand of CMPXCHG and will be overwritten) + x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), REG_EAX); + // backup REG_RESV_MEMBASE + x64Emit_mov_mem64_reg64(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]), REG_RESV_MEMBASE); + // add mem register to REG_RESV_MEMBASE + x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem); + // load reserved value in EAX + x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemValue)); + // bswap EAX + x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_EAX); + + //x64Gen_lock_cmpxchg_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, 0, REG_RESV_TEMP); + x64Gen_lock_cmpxchg_mem32Reg64_reg64(x64GenContext, REG_RESV_MEMBASE, 0, REG_RESV_TEMP); + + x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_EQUAL, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ)); + + // reset reservation + x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), 0); + x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), 0); + + // restore EAX + x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0])); + // restore REG_RESV_MEMBASE + 
x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_MEMBASE, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]));
+
+		// copy XER SO to CR0 SO
+		x64Gen_bt_mem8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, spr.XER), 31);
+		x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_CARRY, REG_RESV_HCPU, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_SO));
+		// end
+		PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffsetJumpToEnd, x64GenContext->codeBufferIndex);
+	}
+	else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_2)
+	{
+		PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
+		x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
+		x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16); // store upper 2 bytes ..
+		x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // .. as big-endian
+		if (indexed)
+			x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
+
+		x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
+		if (indexed)
+			x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
+	}
+	else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_3)
+	{
+		PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
+		x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
+		if (indexed)
+			x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
+
+		x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
+		x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 2, REG_RESV_TEMP);
+		x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
+		x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 1, REG_RESV_TEMP);
+		x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
+		x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 0, REG_RESV_TEMP);
+
+		if (indexed)
+			x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
+	}
+	else
+		return false;
+	return true;
 }
 
 bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
@@ -781,7 +768,8 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
 		// count leading zeros
 		PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
 		cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER);
-		if( g_CPUFeatures.x86.lzcnt )
+		// LZCNT instruction (ABM, CPUID.80000001H:ECX.ABM[Bit 5])
+		if(IMLBackendX64_HasExtensionLZCNT())
 		{
 			x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA));
 		}
@@ -1499,12 +1487,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
 	sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA);
 	sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB);
-	if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW)
+	if
(IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SRW) { // use BMI2 SHRX if available x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } - else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW) + else if (IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SLW) { // use BMI2 SHLX if available x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); @@ -2656,4 +2644,79 @@ void PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions() PPCRecompiler_leaveRecompilerCode_unvisited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode(); PPCRecompiler_leaveRecompilerCode_visited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode(); cemu_assert_debug(PPCRecompiler_leaveRecompilerCode_unvisited != PPCRecompiler_leaveRecompilerCode_visited); -} \ No newline at end of file +} + +bool IMLBackendX64_HasExtensionLZCNT() +{ + return s_hasLZCNTSupport; +} + +bool IMLBackendX64_HasExtensionMOVBE() +{ + return s_hasMOVBESupport; +} + +bool IMLBackendX64_HasExtensionBMI2() +{ + return s_hasBMI2Support; +} + +bool IMLBackendX64_HasExtensionAVX() +{ + return s_hasAVXSupport; +} + +void IMLBackendX64_Init() +{ + // init x64 recompiler instance data + ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL; + ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL; + ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL; + ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL; + ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL; + ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL; + ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL); + ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL; + ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL); + ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL); + ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31); + ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF; + ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF; + ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF; + ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL; + ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL; + ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0; + ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0; + ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0; + ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0; + ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f; + ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f; + ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f; + ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f; + *(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000; + *(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000; + ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000; + ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000; + ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000; + ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000; + ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000; + 
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
+	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
+	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
+
+	// mxcsr
+	ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
+	ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
+
+	// query processor extensions
+	int cpuInfo[4];
+	cpuid(cpuInfo, 0x80000001);
+	s_hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
+	cpuid(cpuInfo, 0x1);
+	s_hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
+	s_hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
+	cpuidex(cpuInfo, 0x7, 0);
+	s_hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
+
+	forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s%s", s_hasLZCNTSupport ? "LZCNT " : "", s_hasMOVBESupport ? "MOVBE " : "", s_hasAVXSupport ? "AVX " : "", s_hasBMI2Support ? "BMI2 " : "");
+}
diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.h b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.h
index 3df2b761..ebfc55c9 100644
--- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.h
+++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64.h
@@ -131,6 +131,12 @@ enum
 #define PPC_X64_GPR_USABLE_REGISTERS (16-4)
 #define PPC_X64_FPR_USABLE_REGISTERS (16-1) // Use XMM0 - XMM14, XMM15 is the temp register
 
+void IMLBackendX64_Init();
+
+bool IMLBackendX64_HasExtensionLZCNT();
+bool IMLBackendX64_HasExtensionMOVBE();
+bool IMLBackendX64_HasExtensionBMI2();
+bool IMLBackendX64_HasExtensionAVX();
 
 bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext);
 
diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp
index 618c51a2..b39d31c0 100644
--- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp
+++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerX64FPU.cpp
@@ -87,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
 	{
 		x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
 		x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
-		if (g_CPUFeatures.x86.movbe)
+		if (IMLBackendX64_HasExtensionMOVBE())
 		{
 			x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
 		}
@@ -99,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
 	}
 	else
 	{
-		if (g_CPUFeatures.x86.movbe)
+		if (IMLBackendX64_HasExtensionMOVBE())
 		{
 			x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
 		}
@@ -109,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
 			x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
 		}
 	}
-	if (g_CPUFeatures.x86.avx)
+	if (IMLBackendX64_HasExtensionAVX())
 	{
 		x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
 	}
@@ -281,29 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
 	{
 		x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2);
 		x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem);
-		if( g_CPUFeatures.x86.movbe )
+		if(IMLBackendX64_HasExtensionMOVBE())
 			x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
 		else
 			x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE,
REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); } else { - if( g_CPUFeatures.x86.movbe ) + if(IMLBackendX64_HasExtensionMOVBE()) x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); else x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); } - if( g_CPUFeatures.x86.movbe == false ) + if(IMLBackendX64_HasExtensionMOVBE() == false ) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); - if( g_CPUFeatures.x86.avx ) - { - x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP); - } - else - { - x64Emit_mov_mem32_reg64(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR), REG_RESV_TEMP); - x64Gen_movddup_xmmReg_memReg64(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - } + x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP); if (imlInstruction->op_storeLoad.flags2.notExpanded) { @@ -317,7 +309,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio } else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 ) { - if( g_CPUFeatures.x86.avx ) + if( IMLBackendX64_HasExtensionAVX() ) { if( indexed ) { @@ -420,23 +412,15 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0) { x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM); - if (g_CPUFeatures.x86.avx) - { - x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); - } - else - { - x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - } - if (g_CPUFeatures.x86.movbe == false) + x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); + if (IMLBackendX64_HasExtensionMOVBE() == false) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); if (indexed) { cemu_assert_debug(memReg != memRegEx); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx); } - if (g_CPUFeatures.x86.movbe) + if (IMLBackendX64_HasExtensionMOVBE()) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); @@ -605,30 +589,14 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti if (imlInstruction->op_storeLoad.flags2.notExpanded) { // value is already in single format - if (g_CPUFeatures.x86.avx) - { - x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); - } - else - { - x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - } + x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); } else { x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM); - if (g_CPUFeatures.x86.avx) - { - x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); - } - else - { - x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - x64Emit_mov_reg64_mem32(x64GenContext, 
REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - } + x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP); } - if( g_CPUFeatures.x86.movbe == false ) + if(IMLBackendX64_HasExtensionMOVBE() == false ) x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); if( indexed ) { @@ -636,7 +604,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti assert_dbg(); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } - if( g_CPUFeatures.x86.movbe ) + if(IMLBackendX64_HasExtensionMOVBE()) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); @@ -669,15 +637,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti } else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 ) { - if( g_CPUFeatures.x86.avx ) - { - x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); - } - else - { - x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR)); - } + x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); if( indexed ) { @@ -1057,7 +1017,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti { x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB); } - else if (g_CPUFeatures.x86.avx) + else if (IMLBackendX64_HasExtensionAVX()) { x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB); }
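A note on the endian handling in the integer and FPU store paths above: when MOVBE is available the byte swap is folded into the memory access itself, otherwise the value is swapped in a register with BSWAP (or ROL for 16-bit halves) before a plain MOV. Both sequences must produce the same guest-visible result. A minimal C++ reference for that result, assuming only that guest memory is big-endian; byteswap32 and storeGuestU32 are illustrative helpers, not Cemu functions:

#include <cstdint>
#include <cstring>

// Mirrors what BSWAP r32 does; MOVBE performs the same swap as part of the store.
static uint32_t byteswap32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) | (v << 24);
}

// Reference semantics of the emitted 32-bit guest store: both the MOVBE path
// and the BSWAP+MOV path write 'value' big-endian at memBase+ea.
void storeGuestU32(uint8_t* memBase, uint32_t ea, uint32_t value)
{
	uint32_t be = byteswap32(value);
	std::memcpy(memBase + ea, &be, sizeof(be));
}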
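The PPC_REC_STORE_STWCX_MARKER block emulates PowerPC store-conditional: it bails out early if the effective address no longer matches the recorded reservation, lets LOCK CMPXCHG atomically confirm that memory still holds the value captured at reservation time, and mirrors the compare result into CR0.EQ via SETcc. The success condition reduces to a compare-and-swap; a compact sketch in portable C++, where storeConditional is an illustrative name and not part of the codebase:

#include <atomic>
#include <cstdint>

// 'reserved' is the word value recorded when the reservation was taken.
// compare_exchange_strong mirrors the emitted LOCK CMPXCHG; its boolean
// result corresponds to the SETcc that feeds CR0.EQ.
bool storeConditional(std::atomic<uint32_t>* ea, uint32_t reserved, uint32_t newValue)
{
	return ea->compare_exchange_strong(reserved, newValue);
}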
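The bit tests in IMLBackendX64_Init follow the standard CPUID layout: LZCNT is reported in CPUID.80000001H:ECX[5] (the ABM flag), MOVBE in CPUID.01H:ECX[22], AVX in CPUID.01H:ECX[28], and BMI2 in CPUID.(EAX=07H,ECX=0):EBX[8]. A standalone sketch of the same queries, assuming MSVC's __cpuid/__cpuidex intrinsics (the cpuid/cpuidex helpers used in the patch are Cemu-side wrappers):

#include <intrin.h> // MSVC; GCC/Clang would use <cpuid.h> instead
#include <cstdio>

int main()
{
	int info[4]; // EAX, EBX, ECX, EDX
	__cpuid(info, 0x80000001);
	const bool lzcnt = ((info[2] >> 5) & 1) != 0;  // ABM/LZCNT
	__cpuid(info, 0x1);
	const bool movbe = ((info[2] >> 22) & 1) != 0; // MOVBE
	const bool avx = ((info[2] >> 28) & 1) != 0;   // AVX (full usability also requires the OSXSAVE/XGETBV check, omitted here)
	__cpuidex(info, 0x7, 0);
	const bool bmi2 = ((info[1] >> 8) & 1) != 0;   // BMI2 (SHLX/SHRX, RORX, ...)
	printf("LZCNT=%d MOVBE=%d AVX=%d BMI2=%d\n", lzcnt, movbe, avx, bmi2);
	return 0;
}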
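Finally, the reason the AVX guards around the GPR-to-XMM transfers could be dropped: MOVD between a general-purpose register and an XMM register is an SSE2 instruction and therefore part of the x86-64 baseline; only the VEX-encoded VMOVD requires AVX. The removed fallback instead spilled the value through the temporaryFPR slot in memory. An intrinsics sketch of the register-direct transfer (compilers typically lower these to movd):

#include <emmintrin.h> // SSE2, guaranteed on any x86-64 CPU
#include <cstdint>

// GPR -> XMM, zero-extending into the upper lanes (like MOVD xmm, r32)
__m128i gprToXmm(uint32_t v)
{
	return _mm_cvtsi32_si128((int)v);
}

// XMM -> GPR, taking the low 32 bits (like MOVD r32, xmm)
uint32_t xmmToGpr(__m128i x)
{
	return (uint32_t)_mm_cvtsi128_si32(x);
}

Emitting movd unconditionally also avoids the store-to-load round-trip through PPCInterpreter_t::temporaryFPR on the hot FPU load/store paths, which is presumably the motivation for this cleanup.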