PPCRec: Emit x86 movd for non-AVX + more restructuring

This commit is contained in:
Exzap 2022-11-07 03:23:46 +01:00
parent 411a83799c
commit da08eda506
13 changed files with 589 additions and 584 deletions

View file

@ -76,6 +76,7 @@ add_library(CemuCafe
HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp
HW/Espresso/Recompiler/IML/IMLOptimizer.cpp HW/Espresso/Recompiler/IML/IMLOptimizer.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h
HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h
HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp

View file

@ -16,16 +16,16 @@ void IMLAnalyzer_GetCRTracking(IMLInstruction* imlInstruction, PPCRecCRTracking_
// optimizer passes // optimizer passes
// todo - rename // todo - rename
bool PPCRecompiler_reduceNumberOfFPRRegisters(ppcImlGenContext_t* ppcImlGenContext); bool PPCRecompiler_reduceNumberOfFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
bool PPCRecompiler_manageFPRRegisters(ppcImlGenContext_t* ppcImlGenContext); bool PPCRecompiler_manageFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_removeRedundantCRUpdates(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_removeRedundantCRUpdates(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_optimizeDirectFloatCopies(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectIntegerCopies(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_optimizeDirectIntegerCopies(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_optimizePSQLoadAndStore(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_reorderConditionModifyInstructions(ppcImlGenContext_t* ppcImlGenContext); void PPCRecompiler_reorderConditionModifyInstructions(struct ppcImlGenContext_t* ppcImlGenContext);
// register allocator // register allocator
void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext); void IMLRegisterAllocator_AllocateRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
// debug // debug
void IMLDebug_DumpSegment(struct IMLSegment* imlSegment, sint32 segmentIndex, bool printLivenessRangeInfo = false); void IMLDebug_DumpSegment(struct IMLSegment* imlSegment, sint32 segmentIndex, bool printLivenessRangeInfo = false);

View file

@ -4,6 +4,7 @@
#include "IMLRegisterAllocatorRanges.h" #include "IMLRegisterAllocatorRanges.h"
#include "util/helpers/StringBuf.h" #include "util/helpers/StringBuf.h"
#include "../PPCRecompiler.h"
const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml) const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml)
{ {

View file

@ -750,8 +750,8 @@ void _analyzeRangeDataFlow(raLivenessSubrange_t* subrange)
void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{ {
sint16 virtualReg2PhysReg[PPC_REC_MAX_VIRTUAL_GPR]; sint16 virtualReg2PhysReg[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
virtualReg2PhysReg[i] = -1; virtualReg2PhysReg[i] = -1;
raLiveRangeInfo_t liveInfo; raLiveRangeInfo_t liveInfo;
@ -848,7 +848,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext,
replaceGpr[f] = -1; replaceGpr[f] = -1;
continue; continue;
} }
if (virtualRegister >= PPC_REC_MAX_VIRTUAL_GPR) if (virtualRegister >= IML_RA_VIRT_REG_COUNT_MAX)
assert_dbg(); assert_dbg();
replaceGpr[f] = virtualReg2PhysReg[virtualRegister]; replaceGpr[f] = virtualReg2PhysReg[virtualRegister];
cemu_assert_debug(replaceGpr[f] >= 0); cemu_assert_debug(replaceGpr[f] >= 0);
@ -860,7 +860,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext,
} }
// expire infinite subranges (subranges that cross the segment border) // expire infinite subranges (subranges that cross the segment border)
sint32 storeLoadListLength = 0; sint32 storeLoadListLength = 0;
raLoadStoreInfo_t loadStoreList[PPC_REC_MAX_VIRTUAL_GPR]; raLoadStoreInfo_t loadStoreList[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 f = 0; f < liveInfo.liveRangesCount; f++) for (sint32 f = 0; f < liveInfo.liveRangesCount; f++)
{ {
raLivenessSubrange_t* liverange = liveInfo.liveRangeList[f]; raLivenessSubrange_t* liverange = liveInfo.liveRangeList[f];
@ -1007,7 +1007,7 @@ bool _isRangeDefined(IMLSegment* imlSegment, sint32 vGPR)
void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{ {
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{ {
imlSegment->raDistances.reg[i].usageStart = INT_MAX; imlSegment->raDistances.reg[i].usageStart = INT_MAX;
imlSegment->raDistances.reg[i].usageEnd = INT_MIN; imlSegment->raDistances.reg[i].usageEnd = INT_MIN;
@ -1027,7 +1027,7 @@ void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext,
sint32 virtualRegister = gprTracking.gpr[t]; sint32 virtualRegister = gprTracking.gpr[t];
if (virtualRegister < 0) if (virtualRegister < 0)
continue; continue;
cemu_assert_debug(virtualRegister < PPC_REC_MAX_VIRTUAL_GPR); cemu_assert_debug(virtualRegister < IML_RA_VIRT_REG_COUNT_MAX);
imlSegment->raDistances.reg[virtualRegister].usageStart = std::min<sint32>(imlSegment->raDistances.reg[virtualRegister].usageStart, index); // index before/at instruction imlSegment->raDistances.reg[virtualRegister].usageStart = std::min<sint32>(imlSegment->raDistances.reg[virtualRegister].usageStart, index); // index before/at instruction
imlSegment->raDistances.reg[virtualRegister].usageEnd = std::max<sint32>(imlSegment->raDistances.reg[virtualRegister].usageEnd, index + 1); // index after instruction imlSegment->raDistances.reg[virtualRegister].usageEnd = std::max<sint32>(imlSegment->raDistances.reg[virtualRegister].usageEnd, index + 1); // index after instruction
} }
@ -1086,7 +1086,7 @@ raLivenessSubrange_t* PPCRecRA_convertToMappedRanges(ppcImlGenContext_t* ppcImlG
void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{ {
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{ {
if (_isRangeDefined(imlSegment, i) == false) if (_isRangeDefined(imlSegment, i) == false)
continue; continue;
@ -1096,8 +1096,8 @@ void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext,
PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment, i, range); PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment, i, range);
} }
// create lookup table of ranges // create lookup table of ranges
raLivenessSubrange_t* vGPR2Subrange[PPC_REC_MAX_VIRTUAL_GPR]; raLivenessSubrange_t* vGPR2Subrange[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{ {
vGPR2Subrange[i] = imlSegment->raInfo.linkedList_perVirtualGPR[i]; vGPR2Subrange[i] = imlSegment->raInfo.linkedList_perVirtualGPR[i];
#ifdef CEMU_DEBUG_ASSERT #ifdef CEMU_DEBUG_ASSERT
@ -1257,7 +1257,7 @@ void PPCRecRA_checkAndTryExtendRange(ppcImlGenContext_t* ppcImlGenContext, IMLSe
void PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) void PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{ {
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{ {
if (imlSegment->raDistances.reg[i].usageStart == INT_MAX) if (imlSegment->raDistances.reg[i].usageStart == INT_MAX)
continue; // not used continue; // not used
@ -1334,7 +1334,7 @@ void PPCRecRA_extendRangesOutOfLoopsV2(ppcImlGenContext_t* ppcImlGenContext)
continue; continue;
// extend looping ranges into all exits (this allows the data flow analyzer to move stores out of the loop) // extend looping ranges into all exits (this allows the data flow analyzer to move stores out of the loop)
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{ {
if (imlSegment->raDistances.reg[i].usageEnd != RA_INTER_RANGE_END) if (imlSegment->raDistances.reg[i].usageEnd != RA_INTER_RANGE_END)
continue; // range not set or does not reach end of segment continue; // range not set or does not reach end of segment

View file

@ -1,7 +1,84 @@
#pragma once #pragma once
#include "IMLInstruction.h" #include "IMLInstruction.h"
#include "Cafe/HW/Espresso/Recompiler/PPCRecompiler.h" // remove once dependency is gone #define IML_RA_VIRT_REG_COUNT_MAX 40 // should match PPC_REC_MAX_VIRTUAL_GPR -> todo: Make this dynamic
struct IMLSegmentPoint
{
sint32 index;
struct IMLSegment* imlSegment;
IMLSegmentPoint* next;
IMLSegmentPoint* prev;
};
struct raLivenessLocation_t
{
sint32 index;
bool isRead;
bool isWrite;
raLivenessLocation_t() = default;
raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
: index(index), isRead(isRead), isWrite(isWrite) {};
};
struct raLivenessSubrangeLink_t
{
struct raLivenessSubrange_t* prev;
struct raLivenessSubrange_t* next;
};
struct raLivenessSubrange_t
{
struct raLivenessRange_t* range;
IMLSegment* imlSegment;
IMLSegmentPoint start;
IMLSegmentPoint end;
// dirty state tracking
bool _noLoad;
bool hasStore;
bool hasStoreDelayed;
// next
raLivenessSubrange_t* subrangeBranchTaken;
raLivenessSubrange_t* subrangeBranchNotTaken;
// processing
uint32 lastIterationIndex;
// instruction locations
std::vector<raLivenessLocation_t> list_locations;
// linked list (subranges with same GPR virtual register)
raLivenessSubrangeLink_t link_sameVirtualRegisterGPR;
// linked list (all subranges for this segment)
raLivenessSubrangeLink_t link_segmentSubrangesGPR;
};
struct raLivenessRange_t
{
sint32 virtualRegister;
sint32 physicalRegister;
sint32 name;
std::vector<raLivenessSubrange_t*> list_subranges;
};
struct PPCSegmentRegisterAllocatorInfo_t
{
// analyzer stage
bool isPartOfProcessedLoop{}; // used during loop detection
sint32 lastIterationIndex{};
// linked lists
raLivenessSubrange_t* linkedList_allSubranges{};
raLivenessSubrange_t* linkedList_perVirtualGPR[IML_RA_VIRT_REG_COUNT_MAX]{};
};
struct PPCRecVGPRDistances_t
{
struct _RegArrayEntry
{
sint32 usageStart{};
sint32 usageEnd{};
}reg[IML_RA_VIRT_REG_COUNT_MAX];
bool isProcessed[IML_RA_VIRT_REG_COUNT_MAX]{};
};
struct IMLSegment struct IMLSegment
{ {
@ -39,11 +116,9 @@ struct IMLSegment
PPCRecVGPRDistances_t raDistances{}; PPCRecVGPRDistances_t raDistances{};
bool raRangeExtendProcessed{}; bool raRangeExtendProcessed{};
// segment points // segment points
ppcRecompilerSegmentPoint_t* segmentPointList{}; IMLSegmentPoint* segmentPointList{};
bool HasSuffixInstruction() const; bool HasSuffixInstruction() const;
IMLInstruction* GetLastInstruction(); IMLInstruction* GetLastInstruction();
}; };

View file

@ -14,6 +14,8 @@
#include "util/helpers/helpers.h" #include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h" #include "util/MemMapper/MemMapper.h"
#include "Cafe/HW/Espresso/Recompiler/IML/IML.h"
struct PPCInvalidationRange struct PPCInvalidationRange
{ {
MPTR startAddress; MPTR startAddress;
@ -127,6 +129,7 @@ void PPCRecompiler_attemptEnter(PPCInterpreter_t* hCPU, uint32 enterAddress)
PPCRecompiler_enter(hCPU, funcPtr); PPCRecompiler_enter(hCPU, funcPtr);
} }
} }
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext);
PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set<uint32>& entryAddresses, std::vector<std::pair<MPTR, uint32>>& entryPointsOut) PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set<uint32>& entryAddresses, std::vector<std::pair<MPTR, uint32>>& entryPointsOut)
{ {
@ -153,21 +156,27 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t(); PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress; ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length; ppcRecFunc->ppcSize = range.length;
// generate intermediate code // generate intermediate code
ppcImlGenContext_t ppcImlGenContext = { 0 }; ppcImlGenContext_t ppcImlGenContext = { 0 };
bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses); bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses);
if (compiledSuccessfully == false) if (compiledSuccessfully == false)
{ {
// todo: Free everything
PPCRecompiler_freeContext(&ppcImlGenContext);
delete ppcRecFunc; delete ppcRecFunc;
return NULL; return nullptr;
} }
// apply passes
if (!PPCRecompiler_ApplyIMLPasses(ppcImlGenContext))
{
delete ppcRecFunc;
return nullptr;
}
// emit x64 code // emit x64 code
bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext); bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext);
if (x64GenerationSuccess == false) if (x64GenerationSuccess == false)
{ {
PPCRecompiler_freeContext(&ppcImlGenContext);
return nullptr; return nullptr;
} }
@ -183,11 +192,82 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
entryPointsOut.emplace_back(ppcEnterOffset, x64Offset); entryPointsOut.emplace_back(ppcEnterOffset, x64Offset);
} }
PPCRecompiler_freeContext(&ppcImlGenContext);
return ppcRecFunc; return ppcRecFunc;
} }
void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext);
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
{
PPCRecompiler_FixLoops(ppcImlGenContext);
// isolate entry points from function flow (enterable segments must not be the target of any other segment)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// count number of used registers
uint32 numLoadedFPRRegisters = 0;
for (uint32 i = 0; i < 255; i++)
{
if (ppcImlGenContext.mappedFPRRegister[i])
numLoadedFPRRegisters++;
}
// insert name store instructions at the end of each segment but before branch instructions
for (IMLSegment* segIt : ppcImlGenContext.segmentList2)
{
if (segIt->imlList.size() == 0)
continue; // ignore empty segments
// analyze segment for register usage
IMLUsedRegisters registersUsed;
for (sint32 i = 0; i < segIt->imlList.size(); i++)
{
segIt->imlList[i].CheckRegisterUsage(&registersUsed);
sint32 accessedTempReg[5];
// intermediate FPRs
accessedTempReg[0] = registersUsed.readFPR1;
accessedTempReg[1] = registersUsed.readFPR2;
accessedTempReg[2] = registersUsed.readFPR3;
accessedTempReg[3] = registersUsed.readFPR4;
accessedTempReg[4] = registersUsed.writtenFPR1;
for (sint32 f = 0; f < 5; f++)
{
if (accessedTempReg[f] == -1)
continue;
uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]];
if (regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0 + 32)
{
segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true;
}
}
}
}
// merge certain float load+store patterns (must happen before FPR register remapping)
PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext);
if (numLoadedFPRRegisters > 0)
{
if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false)
{
return false;
}
}
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext);
// remove redundant name load and store instructions
PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
return true;
}
bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector<std::pair<MPTR, uint32>>& entryPoints) bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector<std::pair<MPTR, uint32>>& entryPoints)
{ {
// update jump table // update jump table
@ -511,42 +591,6 @@ void PPCRecompiler_init()
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize()); PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize()); PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());
// init x64 recompiler instance data
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
// setup GQR scale tables // setup GQR scale tables
for (uint32 i = 0; i < 32; i++) for (uint32 i = 0; i < 32; i++)

View file

@ -25,84 +25,6 @@ struct PPCRecFunction_t
}; };
#include "Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h" #include "Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h"
typedef struct _ppcRecompilerSegmentPoint_t
{
sint32 index;
struct IMLSegment* imlSegment;
_ppcRecompilerSegmentPoint_t* next;
_ppcRecompilerSegmentPoint_t* prev;
}ppcRecompilerSegmentPoint_t;
struct raLivenessLocation_t
{
sint32 index;
bool isRead;
bool isWrite;
raLivenessLocation_t() = default;
raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
: index(index), isRead(isRead), isWrite(isWrite) {};
};
struct raLivenessSubrangeLink_t
{
struct raLivenessSubrange_t* prev;
struct raLivenessSubrange_t* next;
};
struct raLivenessSubrange_t
{
struct raLivenessRange_t* range;
IMLSegment* imlSegment;
ppcRecompilerSegmentPoint_t start;
ppcRecompilerSegmentPoint_t end;
// dirty state tracking
bool _noLoad;
bool hasStore;
bool hasStoreDelayed;
// next
raLivenessSubrange_t* subrangeBranchTaken;
raLivenessSubrange_t* subrangeBranchNotTaken;
// processing
uint32 lastIterationIndex;
// instruction locations
std::vector<raLivenessLocation_t> list_locations;
// linked list (subranges with same GPR virtual register)
raLivenessSubrangeLink_t link_sameVirtualRegisterGPR;
// linked list (all subranges for this segment)
raLivenessSubrangeLink_t link_segmentSubrangesGPR;
};
struct raLivenessRange_t
{
sint32 virtualRegister;
sint32 physicalRegister;
sint32 name;
std::vector<raLivenessSubrange_t*> list_subranges;
};
struct PPCSegmentRegisterAllocatorInfo_t
{
// analyzer stage
bool isPartOfProcessedLoop{}; // used during loop detection
sint32 lastIterationIndex{};
// linked lists
raLivenessSubrange_t* linkedList_allSubranges{};
raLivenessSubrange_t* linkedList_perVirtualGPR[PPC_REC_MAX_VIRTUAL_GPR]{};
};
struct PPCRecVGPRDistances_t
{
struct _RegArrayEntry
{
sint32 usageStart{};
sint32 usageEnd{};
}reg[PPC_REC_MAX_VIRTUAL_GPR];
bool isProcessed[PPC_REC_MAX_VIRTUAL_GPR]{};
};
#include "Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h" #include "Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h"
struct IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(struct ppcImlGenContext_t* ppcImlGenContext); struct IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(struct ppcImlGenContext_t* ppcImlGenContext);
@ -140,6 +62,21 @@ struct ppcImlGenContext_t
bool modifiesGQR[8]; bool modifiesGQR[8];
}tracking; }tracking;
~ppcImlGenContext_t()
{
if (imlList)
{
free(imlList);
imlList = nullptr;
}
for (IMLSegment* imlSegment : segmentList2)
{
delete imlSegment;
}
segmentList2.clear();
}
// append raw instruction // append raw instruction
IMLInstruction& emitInst() IMLInstruction& emitInst()
{ {
@ -194,8 +131,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
#define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1) #define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1)
// todo - move some of the stuff above into PPCRecompilerInternal.h
// recompiler interface // recompiler interface
void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress); void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress);

View file

@ -2,7 +2,6 @@
#define PPCREC_CR_REG_TEMP 8 // there are only 8 cr registers (0-7) we use the 8th as temporary cr register that is never stored (BDNZ instruction for example) #define PPCREC_CR_REG_TEMP 8 // there are only 8 cr registers (0-7) we use the 8th as temporary cr register that is never stored (BDNZ instruction for example)
bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext, PPCRecFunction_t* PPCRecFunction, std::set<uint32>& entryAddresses); bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext, PPCRecFunction_t* PPCRecFunction, std::set<uint32>& entryAddresses);
void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext); // todo - move to destructor
IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(ppcImlGenContext_t* ppcImlGenContext); IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, sint32 shiftBackCount); void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, sint32 shiftBackCount);
@ -10,8 +9,8 @@ IMLInstruction* PPCRecompiler_insertInstruction(IMLSegment* imlSegment, sint32 i
void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint32 index, sint32 count); void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint32 index, sint32 count);
void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index); void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index);
void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint); void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint);
// GPR register management // GPR register management
uint32 PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false); uint32 PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false);

View file

@ -2933,7 +2933,7 @@ uint32 PPCRecompiler_getPreviousInstruction(ppcImlGenContext_t* ppcImlGenContext
return v; return v;
} }
void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index) void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index)
{ {
segmentPoint->imlSegment = imlSegment; segmentPoint->imlSegment = imlSegment;
segmentPoint->index = index; segmentPoint->index = index;
@ -2944,7 +2944,7 @@ void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint,
imlSegment->segmentPointList = segmentPoint; imlSegment->segmentPointList = segmentPoint;
} }
void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint) void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint)
{ {
if (segmentPoint->prev) if (segmentPoint->prev)
segmentPoint->prev->next = segmentPoint->next; segmentPoint->prev->next = segmentPoint->next;
@ -2975,7 +2975,7 @@ void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index,
// update position of segment points // update position of segment points
if (imlSegment->segmentPointList) if (imlSegment->segmentPointList)
{ {
ppcRecompilerSegmentPoint_t* segmentPoint = imlSegment->segmentPointList; IMLSegmentPoint* segmentPoint = imlSegment->segmentPointList;
while (segmentPoint) while (segmentPoint)
{ {
if (segmentPoint->index != RA_INTER_RANGE_START && segmentPoint->index != RA_INTER_RANGE_END) if (segmentPoint->index != RA_INTER_RANGE_START && segmentPoint->index != RA_INTER_RANGE_END)
@ -3017,21 +3017,6 @@ void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint3
ppcImlGenContext->segmentList2[index + i] = new IMLSegment(); ppcImlGenContext->segmentList2[index + i] = new IMLSegment();
} }
void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext)
{
if (ppcImlGenContext->imlList)
{
free(ppcImlGenContext->imlList);
ppcImlGenContext->imlList = nullptr;
}
for (IMLSegment* imlSegment : ppcImlGenContext->segmentList2)
{
delete imlSegment;
}
ppcImlGenContext->segmentList2.clear();
}
bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext) bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
{ {
bool unsupportedInstructionFound = false; bool unsupportedInstructionFound = false;
@ -3953,9 +3938,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
ppcImlGenContext.ppcAddressOfCurrentInstruction = 0; // reset current instruction offset (any future generated IML instruction will be assigned to ppc address 0) ppcImlGenContext.ppcAddressOfCurrentInstruction = 0; // reset current instruction offset (any future generated IML instruction will be assigned to ppc address 0)
if( unsupportedInstructionCount > 0 || unsupportedInstructionFound ) if( unsupportedInstructionCount > 0 || unsupportedInstructionFound )
{ {
// could not compile function
debug_printf("Failed recompile due to unknown instruction at 0x%08x\n", unsupportedInstructionLastOffset); debug_printf("Failed recompile due to unknown instruction at 0x%08x\n", unsupportedInstructionLastOffset);
PPCRecompiler_freeContext(&ppcImlGenContext);
return false; return false;
} }
// optimize unused jumpmarks away // optimize unused jumpmarks away
@ -4260,16 +4243,20 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
segIt->imlList[0].op_macro.param = cycleCount; segIt->imlList[0].op_macro.param = cycleCount;
} }
} }
return true;
}
void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext)
{
// find segments that have a (conditional) jump instruction that points in reverse direction of code flow // find segments that have a (conditional) jump instruction that points in reverse direction of code flow
// for these segments there is a risk that the recompiler could get trapped in an infinite busy loop. // for these segments there is a risk that the recompiler could get trapped in an infinite busy loop.
// todo: We should do a loop-detection prepass where we flag segments that are actually in a loop. We can then use this information below to avoid generating the scheduler-exit code for segments that aren't actually in a loop despite them referencing an earlier segment (which could be an exit segment for example) // todo: We should do a loop-detection prepass where we flag segments that are actually in a loop. We can then use this information below to avoid generating the scheduler-exit code for segments that aren't actually in a loop despite them referencing an earlier segment (which could be an exit segment for example)
uint32 currentLoopEscapeJumpMarker = 0xFF000000; // start in an area where no valid code can be located uint32 currentLoopEscapeJumpMarker = 0xFF000000; // start in an area where no valid code can be located
for(size_t s=0; s<ppcImlGenContext.segmentList2.size(); s++) for (size_t s = 0; s < ppcImlGenContext.segmentList2.size(); s++)
{ {
// todo: This currently uses segment->ppcAddrMin which isn't really reliable. (We already had a problem where function inlining would generate falsified segment ranges by omitting the branch instruction). Find a better solution (use jumpmark/enterable offsets?) // todo: This currently uses segment->ppcAddrMin which isn't really reliable. (We already had a problem where function inlining would generate falsified segment ranges by omitting the branch instruction). Find a better solution (use jumpmark/enterable offsets?)
IMLSegment* imlSegment = ppcImlGenContext.segmentList2[s]; IMLSegment* imlSegment = ppcImlGenContext.segmentList2[s];
if( imlSegment->imlList.empty() ) if (imlSegment->imlList.empty())
continue; continue;
if (imlSegment->imlList[imlSegment->imlList.size() - 1].type != PPCREC_IML_TYPE_CJUMP || imlSegment->imlList[imlSegment->imlList.size() - 1].op_conditionalJump.jumpmarkAddress > imlSegment->ppcAddrMin) if (imlSegment->imlList[imlSegment->imlList.size() - 1].type != PPCREC_IML_TYPE_CJUMP || imlSegment->imlList[imlSegment->imlList.size() - 1].op_conditionalJump.jumpmarkAddress > imlSegment->ppcAddrMin)
continue; continue;
@ -4289,12 +4276,12 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
PPCRecompilerIml_insertSegments(&ppcImlGenContext, s, 2); PPCRecompilerIml_insertSegments(&ppcImlGenContext, s, 2);
imlSegment = NULL; imlSegment = NULL;
IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s+0]; IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s + 0];
IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s+1]; IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s + 1];
IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s+2]; IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s + 2];
// create entry point segment // create entry point segment
PPCRecompilerIml_insertSegments(&ppcImlGenContext, ppcImlGenContext.segmentList2.size(), 1); PPCRecompilerIml_insertSegments(&ppcImlGenContext, ppcImlGenContext.segmentList2.size(), 1);
IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size()-1]; IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size() - 1];
// relink segments // relink segments
IMLSegment_RelinkInputSegment(imlSegmentP2, imlSegmentP0); IMLSegment_RelinkInputSegment(imlSegmentP2, imlSegmentP0);
IMLSegment_SetLinkBranchNotTaken(imlSegmentP0, imlSegmentP1); IMLSegment_SetLinkBranchNotTaken(imlSegmentP0, imlSegmentP1);
@ -4322,7 +4309,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
imlSegmentP2->ppcAddrMin = 0; imlSegmentP2->ppcAddrMin = 0;
imlSegmentP2->ppcAddrMax = 0; imlSegmentP2->ppcAddrMax = 0;
// setup enterable segment // setup enterable segment
if( enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF ) if (enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF)
{ {
imlSegmentPEntry->isEnterable = true; imlSegmentPEntry->isEnterable = true;
imlSegmentPEntry->ppcAddress = enterPPCAddress; imlSegmentPEntry->ppcAddress = enterPPCAddress;
@ -4353,70 +4340,4 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
// skip the newly created segments // skip the newly created segments
s += 2; s += 2;
} }
}
// isolate entry points from function flow (enterable segments must not be the target of any other segment)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// count number of used registers
uint32 numLoadedFPRRegisters = 0;
for(uint32 i=0; i<255; i++)
{
if( ppcImlGenContext.mappedFPRRegister[i] )
numLoadedFPRRegisters++;
}
// insert name store instructions at the end of each segment but before branch instructions
for (IMLSegment* segIt : ppcImlGenContext.segmentList2)
{
if(segIt->imlList.size() == 0 )
continue; // ignore empty segments
// analyze segment for register usage
IMLUsedRegisters registersUsed;
for(sint32 i=0; i<segIt->imlList.size(); i++)
{
segIt->imlList[i].CheckRegisterUsage(&registersUsed);
sint32 accessedTempReg[5];
// intermediate FPRs
accessedTempReg[0] = registersUsed.readFPR1;
accessedTempReg[1] = registersUsed.readFPR2;
accessedTempReg[2] = registersUsed.readFPR3;
accessedTempReg[3] = registersUsed.readFPR4;
accessedTempReg[4] = registersUsed.writtenFPR1;
for(sint32 f=0; f<5; f++)
{
if( accessedTempReg[f] == -1 )
continue;
uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]];
if( regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0+32 )
{
segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true;
}
}
}
}
// merge certain float load+store patterns (must happen before FPR register remapping)
PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext);
if (numLoadedFPRRegisters > 0)
{
if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false)
{
PPCRecompiler_freeContext(&ppcImlGenContext);
return false;
}
}
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext);
// remove redundant name load and store instructions
PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
return true;
}

View file

@ -8,6 +8,11 @@
#include "util/MemMapper/MemMapper.h" #include "util/MemMapper/MemMapper.h"
#include "Common/cpu_features.h" #include "Common/cpu_features.h"
bool s_hasLZCNTSupport = false;
bool s_hasMOVBESupport = false;
bool s_hasBMI2Support = false;
bool s_hasAVXSupport = false;
sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping
{ {
REG_RAX, REG_RDX, REG_RBX, REG_RBP, REG_RSI, REG_RDI, REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_RCX REG_RAX, REG_RDX, REG_RBX, REG_RBP, REG_RSI, REG_RDI, REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_RCX
@ -351,152 +356,143 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
sint32 realRegisterMem2 = PPC_REC_INVALID_REGISTER; sint32 realRegisterMem2 = PPC_REC_INVALID_REGISTER;
if( indexed ) if( indexed )
realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2); realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2);
if( false )//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS ) if( indexed && realRegisterMem == realRegisterMem2 )
{ {
// load u8/u16/u32 via direct memory access + optional sign extend return false;
assert_dbg(); // todo
} }
else if( indexed && realRegisterData == realRegisterMem2 )
{ {
if( indexed && realRegisterMem == realRegisterMem2 ) // for indexed memory access realRegisterData must not be the same register as the second memory register,
{ // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2
return false; sint32 temp = realRegisterMem;
} realRegisterMem = realRegisterMem2;
if( indexed && realRegisterData == realRegisterMem2 ) realRegisterMem2 = temp;
{ }
// for indexed memory access realRegisterData must not be the same register as the second memory register,
// this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2
sint32 temp = realRegisterMem;
realRegisterMem = realRegisterMem2;
realRegisterMem2 = temp;
}
bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend;
bool switchEndian = imlInstruction->op_storeLoad.flags2.swapEndian; bool switchEndian = imlInstruction->op_storeLoad.flags2.swapEndian;
if( imlInstruction->op_storeLoad.copyWidth == 32 ) if( imlInstruction->op_storeLoad.copyWidth == 32 )
{
//if( indexed )
// PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (indexed)
{
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2);
}
if( IMLBackendX64_HasExtensionMOVBE() && switchEndian )
{ {
//if( indexed )
// PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (indexed) if (indexed)
{ {
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
} //if (indexed && realRegisterMem != realRegisterData)
if( g_CPUFeatures.x86.movbe && switchEndian ) // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
{
if (indexed)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
//if (indexed && realRegisterMem != realRegisterData)
// x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
}
} }
else else
{
if (indexed)
{
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
//if (realRegisterMem != realRegisterData)
// x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (switchEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if (switchEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
}
}
}
else if( imlInstruction->op_storeLoad.copyWidth == 16 )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); // todo: We can avoid this if MOVBE is available
if (indexed)
{
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if( g_CPUFeatures.x86.movbe && switchEndian )
{
x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else
{
x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if( switchEndian )
x64Gen_rol_reg64Low16_imm8(x64GenContext, realRegisterData, 8);
}
if( signExtend )
x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData);
}
else if( imlInstruction->op_storeLoad.copyWidth == 8 )
{
if( indexed )
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
// todo: Optimize by using only MOVZX/MOVSX
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// todo: Use sign extend move from memory instead of separate sign-extend?
if( signExtend )
x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else
x64Emit_movZX_reg32_mem8(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_LOAD_LWARX_MARKER )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if( imlInstruction->op_storeLoad.immS32 != 0 )
assert_dbg(); // not supported
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), realRegisterMem); // remember EA for reservation
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if( switchEndian )
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), realRegisterData); // remember value for reservation
// LWARX instruction costs extra cycles (this speeds up busy loops)
x64Gen_sub_mem32reg64_imm32(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 20);
}
else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_LSWI_3 )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if( switchEndian == false )
assert_dbg();
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move)
if( g_CPUFeatures.x86.movbe )
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData ) }
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); }
else
{
if (indexed)
{
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
//if (realRegisterMem != realRegisterData)
// x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (switchEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
} }
else else
{ {
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData ) if (switchEndian)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
} }
x64Gen_and_reg64Low32_imm32(x64GenContext, realRegisterData, 0xFFFFFF00); }
}
else if( imlInstruction->op_storeLoad.copyWidth == 16 )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); // todo: We can avoid this if MOVBE is available
if (indexed)
{
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if(IMLBackendX64_HasExtensionMOVBE() && switchEndian )
{
x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
else else
return false; {
return true; x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if( switchEndian )
x64Gen_rol_reg64Low16_imm8(x64GenContext, realRegisterData, 8);
}
if( signExtend )
x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData);
} }
return false; else if( imlInstruction->op_storeLoad.copyWidth == 8 )
{
if( indexed )
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
// todo: Optimize by using only MOVZX/MOVSX
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// todo: Use sign extend move from memory instead of separate sign-extend?
if( signExtend )
x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else
x64Emit_movZX_reg32_mem8(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_LOAD_LWARX_MARKER )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if( imlInstruction->op_storeLoad.immS32 != 0 )
assert_dbg(); // not supported
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), realRegisterMem); // remember EA for reservation
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if( switchEndian )
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), realRegisterData); // remember value for reservation
// LWARX instruction costs extra cycles (this speeds up busy loops)
x64Gen_sub_mem32reg64_imm32(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 20);
}
else if( imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_LSWI_3 )
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if( switchEndian == false )
assert_dbg();
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move)
if(IMLBackendX64_HasExtensionMOVBE())
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData);
}
x64Gen_and_reg64Low32_imm32(x64GenContext, realRegisterData, 0xFFFFFF00);
}
else
return false;
return true;
} }
/* /*
@ -510,169 +506,160 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
if (indexed) if (indexed)
realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2); realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2);
if (false)//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS ) if (indexed && realRegisterMem == realRegisterMem2)
{ {
// load u8/u16/u32 via direct memory access + optional sign extend return false;
assert_dbg(); // todo
} }
else if (indexed && realRegisterData == realRegisterMem2)
{ {
if (indexed && realRegisterMem == realRegisterMem2) // for indexed memory access realRegisterData must not be the same register as the second memory register,
{ // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2
return false; sint32 temp = realRegisterMem;
} realRegisterMem = realRegisterMem2;
if (indexed && realRegisterData == realRegisterMem2) realRegisterMem2 = temp;
{ }
// for indexed memory access realRegisterData must not be the same register as the second memory register,
// this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2
sint32 temp = realRegisterMem;
realRegisterMem = realRegisterMem2;
realRegisterMem2 = temp;
}
bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend;
bool swapEndian = imlInstruction->op_storeLoad.flags2.swapEndian; bool swapEndian = imlInstruction->op_storeLoad.flags2.swapEndian;
if (imlInstruction->op_storeLoad.copyWidth == 32) if (imlInstruction->op_storeLoad.copyWidth == 32)
{ {
if (indexed) if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
uint32 valueRegister;
if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData)
{
valueRegister = realRegisterData;
}
else
{
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
valueRegister = REG_RESV_TEMP;
}
if (g_CPUFeatures.x86.movbe == false && swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (g_CPUFeatures.x86.movbe && swapEndian)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if (imlInstruction->op_storeLoad.copyWidth == 16)
{
if (indexed || swapEndian)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (swapEndian)
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// todo: Optimize this, e.g. by using MOVBE
}
else if (imlInstruction->op_storeLoad.copyWidth == 8)
{
if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (indexed && realRegisterMem == realRegisterData)
{
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
realRegisterData = REG_RESV_TEMP;
}
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, realRegisterData);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STWCX_MARKER)
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (imlInstruction->op_storeLoad.immS32 != 0) uint32 valueRegister;
assert_dbg(); // todo if ((swapEndian == false || IMLBackendX64_HasExtensionMOVBE()) && realRegisterMem != realRegisterData)
// reset cr0 LT, GT and EQ
sint32 crRegister = 0;
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_LT), 0);
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_GT), 0);
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ), 0);
// calculate effective address
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// realRegisterMem now holds EA
x64Gen_cmp_reg64Low32_mem32reg64(x64GenContext, realRegisterMem, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemAddr));
sint32 jumpInstructionOffsetJumpToEnd = x64GenContext->codeBufferIndex;
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NOT_EQUAL, 0);
// EA matches reservation
// backup EAX (since it's an explicit operand of CMPXCHG and will be overwritten)
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), REG_EAX);
// backup REG_RESV_MEMBASE
x64Emit_mov_mem64_reg64(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]), REG_RESV_MEMBASE);
// add mem register to REG_RESV_MEMBASE
x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem);
// load reserved value in EAX
x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemValue));
// bswap EAX
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_EAX);
//x64Gen_lock_cmpxchg_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, 0, REG_RESV_TEMP);
x64Gen_lock_cmpxchg_mem32Reg64_reg64(x64GenContext, REG_RESV_MEMBASE, 0, REG_RESV_TEMP);
x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_EQUAL, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ));
// reset reservation
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), 0);
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), 0);
// restore EAX
x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]));
// restore REG_RESV_MEMBASE
x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_MEMBASE, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]));
// copy XER SO to CR0 SO
x64Gen_bt_mem8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, spr.XER), 31);
x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_CARRY, REG_RESV_HCPU, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_SO));
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffsetJumpToEnd, x64GenContext->codeBufferIndex);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_2)
{ {
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); valueRegister = realRegisterData;
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16); // store upper 2 bytes ..
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // .. as big-endian
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_3)
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 2, REG_RESV_TEMP);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 1, REG_RESV_TEMP);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 0, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
else else
return false; {
return true; x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
valueRegister = REG_RESV_TEMP;
}
if (!IMLBackendX64_HasExtensionMOVBE() && swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (IMLBackendX64_HasExtensionMOVBE() && swapEndian)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
return false; else if (imlInstruction->op_storeLoad.copyWidth == 16)
{
if (indexed || swapEndian)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (swapEndian)
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// todo: Optimize this, e.g. by using MOVBE
}
else if (imlInstruction->op_storeLoad.copyWidth == 8)
{
if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (indexed && realRegisterMem == realRegisterData)
{
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
realRegisterData = REG_RESV_TEMP;
}
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, realRegisterData);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STWCX_MARKER)
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
if (imlInstruction->op_storeLoad.immS32 != 0)
assert_dbg(); // todo
// reset cr0 LT, GT and EQ
sint32 crRegister = 0;
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_LT), 0);
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_GT), 0);
x64Gen_mov_mem8Reg64_imm8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ), 0);
// calculate effective address
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
// realRegisterMem now holds EA
x64Gen_cmp_reg64Low32_mem32reg64(x64GenContext, realRegisterMem, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemAddr));
sint32 jumpInstructionOffsetJumpToEnd = x64GenContext->codeBufferIndex;
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NOT_EQUAL, 0);
// EA matches reservation
// backup EAX (since it's an explicit operand of CMPXCHG and will be overwritten)
x64Emit_mov_mem32_reg32(x64GenContext, REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), REG_EAX);
// backup REG_RESV_MEMBASE
x64Emit_mov_mem64_reg64(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]), REG_RESV_MEMBASE);
// add mem register to REG_RESV_MEMBASE
x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem);
// load reserved value in EAX
x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, offsetof(PPCInterpreter_t, reservedMemValue));
// bswap EAX
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_EAX);
//x64Gen_lock_cmpxchg_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, 0, REG_RESV_TEMP);
x64Gen_lock_cmpxchg_mem32Reg64_reg64(x64GenContext, REG_RESV_MEMBASE, 0, REG_RESV_TEMP);
x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_EQUAL, REG_RSP, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_EQ));
// reset reservation
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemAddr), 0);
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, reservedMemValue), 0);
// restore EAX
x64Emit_mov_reg64_mem32(x64GenContext, REG_EAX, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]));
// restore REG_RESV_MEMBASE
x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_MEMBASE, REG_RESV_HCPU, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[2]));
// copy XER SO to CR0 SO
x64Gen_bt_mem8(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, spr.XER), 31);
x64Gen_setcc_mem8(x64GenContext, X86_CONDITION_CARRY, REG_RESV_HCPU, offsetof(PPCInterpreter_t, cr) + sizeof(uint8)*(crRegister * 4 + PPCREC_CR_BIT_SO));
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffsetJumpToEnd, x64GenContext->codeBufferIndex);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_2)
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16); // store upper 2 bytes ..
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // .. as big-endian
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else if (imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STSWI_3)
{
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 2, REG_RESV_TEMP);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 1, REG_RESV_TEMP);
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 8);
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32 + 0, REG_RESV_TEMP);
if (indexed)
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
else
return false;
return true;
} }
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
@ -781,7 +768,8 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
// count leading zeros // count leading zeros
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER); cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER);
if( g_CPUFeatures.x86.lzcnt ) // LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5])
if(IMLBackendX64_HasExtensionLZCNT())
{ {
x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA)); x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA));
} }
@ -1499,12 +1487,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA); sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA);
sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB); sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB);
if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW) if (IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SRW)
{ {
// use BMI2 SHRX if available // use BMI2 SHRX if available
x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
} }
else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW) else if (IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SLW)
{ {
// use BMI2 SHLX if available // use BMI2 SHLX if available
x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
@ -2656,4 +2644,79 @@ void PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions()
PPCRecompiler_leaveRecompilerCode_unvisited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode(); PPCRecompiler_leaveRecompilerCode_unvisited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode();
PPCRecompiler_leaveRecompilerCode_visited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode(); PPCRecompiler_leaveRecompilerCode_visited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode();
cemu_assert_debug(PPCRecompiler_leaveRecompilerCode_unvisited != PPCRecompiler_leaveRecompilerCode_visited); cemu_assert_debug(PPCRecompiler_leaveRecompilerCode_unvisited != PPCRecompiler_leaveRecompilerCode_visited);
} }
bool IMLBackendX64_HasExtensionLZCNT()
{
return s_hasLZCNTSupport;
}
bool IMLBackendX64_HasExtensionMOVBE()
{
return s_hasMOVBESupport;
}
bool IMLBackendX64_HasExtensionBMI2()
{
return s_hasBMI2Support;
}
bool IMLBackendX64_HasExtensionAVX()
{
return s_hasAVXSupport;
}
void IMLBackendX64_Init()
{
// init x64 recompiler instance data
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
// query processor extensions
int cpuInfo[4];
cpuid(cpuInfo, 0x80000001);
s_hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
cpuid(cpuInfo, 0x1);
s_hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
s_hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
cpuidex(cpuInfo, 0x7, 0);
s_hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s", s_hasLZCNTSupport ? "LZCNT " : "", s_hasMOVBESupport ? "MOVBE " : "", s_hasAVXSupport ? "AVX " : "");
}

View file

@ -131,6 +131,12 @@ enum
#define PPC_X64_GPR_USABLE_REGISTERS (16-4) #define PPC_X64_GPR_USABLE_REGISTERS (16-4)
#define PPC_X64_FPR_USABLE_REGISTERS (16-1) // Use XMM0 - XMM14, XMM15 is the temp register #define PPC_X64_FPR_USABLE_REGISTERS (16-1) // Use XMM0 - XMM14, XMM15 is the temp register
void IMLBackendX64_Init();
bool IMLBackendX64_HasExtensionLZCNT();
bool IMLBackendX64_HasExtensionMOVBE();
bool IMLBackendX64_HasExtensionBMI2();
bool IMLBackendX64_HasExtensionAVX();
bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext); bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext);

View file

@ -87,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
{ {
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx); x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (g_CPUFeatures.x86.movbe) if (IMLBackendX64_HasExtensionMOVBE())
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
} }
@ -99,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
} }
else else
{ {
if (g_CPUFeatures.x86.movbe) if (IMLBackendX64_HasExtensionMOVBE())
{ {
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
} }
@ -109,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
} }
} }
if (g_CPUFeatures.x86.avx) if (IMLBackendX64_HasExtensionAVX())
{ {
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP); x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
} }
@ -281,29 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
{ {
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2); x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem);
if( g_CPUFeatures.x86.movbe ) if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
else else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
} }
else else
{ {
if( g_CPUFeatures.x86.movbe ) if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32); x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
} }
if( g_CPUFeatures.x86.movbe == false ) if(IMLBackendX64_HasExtensionMOVBE() == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( g_CPUFeatures.x86.avx ) x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP);
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP);
}
else
{
x64Emit_mov_mem32_reg64(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR), REG_RESV_TEMP);
x64Gen_movddup_xmmReg_memReg64(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
if (imlInstruction->op_storeLoad.flags2.notExpanded) if (imlInstruction->op_storeLoad.flags2.notExpanded)
{ {
@ -317,7 +309,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
} }
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 ) else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 )
{ {
if( g_CPUFeatures.x86.avx ) if( IMLBackendX64_HasExtensionAVX() )
{ {
if( indexed ) if( indexed )
{ {
@ -420,23 +412,15 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0) if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{ {
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM); x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
if (g_CPUFeatures.x86.avx) x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
{ if (IMLBackendX64_HasExtensionMOVBE() == false)
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
if (g_CPUFeatures.x86.movbe == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed) if (indexed)
{ {
cemu_assert_debug(memReg != memRegEx); cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
} }
if (g_CPUFeatures.x86.movbe) if (IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
else else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP); x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
@ -605,30 +589,14 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
if (imlInstruction->op_storeLoad.flags2.notExpanded) if (imlInstruction->op_storeLoad.flags2.notExpanded)
{ {
// value is already in single format // value is already in single format
if (g_CPUFeatures.x86.avx) x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
} }
else else
{ {
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM); x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM);
if (g_CPUFeatures.x86.avx) x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
} }
if( g_CPUFeatures.x86.movbe == false ) if(IMLBackendX64_HasExtensionMOVBE() == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed ) if( indexed )
{ {
@ -636,7 +604,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
assert_dbg(); assert_dbg();
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
} }
if( g_CPUFeatures.x86.movbe ) if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
else else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
@ -669,15 +637,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
} }
else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 ) else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 )
{ {
if( g_CPUFeatures.x86.avx ) x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP); x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed ) if( indexed )
{ {
@ -1057,7 +1017,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
{ {
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB); x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB);
} }
else if (g_CPUFeatures.x86.avx) else if (IMLBackendX64_HasExtensionAVX())
{ {
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB); x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB);
} }