PPCRec: Emit x86 movd for non-AVX + more restructuring

This commit is contained in:
Exzap 2022-11-07 03:23:46 +01:00
parent 411a83799c
commit da08eda506
13 changed files with 589 additions and 584 deletions

View file

@ -76,6 +76,7 @@ add_library(CemuCafe
HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp
HW/Espresso/Recompiler/IML/IMLOptimizer.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h
HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp
HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h
HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp

View file

@ -16,16 +16,16 @@ void IMLAnalyzer_GetCRTracking(IMLInstruction* imlInstruction, PPCRecCRTracking_
// optimizer passes
// todo - rename
bool PPCRecompiler_reduceNumberOfFPRRegisters(ppcImlGenContext_t* ppcImlGenContext);
bool PPCRecompiler_manageFPRRegisters(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_removeRedundantCRUpdates(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectIntegerCopies(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_reorderConditionModifyInstructions(ppcImlGenContext_t* ppcImlGenContext);
bool PPCRecompiler_reduceNumberOfFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
bool PPCRecompiler_manageFPRRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_removeRedundantCRUpdates(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectFloatCopies(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectIntegerCopies(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizePSQLoadAndStore(struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_reorderConditionModifyInstructions(struct ppcImlGenContext_t* ppcImlGenContext);
// register allocator
void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext);
void IMLRegisterAllocator_AllocateRegisters(struct ppcImlGenContext_t* ppcImlGenContext);
// debug
void IMLDebug_DumpSegment(struct IMLSegment* imlSegment, sint32 segmentIndex, bool printLivenessRangeInfo = false);

View file

@ -4,6 +4,7 @@
#include "IMLRegisterAllocatorRanges.h"
#include "util/helpers/StringBuf.h"
#include "../PPCRecompiler.h"
const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml)
{

View file

@ -750,8 +750,8 @@ void _analyzeRangeDataFlow(raLivenessSubrange_t* subrange)
void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{
sint16 virtualReg2PhysReg[PPC_REC_MAX_VIRTUAL_GPR];
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
sint16 virtualReg2PhysReg[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
virtualReg2PhysReg[i] = -1;
raLiveRangeInfo_t liveInfo;
@ -848,7 +848,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext,
replaceGpr[f] = -1;
continue;
}
if (virtualRegister >= PPC_REC_MAX_VIRTUAL_GPR)
if (virtualRegister >= IML_RA_VIRT_REG_COUNT_MAX)
assert_dbg();
replaceGpr[f] = virtualReg2PhysReg[virtualRegister];
cemu_assert_debug(replaceGpr[f] >= 0);
@ -860,7 +860,7 @@ void PPCRecRA_generateSegmentInstructions(ppcImlGenContext_t* ppcImlGenContext,
}
// expire infinite subranges (subranges that cross the segment border)
sint32 storeLoadListLength = 0;
raLoadStoreInfo_t loadStoreList[PPC_REC_MAX_VIRTUAL_GPR];
raLoadStoreInfo_t loadStoreList[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 f = 0; f < liveInfo.liveRangesCount; f++)
{
raLivenessSubrange_t* liverange = liveInfo.liveRangeList[f];
@ -1007,7 +1007,7 @@ bool _isRangeDefined(IMLSegment* imlSegment, sint32 vGPR)
void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{
imlSegment->raDistances.reg[i].usageStart = INT_MAX;
imlSegment->raDistances.reg[i].usageEnd = INT_MIN;
@ -1027,7 +1027,7 @@ void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext,
sint32 virtualRegister = gprTracking.gpr[t];
if (virtualRegister < 0)
continue;
cemu_assert_debug(virtualRegister < PPC_REC_MAX_VIRTUAL_GPR);
cemu_assert_debug(virtualRegister < IML_RA_VIRT_REG_COUNT_MAX);
imlSegment->raDistances.reg[virtualRegister].usageStart = std::min<sint32>(imlSegment->raDistances.reg[virtualRegister].usageStart, index); // index before/at instruction
imlSegment->raDistances.reg[virtualRegister].usageEnd = std::max<sint32>(imlSegment->raDistances.reg[virtualRegister].usageEnd, index + 1); // index after instruction
}
@ -1086,7 +1086,7 @@ raLivenessSubrange_t* PPCRecRA_convertToMappedRanges(ppcImlGenContext_t* ppcImlG
void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{
if (_isRangeDefined(imlSegment, i) == false)
continue;
@ -1096,8 +1096,8 @@ void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext,
PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment, i, range);
}
// create lookup table of ranges
raLivenessSubrange_t* vGPR2Subrange[PPC_REC_MAX_VIRTUAL_GPR];
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
raLivenessSubrange_t* vGPR2Subrange[IML_RA_VIRT_REG_COUNT_MAX];
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++)
{
vGPR2Subrange[i] = imlSegment->raInfo.linkedList_perVirtualGPR[i];
#ifdef CEMU_DEBUG_ASSERT
@ -1257,7 +1257,7 @@ void PPCRecRA_checkAndTryExtendRange(ppcImlGenContext_t* ppcImlGenContext, IMLSe
void PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{
if (imlSegment->raDistances.reg[i].usageStart == INT_MAX)
continue; // not used
@ -1334,7 +1334,7 @@ void PPCRecRA_extendRangesOutOfLoopsV2(ppcImlGenContext_t* ppcImlGenContext)
continue;
// extend looping ranges into all exits (this allows the data flow analyzer to move stores out of the loop)
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
for (sint32 i = 0; i < IML_RA_VIRT_REG_COUNT_MAX; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{
if (imlSegment->raDistances.reg[i].usageEnd != RA_INTER_RANGE_END)
continue; // range not set or does not reach end of segment

View file

@ -1,7 +1,84 @@
#pragma once
#include "IMLInstruction.h"
#include "Cafe/HW/Espresso/Recompiler/PPCRecompiler.h" // remove once dependency is gone
#define IML_RA_VIRT_REG_COUNT_MAX 40 // should match PPC_REC_MAX_VIRTUAL_GPR -> todo: Make this dynamic
// A position marker inside an IML segment's instruction list.
// Points form a per-segment doubly-linked list and have their index
// shifted automatically when instructions are inserted
// (see PPCRecompiler_pushBackIMLInstructions).
struct IMLSegmentPoint
{
sint32 index;
struct IMLSegment* imlSegment;
IMLSegmentPoint* next;
IMLSegmentPoint* prev;
};
// Records one access to a virtual register inside a segment:
// the instruction index plus whether the access reads and/or writes it.
struct raLivenessLocation_t
{
sint32 index; // instruction index within the segment
bool isRead;
bool isWrite;
raLivenessLocation_t() = default;
raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
: index(index), isRead(isRead), isWrite(isWrite) {};
};
// Intrusive doubly-linked-list node embedded in raLivenessSubrange_t
// (used for both the per-virtual-register and per-segment lists below).
struct raLivenessSubrangeLink_t
{
struct raLivenessSubrange_t* prev;
struct raLivenessSubrange_t* next;
};
// The portion of a register's liveness range that lies within a single
// segment, bounded by two segment points. Subranges of one virtual register
// can be chained across segment exits via the branch-taken/not-taken links.
struct raLivenessSubrange_t
{
struct raLivenessRange_t* range; // owning range (a range aggregates its subranges)
IMLSegment* imlSegment;
IMLSegmentPoint start;
IMLSegmentPoint end;
// dirty state tracking (load/store elision for the backing register name)
bool _noLoad;
bool hasStore;
bool hasStoreDelayed;
// continuation of this liveness into successor segments
raLivenessSubrange_t* subrangeBranchTaken;
raLivenessSubrange_t* subrangeBranchNotTaken;
// processing (presumably guards against revisiting during iteration — confirm)
uint32 lastIterationIndex;
// instruction locations
std::vector<raLivenessLocation_t> list_locations;
// linked list (subranges with same GPR virtual register)
raLivenessSubrangeLink_t link_sameVirtualRegisterGPR;
// linked list (all subranges for this segment)
raLivenessSubrangeLink_t link_segmentSubrangesGPR;
};
// Full liveness range of one virtual register, assembled from per-segment
// subranges; the register allocator assigns physicalRegister.
struct raLivenessRange_t
{
sint32 virtualRegister;
sint32 physicalRegister; // assigned by the allocator (presumably negative while unassigned — confirm)
sint32 name; // backing register name (e.g. PPCREC_NAME_* — TODO confirm)
std::vector<raLivenessSubrange_t*> list_subranges;
};
// Per-segment bookkeeping used by the register allocator.
struct PPCSegmentRegisterAllocatorInfo_t
{
// analyzer stage
bool isPartOfProcessedLoop{}; // used during loop detection
sint32 lastIterationIndex{};
// linked lists (heads of the intrusive subrange lists)
raLivenessSubrange_t* linkedList_allSubranges{};
raLivenessSubrange_t* linkedList_perVirtualGPR[IML_RA_VIRT_REG_COUNT_MAX]{};
};
// First/last usage index of each virtual register within a segment.
// usageStart stays INT_MAX and usageEnd INT_MIN while a register is unused
// (see PPCRecRA_calculateSegmentMinMaxRanges).
struct PPCRecVGPRDistances_t
{
struct _RegArrayEntry
{
sint32 usageStart{};
sint32 usageEnd{};
}reg[IML_RA_VIRT_REG_COUNT_MAX];
bool isProcessed[IML_RA_VIRT_REG_COUNT_MAX]{};
};
struct IMLSegment
{
@ -39,11 +116,9 @@ struct IMLSegment
PPCRecVGPRDistances_t raDistances{};
bool raRangeExtendProcessed{};
// segment points
ppcRecompilerSegmentPoint_t* segmentPointList{};
IMLSegmentPoint* segmentPointList{};
bool HasSuffixInstruction() const;
IMLInstruction* GetLastInstruction();
};

View file

@ -14,6 +14,8 @@
#include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h"
#include "Cafe/HW/Espresso/Recompiler/IML/IML.h"
struct PPCInvalidationRange
{
MPTR startAddress;
@ -127,6 +129,7 @@ void PPCRecompiler_attemptEnter(PPCInterpreter_t* hCPU, uint32 enterAddress)
PPCRecompiler_enter(hCPU, funcPtr);
}
}
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext);
PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set<uint32>& entryAddresses, std::vector<std::pair<MPTR, uint32>>& entryPointsOut)
{
@ -153,21 +156,27 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length;
// generate intermediate code
ppcImlGenContext_t ppcImlGenContext = { 0 };
bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses);
if (compiledSuccessfully == false)
{
// todo: Free everything
PPCRecompiler_freeContext(&ppcImlGenContext);
delete ppcRecFunc;
return NULL;
return nullptr;
}
// apply passes
if (!PPCRecompiler_ApplyIMLPasses(ppcImlGenContext))
{
delete ppcRecFunc;
return nullptr;
}
// emit x64 code
bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext);
if (x64GenerationSuccess == false)
{
PPCRecompiler_freeContext(&ppcImlGenContext);
return nullptr;
}
@ -183,11 +192,82 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
entryPointsOut.emplace_back(ppcEnterOffset, x64Offset);
}
PPCRecompiler_freeContext(&ppcImlGenContext);
return ppcRecFunc;
}
void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext); // forward declaration

// Runs all IML-level optimization passes over freshly generated intermediate
// code, then performs register allocation. Returns false when a pass fails;
// the caller discards the function in that case.
// NOTE(review): pass ordering is significant — see the inline comments below.
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
{
PPCRecompiler_FixLoops(ppcImlGenContext);
// isolate entry points from function flow (enterable segments must not be the target of any other segment)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// count number of used registers
// NOTE(review): loop bound is 255 — if mappedFPRRegister has 256 entries the
// last index is never counted; confirm whether this is intentional
uint32 numLoadedFPRRegisters = 0;
for (uint32 i = 0; i < 255; i++)
{
if (ppcImlGenContext.mappedFPRRegister[i])
numLoadedFPRRegisters++;
}
// insert name store instructions at the end of each segment but before branch instructions
for (IMLSegment* segIt : ppcImlGenContext.segmentList2)
{
if (segIt->imlList.size() == 0)
continue; // ignore empty segments
// analyze segment for register usage
IMLUsedRegisters registersUsed;
for (sint32 i = 0; i < segIt->imlList.size(); i++)
{
segIt->imlList[i].CheckRegisterUsage(&registersUsed);
sint32 accessedTempReg[5];
// intermediate FPRs
accessedTempReg[0] = registersUsed.readFPR1;
accessedTempReg[1] = registersUsed.readFPR2;
accessedTempReg[2] = registersUsed.readFPR3;
accessedTempReg[3] = registersUsed.readFPR4;
accessedTempReg[4] = registersUsed.writtenFPR1;
for (sint32 f = 0; f < 5; f++)
{
if (accessedTempReg[f] == -1)
continue;
uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]];
if (regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0 + 32)
{
segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true;
}
}
}
}
// merge certain float load+store patterns (must happen before FPR register remapping)
PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext);
if (numLoadedFPRRegisters > 0)
{
if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false)
{
return false;
}
}
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext);
// reorder CR-modifying instructions, then strip redundant CR updates
PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
return true;
}
bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector<std::pair<MPTR, uint32>>& entryPoints)
{
// update jump table
@ -511,42 +591,6 @@ void PPCRecompiler_init()
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());
// init x64 recompiler instance data
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
// setup GQR scale tables
for (uint32 i = 0; i < 32; i++)

View file

@ -25,84 +25,6 @@ struct PPCRecFunction_t
};
#include "Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h"
// Pre-refactor segment-point marker (superseded by IMLSegmentPoint in
// IML/IMLSegment.h; this hunk shows the definition being removed).
typedef struct _ppcRecompilerSegmentPoint_t
{
sint32 index;
struct IMLSegment* imlSegment;
_ppcRecompilerSegmentPoint_t* next;
_ppcRecompilerSegmentPoint_t* prev;
}ppcRecompilerSegmentPoint_t;
// Pre-refactor copy of raLivenessLocation_t (moved to IML/IMLSegment.h;
// this hunk shows the definition being removed).
struct raLivenessLocation_t
{
sint32 index; // instruction index within the segment
bool isRead;
bool isWrite;
raLivenessLocation_t() = default;
raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
: index(index), isRead(isRead), isWrite(isWrite) {};
};
// Pre-refactor copy of raLivenessSubrangeLink_t (moved to IML/IMLSegment.h;
// this hunk shows the definition being removed).
struct raLivenessSubrangeLink_t
{
struct raLivenessSubrange_t* prev;
struct raLivenessSubrange_t* next;
};
// Pre-refactor copy of raLivenessSubrange_t (moved to IML/IMLSegment.h with
// start/end retyped to IMLSegmentPoint; this hunk shows the removal).
struct raLivenessSubrange_t
{
struct raLivenessRange_t* range;
IMLSegment* imlSegment;
ppcRecompilerSegmentPoint_t start;
ppcRecompilerSegmentPoint_t end;
// dirty state tracking
bool _noLoad;
bool hasStore;
bool hasStoreDelayed;
// continuation of this liveness into successor segments
raLivenessSubrange_t* subrangeBranchTaken;
raLivenessSubrange_t* subrangeBranchNotTaken;
// processing
uint32 lastIterationIndex;
// instruction locations
std::vector<raLivenessLocation_t> list_locations;
// linked list (subranges with same GPR virtual register)
raLivenessSubrangeLink_t link_sameVirtualRegisterGPR;
// linked list (all subranges for this segment)
raLivenessSubrangeLink_t link_segmentSubrangesGPR;
};
// Pre-refactor copy of raLivenessRange_t (moved to IML/IMLSegment.h;
// this hunk shows the definition being removed).
struct raLivenessRange_t
{
sint32 virtualRegister;
sint32 physicalRegister;
sint32 name;
std::vector<raLivenessSubrange_t*> list_subranges;
};
// Pre-refactor copy of PPCSegmentRegisterAllocatorInfo_t (moved to
// IML/IMLSegment.h, array bound renamed to IML_RA_VIRT_REG_COUNT_MAX;
// this hunk shows the removal).
struct PPCSegmentRegisterAllocatorInfo_t
{
// analyzer stage
bool isPartOfProcessedLoop{}; // used during loop detection
sint32 lastIterationIndex{};
// linked lists
raLivenessSubrange_t* linkedList_allSubranges{};
raLivenessSubrange_t* linkedList_perVirtualGPR[PPC_REC_MAX_VIRTUAL_GPR]{};
};
// Pre-refactor copy of PPCRecVGPRDistances_t (moved to IML/IMLSegment.h,
// array bound renamed to IML_RA_VIRT_REG_COUNT_MAX; this hunk shows the removal).
struct PPCRecVGPRDistances_t
{
struct _RegArrayEntry
{
sint32 usageStart{};
sint32 usageEnd{};
}reg[PPC_REC_MAX_VIRTUAL_GPR];
bool isProcessed[PPC_REC_MAX_VIRTUAL_GPR]{};
};
#include "Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h"
struct IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(struct ppcImlGenContext_t* ppcImlGenContext);
@ -140,6 +62,21 @@ struct ppcImlGenContext_t
bool modifiesGQR[8];
}tracking;
// Frees IML data owned by the context: the raw instruction buffer and all
// segments (takes over the cleanup previously done by the free function
// PPCRecompiler_freeContext, per the "move to destructor" todo).
~ppcImlGenContext_t()
{
if (imlList)
{
free(imlList);
imlList = nullptr;
}
for (IMLSegment* imlSegment : segmentList2)
{
delete imlSegment;
}
segmentList2.clear();
}
// append raw instruction
IMLInstruction& emitInst()
{
@ -194,8 +131,6 @@ extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
#define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1)
// todo - move some of the stuff above into PPCRecompilerInternal.h
// recompiler interface
void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress);

View file

@ -2,7 +2,6 @@
#define PPCREC_CR_REG_TEMP 8 // there are only 8 cr registers (0-7) we use the 8th as temporary cr register that is never stored (BDNZ instruction for example)
bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext, PPCRecFunction_t* PPCRecFunction, std::set<uint32>& entryAddresses);
void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext); // todo - move to destructor
IMLInstruction* PPCRecompilerImlGen_generateNewEmptyInstruction(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, sint32 shiftBackCount);
@ -10,8 +9,8 @@ IMLInstruction* PPCRecompiler_insertInstruction(IMLSegment* imlSegment, sint32 i
void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint32 index, sint32 count);
void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index);
void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint);
void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index);
void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint);
// GPR register management
uint32 PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false);

View file

@ -2933,7 +2933,7 @@ uint32 PPCRecompiler_getPreviousInstruction(ppcImlGenContext_t* ppcImlGenContext
return v;
}
void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, IMLSegment* imlSegment, sint32 index)
void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index)
{
segmentPoint->imlSegment = imlSegment;
segmentPoint->index = index;
@ -2944,7 +2944,7 @@ void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint,
imlSegment->segmentPointList = segmentPoint;
}
void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint)
void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint)
{
if (segmentPoint->prev)
segmentPoint->prev->next = segmentPoint->next;
@ -2975,7 +2975,7 @@ void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index,
// update position of segment points
if (imlSegment->segmentPointList)
{
ppcRecompilerSegmentPoint_t* segmentPoint = imlSegment->segmentPointList;
IMLSegmentPoint* segmentPoint = imlSegment->segmentPointList;
while (segmentPoint)
{
if (segmentPoint->index != RA_INTER_RANGE_START && segmentPoint->index != RA_INTER_RANGE_END)
@ -3017,21 +3017,6 @@ void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint3
ppcImlGenContext->segmentList2[index + i] = new IMLSegment();
}
// Releases the context's raw instruction buffer (if any) and deletes all
// segments. (This hunk shows the function being removed — its logic moves
// into the ppcImlGenContext_t destructor.)
void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext)
{
if (ppcImlGenContext->imlList)
{
free(ppcImlGenContext->imlList);
ppcImlGenContext->imlList = nullptr;
}
for (IMLSegment* imlSegment : ppcImlGenContext->segmentList2)
{
delete imlSegment;
}
ppcImlGenContext->segmentList2.clear();
}
bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
{
bool unsupportedInstructionFound = false;
@ -3953,9 +3938,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
ppcImlGenContext.ppcAddressOfCurrentInstruction = 0; // reset current instruction offset (any future generated IML instruction will be assigned to ppc address 0)
if( unsupportedInstructionCount > 0 || unsupportedInstructionFound )
{
// could not compile function
debug_printf("Failed recompile due to unknown instruction at 0x%08x\n", unsupportedInstructionLastOffset);
PPCRecompiler_freeContext(&ppcImlGenContext);
return false;
}
// optimize unused jumpmarks away
@ -4260,16 +4243,20 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
segIt->imlList[0].op_macro.param = cycleCount;
}
}
return true;
}
void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext)
{
// find segments that have a (conditional) jump instruction that points in reverse direction of code flow
// for these segments there is a risk that the recompiler could get trapped in an infinite busy loop.
// todo: We should do a loop-detection prepass where we flag segments that are actually in a loop. We can then use this information below to avoid generating the scheduler-exit code for segments that aren't actually in a loop despite them referencing an earlier segment (which could be an exit segment for example)
uint32 currentLoopEscapeJumpMarker = 0xFF000000; // start in an area where no valid code can be located
for(size_t s=0; s<ppcImlGenContext.segmentList2.size(); s++)
for (size_t s = 0; s < ppcImlGenContext.segmentList2.size(); s++)
{
// todo: This currently uses segment->ppcAddrMin which isn't really reliable. (We already had a problem where function inlining would generate falsified segment ranges by omitting the branch instruction). Find a better solution (use jumpmark/enterable offsets?)
IMLSegment* imlSegment = ppcImlGenContext.segmentList2[s];
if( imlSegment->imlList.empty() )
if (imlSegment->imlList.empty())
continue;
if (imlSegment->imlList[imlSegment->imlList.size() - 1].type != PPCREC_IML_TYPE_CJUMP || imlSegment->imlList[imlSegment->imlList.size() - 1].op_conditionalJump.jumpmarkAddress > imlSegment->ppcAddrMin)
continue;
@ -4289,12 +4276,12 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
PPCRecompilerIml_insertSegments(&ppcImlGenContext, s, 2);
imlSegment = NULL;
IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s+0];
IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s+1];
IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s+2];
IMLSegment* imlSegmentP0 = ppcImlGenContext.segmentList2[s + 0];
IMLSegment* imlSegmentP1 = ppcImlGenContext.segmentList2[s + 1];
IMLSegment* imlSegmentP2 = ppcImlGenContext.segmentList2[s + 2];
// create entry point segment
PPCRecompilerIml_insertSegments(&ppcImlGenContext, ppcImlGenContext.segmentList2.size(), 1);
IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size()-1];
IMLSegment* imlSegmentPEntry = ppcImlGenContext.segmentList2[ppcImlGenContext.segmentList2.size() - 1];
// relink segments
IMLSegment_RelinkInputSegment(imlSegmentP2, imlSegmentP0);
IMLSegment_SetLinkBranchNotTaken(imlSegmentP0, imlSegmentP1);
@ -4322,7 +4309,7 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
imlSegmentP2->ppcAddrMin = 0;
imlSegmentP2->ppcAddrMax = 0;
// setup enterable segment
if( enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF )
if (enterPPCAddress != 0 && enterPPCAddress != 0xFFFFFFFF)
{
imlSegmentPEntry->isEnterable = true;
imlSegmentPEntry->ppcAddress = enterPPCAddress;
@ -4353,70 +4340,4 @@ bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext
// skip the newly created segments
s += 2;
}
// isolate entry points from function flow (enterable segments must not be the target of any other segment)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// count number of used registers
uint32 numLoadedFPRRegisters = 0;
for(uint32 i=0; i<255; i++)
{
if( ppcImlGenContext.mappedFPRRegister[i] )
numLoadedFPRRegisters++;
}
// insert name store instructions at the end of each segment but before branch instructions
for (IMLSegment* segIt : ppcImlGenContext.segmentList2)
{
if(segIt->imlList.size() == 0 )
continue; // ignore empty segments
// analyze segment for register usage
IMLUsedRegisters registersUsed;
for(sint32 i=0; i<segIt->imlList.size(); i++)
{
segIt->imlList[i].CheckRegisterUsage(&registersUsed);
sint32 accessedTempReg[5];
// intermediate FPRs
accessedTempReg[0] = registersUsed.readFPR1;
accessedTempReg[1] = registersUsed.readFPR2;
accessedTempReg[2] = registersUsed.readFPR3;
accessedTempReg[3] = registersUsed.readFPR4;
accessedTempReg[4] = registersUsed.writtenFPR1;
for(sint32 f=0; f<5; f++)
{
if( accessedTempReg[f] == -1 )
continue;
uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]];
if( regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0+32 )
{
segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true;
}
}
}
}
// merge certain float load+store patterns (must happen before FPR register remapping)
PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext);
if (numLoadedFPRRegisters > 0)
{
if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false)
{
PPCRecompiler_freeContext(&ppcImlGenContext);
return false;
}
}
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext);
// remove redundant name load and store instructions
PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
return true;
}

View file

@ -8,6 +8,11 @@
#include "util/MemMapper/MemMapper.h"
#include "Common/cpu_features.h"
bool s_hasLZCNTSupport = false;
bool s_hasMOVBESupport = false;
bool s_hasBMI2Support = false;
bool s_hasAVXSupport = false;
sint32 x64Gen_registerMap[12] = // virtual GPR to x64 register mapping
{
REG_RAX, REG_RDX, REG_RBX, REG_RBP, REG_RSI, REG_RDI, REG_R8, REG_R9, REG_R10, REG_R11, REG_R12, REG_RCX
@ -351,13 +356,6 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
sint32 realRegisterMem2 = PPC_REC_INVALID_REGISTER;
if( indexed )
realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2);
if( false )//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS )
{
// load u8/u16/u32 via direct memory access + optional sign extend
assert_dbg(); // todo
}
else
{
if( indexed && realRegisterMem == realRegisterMem2 )
{
return false;
@ -381,7 +379,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{
x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2);
}
if( g_CPUFeatures.x86.movbe && switchEndian )
if( IMLBackendX64_HasExtensionMOVBE() && switchEndian )
{
if (indexed)
{
@ -419,7 +417,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
{
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if( g_CPUFeatures.x86.movbe && switchEndian )
if(IMLBackendX64_HasExtensionMOVBE() && switchEndian )
{
x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
@ -477,7 +475,7 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
assert_dbg();
if( indexed )
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // can be replaced with LEA temp, [memReg1+memReg2] (this way we can avoid the SUB instruction after the move)
if( g_CPUFeatures.x86.movbe )
if(IMLBackendX64_HasExtensionMOVBE())
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32);
if( indexed && realRegisterMem != realRegisterData )
@ -495,8 +493,6 @@ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, p
else
return false;
return true;
}
return false;
}
/*
@ -510,13 +506,6 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
if (indexed)
realRegisterMem2 = tempToRealRegister(imlInstruction->op_storeLoad.registerMem2);
if (false)//imlInstruction->op_storeLoad.flags & PPCREC_IML_OP_FLAG_FASTMEMACCESS )
{
// load u8/u16/u32 via direct memory access + optional sign extend
assert_dbg(); // todo
}
else
{
if (indexed && realRegisterMem == realRegisterMem2)
{
return false;
@ -537,7 +526,7 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
if (indexed)
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
uint32 valueRegister;
if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData)
if ((swapEndian == false || IMLBackendX64_HasExtensionMOVBE()) && realRegisterMem != realRegisterData)
{
valueRegister = realRegisterData;
}
@ -546,11 +535,11 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData);
valueRegister = REG_RESV_TEMP;
}
if (g_CPUFeatures.x86.movbe == false && swapEndian)
if (!IMLBackendX64_HasExtensionMOVBE() && swapEndian)
x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister);
if (indexed)
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
if (g_CPUFeatures.x86.movbe && swapEndian)
if (IMLBackendX64_HasExtensionMOVBE() && swapEndian)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister);
@ -671,8 +660,6 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction,
else
return false;
return true;
}
return false;
}
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
@ -781,7 +768,8 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
// count leading zeros
PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext);
cemu_assert_debug(imlInstruction->crRegister == PPC_REC_INVALID_REGISTER);
if( g_CPUFeatures.x86.lzcnt )
// LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5])
if(IMLBackendX64_HasExtensionLZCNT())
{
x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, tempToRealRegister(imlInstruction->op_r_r.registerResult), tempToRealRegister(imlInstruction->op_r_r.registerA));
}
@ -1499,12 +1487,12 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
sint32 rRegOperand1 = tempToRealRegister(imlInstruction->op_r_r_r.registerA);
sint32 rRegOperand2 = tempToRealRegister(imlInstruction->op_r_r_r.registerB);
if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW)
if (IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SRW)
{
// use BMI2 SHRX if available
x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
}
else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW)
else if (IMLBackendX64_HasExtensionBMI2() && imlInstruction->operation == PPCREC_IML_OP_SLW)
{
// use BMI2 SHLX if available
x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
@ -2657,3 +2645,78 @@ void PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions()
PPCRecompiler_leaveRecompilerCode_visited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode();
cemu_assert_debug(PPCRecompiler_leaveRecompilerCode_unvisited != PPCRecompiler_leaveRecompilerCode_visited);
}
// Returns true if the host CPU supports the LZCNT instruction
// (detected once in IMLBackendX64_Init via CPUID)
bool IMLBackendX64_HasExtensionLZCNT()
{
	const bool isSupported = s_hasLZCNTSupport;
	return isSupported;
}
// Returns true if the host CPU supports the MOVBE instruction
// (detected once in IMLBackendX64_Init via CPUID)
bool IMLBackendX64_HasExtensionMOVBE()
{
	const bool isSupported = s_hasMOVBESupport;
	return isSupported;
}
// Returns true if the host CPU supports the BMI2 instruction set (SHRX/SHLX etc.)
// (detected once in IMLBackendX64_Init via CPUID)
bool IMLBackendX64_HasExtensionBMI2()
{
	const bool isSupported = s_hasBMI2Support;
	return isSupported;
}
// Returns true if the host CPU supports AVX
// (detected once in IMLBackendX64_Init via CPUID)
bool IMLBackendX64_HasExtensionAVX()
{
	const bool isSupported = s_hasAVXSupport;
	return isSupported;
}
// One-time initialization of the x64 backend:
// - fills the XMM constant/mask tables in the recompiler instance data
//   (referenced RIP/absolute by emitted code for FP negate/abs/NOT, denormal
//   flushing and common constants)
// - queries host CPU extensions (LZCNT, MOVBE, AVX, BMI2) via CPUID and
//   caches the results in the s_has*Support flags used by the code emitters
void IMLBackendX64_Init()
{
	// init x64 recompiler instance data
	// 64bit sign-bit masks for negating the bottom double / both doubles of a pair
	ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
	ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
	ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
	ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
	// all-ones mask used to implement bitwise NOT via XOR
	ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
	ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
	// sign-bit-cleared masks for absolute value (bottom double / double pair)
	ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
	ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
	ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
	ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
	// 32bit float variant: clear sign bit of element 0 only
	ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
	ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
	ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
	ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
	// keeps only the low 64bit word of an XMM register
	ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
	ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
	// common FP constants
	ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
	ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
	ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
	ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
	ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
	ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
	ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
	ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
	// smallest normalized single (FLT_MIN bit pattern), written via raw bits
	*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
	*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
	// masks used to detect/flush denormal singles (exponent field / sign reset)
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
	ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
	// mxcsr presets (default rounding/mask bits, with and without flush-to-zero)
	ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
	ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
	// query processor extensions
	int cpuInfo[4];
	// LZCNT: CPUID.80000001H:ECX bit 5 (ABM)
	cpuid(cpuInfo, 0x80000001);
	s_hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
	// MOVBE: CPUID.01H:ECX bit 22, AVX: CPUID.01H:ECX bit 28
	cpuid(cpuInfo, 0x1);
	s_hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
	s_hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
	// BMI2: CPUID.07H (sub-leaf 0):EBX bit 8
	cpuidex(cpuInfo, 0x7, 0);
	s_hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
	// log all detected extensions (previously BMI2 was queried but not reported)
	forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s%s", s_hasLZCNTSupport ? "LZCNT " : "", s_hasMOVBESupport ? "MOVBE " : "", s_hasAVXSupport ? "AVX " : "", s_hasBMI2Support ? "BMI2 " : "");
}

View file

@ -131,6 +131,12 @@ enum
#define PPC_X64_GPR_USABLE_REGISTERS (16-4)
#define PPC_X64_FPR_USABLE_REGISTERS (16-1) // Use XMM0 - XMM14, XMM15 is the temp register
void IMLBackendX64_Init();
bool IMLBackendX64_HasExtensionLZCNT();
bool IMLBackendX64_HasExtensionMOVBE();
bool IMLBackendX64_HasExtensionBMI2();
bool IMLBackendX64_HasExtensionAVX();
bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext);

View file

@ -87,7 +87,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (g_CPUFeatures.x86.movbe)
if (IMLBackendX64_HasExtensionMOVBE())
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
}
@ -99,7 +99,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
}
else
{
if (g_CPUFeatures.x86.movbe)
if (IMLBackendX64_HasExtensionMOVBE())
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
}
@ -109,7 +109,7 @@ void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext,
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
if (g_CPUFeatures.x86.avx)
if (IMLBackendX64_HasExtensionAVX())
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
}
@ -281,29 +281,21 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem2);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem);
if( g_CPUFeatures.x86.movbe )
if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32);
}
else
{
if( g_CPUFeatures.x86.movbe )
if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
else
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32);
}
if( g_CPUFeatures.x86.movbe == false )
if(IMLBackendX64_HasExtensionMOVBE() == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( g_CPUFeatures.x86.avx )
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, realRegisterXMM, REG_RESV_TEMP);
}
else
{
x64Emit_mov_mem32_reg64(x64GenContext, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR), REG_RESV_TEMP);
x64Gen_movddup_xmmReg_memReg64(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
if (imlInstruction->op_storeLoad.flags2.notExpanded)
{
@ -317,7 +309,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
}
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 )
{
if( g_CPUFeatures.x86.avx )
if( IMLBackendX64_HasExtensionAVX() )
{
if( indexed )
{
@ -420,23 +412,15 @@ void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
if (g_CPUFeatures.x86.movbe == false)
if (IMLBackendX64_HasExtensionMOVBE() == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
{
cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
if (g_CPUFeatures.x86.movbe)
if (IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, memReg, memImmS32, REG_RESV_TEMP);
@ -605,30 +589,14 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
if (imlInstruction->op_storeLoad.flags2.notExpanded)
{
// value is already in single format
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
}
else
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, realRegisterXMM);
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
}
if( g_CPUFeatures.x86.movbe == false )
if(IMLBackendX64_HasExtensionMOVBE() == false )
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed )
{
@ -636,7 +604,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
assert_dbg();
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
if( g_CPUFeatures.x86.movbe )
if(IMLBackendX64_HasExtensionMOVBE())
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
@ -668,16 +636,8 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
}
}
else if( mode == PPCREC_FPR_ST_MODE_UI32_FROM_PS0 )
{
if( g_CPUFeatures.x86.avx )
{
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, realRegisterXMM);
}
else
{
x64Gen_movsd_memReg64_xmmReg(x64GenContext, realRegisterXMM, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
x64Emit_mov_reg64_mem32(x64GenContext, REG_RESV_TEMP, REG_RSP, offsetof(PPCInterpreter_t, temporaryFPR));
}
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if( indexed )
{
@ -1057,7 +1017,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandB);
}
else if (g_CPUFeatures.x86.avx)
else if (IMLBackendX64_HasExtensionAVX())
{
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, imlInstruction->op_fpr_r_r_r.registerResult, imlInstruction->op_fpr_r_r_r.registerOperandA, imlInstruction->op_fpr_r_r_r.registerOperandB);
}