Cemu/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp

732 lines
No EOL
26 KiB
C++

#include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h"
#include "PPCFunctionBoundaryTracker.h"
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "Cafe/OS/RPL/rpl.h"
#include "util/containers/RangeStore.h"
#include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h"
#include "config/ActiveSettings.h"
#include "config/LaunchSettings.h"
#include "Common/ExceptionHandler/ExceptionHandler.h"
#include "Common/cpu_features.h"
#include "util/helpers/fspinlock.h"
#include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h"
#include "IML/IML.h"
#include "IML/IMLRegisterAllocator.h"
#include "BackendX64/BackendX64.h"
struct PPCInvalidationRange
{
MPTR startAddress;
uint32 size;
PPCInvalidationRange(MPTR _startAddress, uint32 _size) : startAddress(_startAddress), size(_size) {};
};
struct
{
FSpinlock recompilerSpinlock;
std::queue<MPTR> targetQueue;
std::vector<PPCInvalidationRange> invalidationRanges;
}PPCRecompilerState;
RangeStore<PPCRecFunction_t*, uint32, 7703, 0x2000> rangeStore_ppcRanges;
void ATTR_MS_ABI (*PPCRecompiler_enterRecompilerCode)(uint64 codeMem, uint64 ppcInterpreterInstance);
void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_visited)();
void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
PPCRecompilerInstanceData_t* ppcRecompilerInstanceData;
bool ppcRecompilerEnabled = false;
// this function does never block and can fail if the recompiler lock cannot be acquired immediately
void PPCRecompiler_visitAddressNoBlock(uint32 enterAddress)
{
// quick read-only check without lock
if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] != PPCRecompiler_leaveRecompilerCode_unvisited)
return;
// try to acquire lock
if (!PPCRecompilerState.recompilerSpinlock.try_lock())
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_unvisited)
{
// was visited since previous check
PPCRecompilerState.recompilerSpinlock.unlock();
return;
}
// add to recompilation queue and flag as visited
PPCRecompilerState.targetQueue.emplace(enterAddress);
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] = PPCRecompiler_leaveRecompilerCode_visited;
PPCRecompilerState.recompilerSpinlock.unlock();
}
void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress)
{
if (ppcRecompilerEnabled == false)
return;
PPCRecompiler_visitAddressNoBlock(enterAddress);
}
void PPCRecompiler_enter(PPCInterpreter_t* hCPU, PPCREC_JUMP_ENTRY funcPtr)
{
#if BOOST_OS_WINDOWS
uint32 prevState = _controlfp(0, 0);
_controlfp(_RC_NEAR, _MCW_RC);
PPCRecompiler_enterRecompilerCode((uint64)funcPtr, (uint64)hCPU);
_controlfp(prevState, _MCW_RC);
// debug recompiler exit - useful to find frequently executed functions which couldn't be recompiled
#ifdef CEMU_DEBUG_ASSERT
if (hCPU->remainingCycles > 0 && GetAsyncKeyState(VK_F4))
{
auto t = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration_cast<std::chrono::microseconds>(t.time_since_epoch()).count();
cemuLog_log(LogType::Force, "Recompiler exit: 0x{:08x} LR: 0x{:08x} Timestamp {}.{:04}", hCPU->instructionPointer, hCPU->spr.LR, dur / 1000LL, (dur % 1000LL));
}
#endif
#else
PPCRecompiler_enterRecompilerCode((uint64)funcPtr, (uint64)hCPU);
#endif
// after leaving recompiler prematurely attempt to recompile the code at the new location
if (hCPU->remainingCycles > 0)
{
PPCRecompiler_visitAddressNoBlock(hCPU->instructionPointer);
}
}
void PPCRecompiler_attemptEnterWithoutRecompile(PPCInterpreter_t* hCPU, uint32 enterAddress)
{
cemu_assert_debug(hCPU->instructionPointer == enterAddress);
if (ppcRecompilerEnabled == false)
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_unvisited && funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
cemu_assert_debug(ppcRecompilerInstanceData != nullptr);
PPCRecompiler_enter(hCPU, funcPtr);
}
}
void PPCRecompiler_attemptEnter(PPCInterpreter_t* hCPU, uint32 enterAddress)
{
cemu_assert_debug(hCPU->instructionPointer == enterAddress);
if (ppcRecompilerEnabled == false)
return;
if (hCPU->remainingCycles <= 0)
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr == PPCRecompiler_leaveRecompilerCode_unvisited)
{
PPCRecompiler_visitAddressNoBlock(enterAddress);
}
else if (funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
// enter
cemu_assert_debug(ppcRecompilerInstanceData != nullptr);
PPCRecompiler_enter(hCPU, funcPtr);
}
}
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext);
PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set<uint32>& entryAddresses, std::vector<std::pair<MPTR, uint32>>& entryPointsOut, PPCFunctionBoundaryTracker& boundaryTracker)
{
if (range.startAddress >= PPC_REC_CODE_AREA_END)
{
cemuLog_log(LogType::Force, "Attempting to recompile function outside of allowed code area");
return nullptr;
}
uint32 codeGenRangeStart;
uint32 codeGenRangeSize = 0;
coreinit::OSGetCodegenVirtAddrRangeInternal(codeGenRangeStart, codeGenRangeSize);
if (codeGenRangeSize != 0)
{
if (range.startAddress >= codeGenRangeStart && range.startAddress < (codeGenRangeStart + codeGenRangeSize))
{
if (coreinit::codeGenShouldAvoid())
{
return nullptr;
}
}
}
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length;
// generate intermediate code
ppcImlGenContext_t ppcImlGenContext = { 0 };
bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses, boundaryTracker);
if (compiledSuccessfully == false)
{
delete ppcRecFunc;
return nullptr;
}
// apply passes
if (!PPCRecompiler_ApplyIMLPasses(ppcImlGenContext))
{
delete ppcRecFunc;
return nullptr;
}
//if (ppcRecFunc->ppcAddress == 0x12345678)
//{
// debug_printf("----------------------------------------\n");
// IMLDebug_Dump(&ppcImlGenContext);
// __debugbreak();
//}
// Large functions for testing (botw):
// 3B4049C
// emit x64 code
bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext);
if (x64GenerationSuccess == false)
{
return nullptr;
}
// collect list of PPC-->x64 entry points
cemuLog_log(LogType::Force, "[Recompiler] Successfully compiled {:08x} - {:08x} Segments: {}", ppcRecFunc->ppcAddress, ppcRecFunc->ppcAddress + ppcRecFunc->ppcSize, ppcImlGenContext.segmentList2.size());
cemu_assert_debug(ppcImlGenContext.imlListCount == 0);
entryPointsOut.clear();
for(IMLSegment* imlSegment : ppcImlGenContext.segmentList2)
{
if (imlSegment->isEnterable == false)
continue;
uint32 ppcEnterOffset = imlSegment->enterPPCAddress;
uint32 x64Offset = imlSegment->x64Offset;
entryPointsOut.emplace_back(ppcEnterOffset, x64Offset);
}
return ppcRecFunc;
}
void PPCRecompiler_FixLoops(ppcImlGenContext_t& ppcImlGenContext);
bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
{
PPCRecompiler_FixLoops(ppcImlGenContext);
// isolate entry points from function flow (enterable segments must not be the target of any other segment)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// count number of used registers
uint32 numLoadedFPRRegisters = 0;
for (uint32 i = 0; i < 255; i++)
{
if (ppcImlGenContext.mappedFPRRegister[i])
numLoadedFPRRegisters++;
}
// insert name store instructions at the end of each segment but before branch instructions
for (IMLSegment* segIt : ppcImlGenContext.segmentList2)
{
if (segIt->imlList.size() == 0)
continue; // ignore empty segments
// analyze segment for register usage
IMLUsedRegisters registersUsed;
for (sint32 i = 0; i < segIt->imlList.size(); i++)
{
segIt->imlList[i].CheckRegisterUsage(&registersUsed);
sint32 accessedTempReg[5];
// intermediate FPRs
accessedTempReg[0] = registersUsed.readFPR1;
accessedTempReg[1] = registersUsed.readFPR2;
accessedTempReg[2] = registersUsed.readFPR3;
accessedTempReg[3] = registersUsed.readFPR4;
accessedTempReg[4] = registersUsed.writtenFPR1;
for (sint32 f = 0; f < 5; f++)
{
if (accessedTempReg[f] == -1)
continue;
uint32 regName = ppcImlGenContext.mappedFPRRegister[accessedTempReg[f]];
if (regName >= PPCREC_NAME_FPR0 && regName < PPCREC_NAME_FPR0 + 32)
{
segIt->ppcFPRUsed[regName - PPCREC_NAME_FPR0] = true;
}
}
}
}
// merge certain float load+store patterns (must happen before FPR register remapping)
PPCRecompiler_optimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
PPCRecompiler_optimizeDirectIntegerCopies(&ppcImlGenContext);
if (numLoadedFPRRegisters > 0)
{
if (PPCRecompiler_manageFPRRegisters(&ppcImlGenContext) == false)
{
return false;
}
}
IMLRegisterAllocatorParameters raParam;
raParam.physicalRegisterPool.SetAvailable(X86_REG_RAX);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RDX);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RBX);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RBP);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RSI);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RDI);
raParam.physicalRegisterPool.SetAvailable(X86_REG_R8);
raParam.physicalRegisterPool.SetAvailable(X86_REG_R9);
raParam.physicalRegisterPool.SetAvailable(X86_REG_R10);
raParam.physicalRegisterPool.SetAvailable(X86_REG_R11);
raParam.physicalRegisterPool.SetAvailable(X86_REG_R12);
raParam.physicalRegisterPool.SetAvailable(X86_REG_RCX);
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext, raParam);
// remove redundant name load and store instructions
PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
return true;
}
bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector<std::pair<MPTR, uint32>>& entryPoints)
{
// update jump table
PPCRecompilerState.recompilerSpinlock.lock();
// check if the initial entrypoint is still flagged for recompilation
// its possible that the range has been invalidated during the time it took to translate the function
if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[initialEntryPoint / 4] != PPCRecompiler_leaveRecompilerCode_visited)
{
PPCRecompilerState.recompilerSpinlock.unlock();
return false;
}
// check if the current range got invalidated in the time it took to recompile it
bool isInvalidated = false;
for (auto& invRange : PPCRecompilerState.invalidationRanges)
{
MPTR rStartAddr = invRange.startAddress;
MPTR rEndAddr = rStartAddr + invRange.size;
for (auto& recFuncRange : ppcRecFunc->list_ranges)
{
if (recFuncRange.ppcAddress < (rEndAddr) && (recFuncRange.ppcAddress + recFuncRange.ppcSize) >= rStartAddr)
{
isInvalidated = true;
break;
}
}
}
PPCRecompilerState.invalidationRanges.clear();
if (isInvalidated)
{
PPCRecompilerState.recompilerSpinlock.unlock();
return false;
}
// update jump table
for (auto& itr : entryPoints)
{
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[itr.first / 4] = (PPCREC_JUMP_ENTRY)((uint8*)ppcRecFunc->x86Code + itr.second);
}
// due to inlining, some entrypoints can get optimized away
// therefore we reset all addresses that are still marked as visited (but not recompiled)
// we dont remove the points from the queue but any address thats not marked as visited won't get recompiled
// if they are reachable, the interpreter will queue them again
for (uint32 v = range.startAddress; v <= (range.startAddress + range.length); v += 4)
{
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[v / 4];
if (funcPtr == PPCRecompiler_leaveRecompilerCode_visited)
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[v / 4] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
// register ranges
for (auto& r : ppcRecFunc->list_ranges)
{
r.storedRange = rangeStore_ppcRanges.storeRange(ppcRecFunc, r.ppcAddress, r.ppcAddress + r.ppcSize);
}
PPCRecompilerState.recompilerSpinlock.unlock();
return true;
}
void PPCRecompiler_recompileAtAddress(uint32 address)
{
cemu_assert_debug(ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[address / 4] == PPCRecompiler_leaveRecompilerCode_visited);
// get size
PPCFunctionBoundaryTracker funcBoundaries;
funcBoundaries.trackStartPoint(address);
// get range that encompasses address
PPCFunctionBoundaryTracker::PPCRange_t range;
if (funcBoundaries.getRangeForAddress(address, range) == false)
{
cemu_assert_debug(false);
}
// todo - use info from previously compiled ranges to determine full size of this function (and merge all the entryAddresses)
// collect all currently known entry points for this range
PPCRecompilerState.recompilerSpinlock.lock();
std::set<uint32> entryAddresses;
entryAddresses.emplace(address);
PPCRecompilerState.recompilerSpinlock.unlock();
std::vector<std::pair<MPTR, uint32>> functionEntryPoints;
auto func = PPCRecompiler_recompileFunction(range, entryAddresses, functionEntryPoints, funcBoundaries);
if (!func)
{
return; // recompilation failed
}
bool r = PPCRecompiler_makeRecompiledFunctionActive(address, range, func, functionEntryPoints);
}
std::thread s_threadRecompiler;
std::atomic_bool s_recompilerThreadStopSignal{false};
void PPCRecompiler_thread()
{
SetThreadName("PPCRecompiler");
while (true)
{
if(s_recompilerThreadStopSignal)
return;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
// asynchronous recompilation:
// 1) take address from queue
// 2) check if address is still marked as visited
// 3) if yes -> calculate size, gather all entry points, recompile and update jump table
while (true)
{
PPCRecompilerState.recompilerSpinlock.lock();
if (PPCRecompilerState.targetQueue.empty())
{
PPCRecompilerState.recompilerSpinlock.unlock();
break;
}
auto enterAddress = PPCRecompilerState.targetQueue.front();
PPCRecompilerState.targetQueue.pop();
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
// only recompile functions if marked as visited
PPCRecompilerState.recompilerSpinlock.unlock();
continue;
}
PPCRecompilerState.recompilerSpinlock.unlock();
PPCRecompiler_recompileAtAddress(enterAddress);
if(s_recompilerThreadStopSignal)
return;
}
}
}
#define PPC_REC_ALLOC_BLOCK_SIZE (4*1024*1024) // 4MB
constexpr uint32 PPCRecompiler_GetNumAddressSpaceBlocks()
{
return (MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE + PPC_REC_ALLOC_BLOCK_SIZE - 1) / PPC_REC_ALLOC_BLOCK_SIZE;
}
std::bitset<PPCRecompiler_GetNumAddressSpaceBlocks()> ppcRecompiler_reservedBlockMask;
void PPCRecompiler_reserveLookupTableBlock(uint32 offset)
{
uint32 blockIndex = offset / PPC_REC_ALLOC_BLOCK_SIZE;
offset = blockIndex * PPC_REC_ALLOC_BLOCK_SIZE;
if (ppcRecompiler_reservedBlockMask[blockIndex])
return;
ppcRecompiler_reservedBlockMask[blockIndex] = true;
void* p1 = MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->ppcRecompilerFuncTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), MemMapper::PAGE_PERMISSION::P_RW, true);
void* p3 = MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), MemMapper::PAGE_PERMISSION::P_RW, true);
if( !p1 || !p3 )
{
cemuLog_log(LogType::Force, "Failed to allocate memory for recompiler (0x{:08x})", offset);
cemu_assert(false);
return;
}
for(uint32 i=0; i<PPC_REC_ALLOC_BLOCK_SIZE/4; i++)
{
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset/4+i] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
}
void PPCRecompiler_allocateRange(uint32 startAddress, uint32 size)
{
if (ppcRecompilerInstanceData == nullptr)
return;
uint32 endAddress = (startAddress + size + PPC_REC_ALLOC_BLOCK_SIZE - 1) & ~(PPC_REC_ALLOC_BLOCK_SIZE-1);
startAddress = (startAddress) & ~(PPC_REC_ALLOC_BLOCK_SIZE-1);
startAddress = std::min(startAddress, (uint32)MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE);
endAddress = std::min(endAddress, (uint32)MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE);
for (uint32 i = startAddress; i < endAddress; i += PPC_REC_ALLOC_BLOCK_SIZE)
{
PPCRecompiler_reserveLookupTableBlock(i);
}
}
struct ppcRecompilerFuncRange_t
{
MPTR ppcStart;
uint32 ppcSize;
void* x86Start;
size_t x86Size;
};
bool PPCRecompiler_findFuncRanges(uint32 addr, ppcRecompilerFuncRange_t* rangesOut, size_t* countInOut)
{
PPCRecompilerState.recompilerSpinlock.lock();
size_t countIn = *countInOut;
size_t countOut = 0;
rangeStore_ppcRanges.findRanges(addr, addr + 4, [rangesOut, countIn, &countOut](uint32 start, uint32 end, PPCRecFunction_t* func)
{
if (countOut < countIn)
{
rangesOut[countOut].ppcStart = start;
rangesOut[countOut].ppcSize = (end-start);
rangesOut[countOut].x86Start = func->x86Code;
rangesOut[countOut].x86Size = func->x86Size;
}
countOut++;
}
);
PPCRecompilerState.recompilerSpinlock.unlock();
*countInOut = countOut;
if (countOut > countIn)
return false;
return true;
}
extern "C" DLLEXPORT uintptr_t * PPCRecompiler_getJumpTableBase()
{
if (ppcRecompilerInstanceData == nullptr)
return nullptr;
return (uintptr_t*)ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable;
}
void PPCRecompiler_invalidateTableRange(uint32 offset, uint32 size)
{
if (ppcRecompilerInstanceData == nullptr)
return;
for (uint32 i = 0; i < size / 4; i++)
{
ppcRecompilerInstanceData->ppcRecompilerFuncTable[offset / 4 + i] = nullptr;
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset / 4 + i] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
}
void PPCRecompiler_deleteFunction(PPCRecFunction_t* func)
{
// assumes PPCRecompilerState.recompilerSpinlock is already held
cemu_assert_debug(PPCRecompilerState.recompilerSpinlock.is_locked());
for (auto& r : func->list_ranges)
{
PPCRecompiler_invalidateTableRange(r.ppcAddress, r.ppcSize);
if(r.storedRange)
rangeStore_ppcRanges.deleteRange(r.storedRange);
r.storedRange = nullptr;
}
// todo - free x86 code
}
void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr)
{
if (ppcRecompilerEnabled == false)
return;
if (startAddr >= PPC_REC_CODE_AREA_SIZE)
return;
cemu_assert_debug(endAddr >= startAddr);
PPCRecompilerState.recompilerSpinlock.lock();
uint32 rStart;
uint32 rEnd;
PPCRecFunction_t* rFunc;
// mark range as unvisited
for (uint64 currentAddr = (uint64)startAddr&~3; currentAddr < (uint64)(endAddr&~3); currentAddr += 4)
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[currentAddr / 4] = PPCRecompiler_leaveRecompilerCode_unvisited;
// add entry to invalidation queue
PPCRecompilerState.invalidationRanges.emplace_back(startAddr, endAddr-startAddr);
while (rangeStore_ppcRanges.findFirstRange(startAddr, endAddr, rStart, rEnd, rFunc) )
{
PPCRecompiler_deleteFunction(rFunc);
}
PPCRecompilerState.recompilerSpinlock.unlock();
}
#if defined(ARCH_X86_64)
void PPCRecompiler_initPlatform()
{
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
}
#else
void PPCRecompiler_initPlatform()
{
}
#endif
void PPCRecompiler_init()
{
if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter)
{
ppcRecompilerEnabled = false;
return;
}
if (LaunchSettings::ForceInterpreter())
{
cemuLog_log(LogType::Force, "Recompiler disabled. Command line --force-interpreter was passed");
return;
}
if (ppcRecompilerInstanceData)
{
MemMapper::FreeReservation(ppcRecompilerInstanceData, sizeof(PPCRecompilerInstanceData_t));
ppcRecompilerInstanceData = nullptr;
}
debug_printf("Allocating %dMB for recompiler instance data...\n", (sint32)(sizeof(PPCRecompilerInstanceData_t) / 1024 / 1024));
ppcRecompilerInstanceData = (PPCRecompilerInstanceData_t*)MemMapper::ReserveMemory(nullptr, sizeof(PPCRecompilerInstanceData_t), MemMapper::PAGE_PERMISSION::P_RW);
MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom), sizeof(PPCRecompilerInstanceData_t) - offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskBottom), MemMapper::PAGE_PERMISSION::P_RW, true);
PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions();
PPCRecompiler_allocateRange(0, 0x1000); // the first entry is used for fallback to interpreter
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());
// setup GQR scale tables
for (uint32 i = 0; i < 32; i++)
{
float a = 1.0f / (float)(1u << i);
float b = 0;
if (i == 0)
b = 4294967296.0f;
else
b = (float)(1u << (32u - i));
float ar = (float)(1u << i);
float br = 0;
if (i == 0)
br = 1.0f / 4294967296.0f;
else
br = 1.0f / (float)(1u << (32u - i));
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 1] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 1] = b;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 1] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
}
PPCRecompiler_initPlatform();
cemuLog_log(LogType::Force, "Recompiler initialized");
ppcRecompilerEnabled = true;
// launch recompilation thread
s_recompilerThreadStopSignal = false;
s_threadRecompiler = std::thread(PPCRecompiler_thread);
}
void PPCRecompiler_Shutdown()
{
// shut down recompiler thread
s_recompilerThreadStopSignal = true;
if(s_threadRecompiler.joinable())
s_threadRecompiler.join();
// clean up queues
while(!PPCRecompilerState.targetQueue.empty())
PPCRecompilerState.targetQueue.pop();
PPCRecompilerState.invalidationRanges.clear();
// clean range store
rangeStore_ppcRanges.clear();
// clean up memory
uint32 numBlocks = PPCRecompiler_GetNumAddressSpaceBlocks();
for(uint32 i=0; i<numBlocks; i++)
{
if(!ppcRecompiler_reservedBlockMask[i])
continue;
// deallocate
uint64 offset = i * PPC_REC_ALLOC_BLOCK_SIZE;
MemMapper::FreeMemory(&(ppcRecompilerInstanceData->ppcRecompilerFuncTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), true);
MemMapper::FreeMemory(&(ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), true);
// mark as unmapped
ppcRecompiler_reservedBlockMask[i] = false;
}
}