PPCRec: Fixes and optimizations + rework FRES/FRSQRTE

This commit is contained in:
Exzap 2024-10-23 08:36:12 +02:00
parent 89f8f9bd2a
commit f94f99546d
13 changed files with 408 additions and 354 deletions

View file

@ -32,7 +32,7 @@ espresso_frsqrte_entry_t frsqrteLookupTable[32] =
{0x20c1000, 0x35e},{0x1f12000, 0x332},{0x1d79000, 0x30a},{0x1bf4000, 0x2e6},
};
double frsqrte_espresso(double input)
ATTR_MS_ABI double frsqrte_espresso(double input)
{
unsigned long long x = *(unsigned long long*)&input;
@ -111,7 +111,7 @@ espresso_fres_entry_t fresLookupTable[32] =
{0x88400, 0x11a}, {0x65000, 0x11a}, {0x41c00, 0x108}, {0x20c00, 0x106}
};
double fres_espresso(double input)
ATTR_MS_ABI double fres_espresso(double input)
{
// based on testing we know that fres uses only the first 15 bits of the mantissa
// seee eeee eeee mmmm mmmm mmmm mmmx xxxx .... (s = sign, e = exponent, m = mantissa, x = not used)

View file

@ -191,8 +191,8 @@ inline double roundTo25BitAccuracy(double d)
return *(double*)&v;
}
double fres_espresso(double input);
double frsqrte_espresso(double input);
ATTR_MS_ABI double fres_espresso(double input);
ATTR_MS_ABI double frsqrte_espresso(double input);
void fcmpu_espresso(PPCInterpreter_t* hCPU, int crfD, double a, double b);

View file

@ -601,8 +601,10 @@ void PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRe
// Emit x86-64 machine code for an IML CALL_IMM instruction: an absolute call to
// imlInstruction->op_call_imm.callAddress through RAX.
// Parameter/return-value register placement and volatile register spilling are
// not handled here; the register allocator has already constrained those
// (see the CALL_IMM handling in GetInstructionFixedRegisters).
void PPCRecompilerX64Gen_imlInstruction_call_imm(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
// the register allocator takes care of spilling volatile registers and moving parameters to the right registers, so we don't need to do any special handling here
// 0x28 = 40 bytes; presumably 32 bytes of Win64 shadow space plus 8 bytes to keep RSP 16-aligned after the call pushes a return address — TODO confirm against the target ABI
x64GenContext->emitter->SUB_qi8(X86_REG_RSP, 0x28); // reserve enough space for any parameters while keeping stack alignment of 16 intact
// load the target address into RAX and do an indirect call (a direct rel32 call cannot reach arbitrary 64-bit addresses)
x64GenContext->emitter->MOV_qi64(X86_REG_RAX, imlInstruction->op_call_imm.callAddress);
x64GenContext->emitter->CALL_q(X86_REG_RAX);
// release the reserved stack space
x64GenContext->emitter->ADD_qi8(X86_REG_RSP, 0x28);
}
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)

View file

@ -780,18 +780,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// move to FPR register
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP )
{
// move register to XMM15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// call assembly routine to calculate accurate FRES result in XMM15
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
// copy result to bottom and top half of result register
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
{
// move register to XMM15

View file

@ -363,7 +363,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
operation == PPCREC_IML_OP_ASSIGN ||
operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
operation == PPCREC_IML_OP_FPR_FRES_PAIR ||

View file

@ -143,7 +143,6 @@ enum
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, // calculate reciprocal with Espresso accuracy of source bottom half and write result to destination bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated

View file

@ -10,9 +10,16 @@
#include <boost/container/static_vector.hpp>
#include <boost/container/small_vector.hpp>
#include "Common/cpu_features.h"
#define DEBUG_RA_EXTRA_VALIDATION 0 // if set to non-zero, additional expensive validation checks will be performed
#define DEBUG_RA_INSTRUCTION_GEN 0
struct IMLRARegAbstractLiveness // preliminary liveness info. One entry per register and segment
{
IMLRARegAbstractLiveness(IMLRegFormat regBaseFormat, sint32 usageStart, sint32 usageEnd) : regBaseFormat(regBaseFormat), usageStart(usageStart), usageEnd(usageEnd) {};
IMLRARegAbstractLiveness(IMLRegFormat regBaseFormat, sint32 usageStart, sint32 usageEnd)
: regBaseFormat(regBaseFormat), usageStart(usageStart), usageEnd(usageEnd) {};
void TrackInstruction(sint32 index)
{
@ -34,7 +41,6 @@ struct IMLRegisterAllocatorContext
std::unordered_map<IMLRegID, IMLRegFormat> regIdToBaseFormat; // a vector would be more efficient but it also means that reg ids have to be continuous and not completely arbitrary
// first pass
std::vector<std::unordered_map<IMLRegID, IMLRARegAbstractLiveness>> perSegmentAbstractRanges;
// second pass
// helper methods
inline std::unordered_map<IMLRegID, IMLRARegAbstractLiveness>& GetSegmentAbstractRangeMap(IMLSegment* imlSegment)
@ -48,38 +54,117 @@ struct IMLRegisterAllocatorContext
cemu_assert_debug(it != regIdToBaseFormat.cend());
return it->second;
}
};
struct IMLFixedRegisters
{
struct Entry
{
Entry(IMLReg reg, IMLPhysRegisterSet physRegSet) : reg(reg), physRegSet(physRegSet) {}
Entry(IMLReg reg, IMLPhysRegisterSet physRegSet)
: reg(reg), physRegSet(physRegSet) {}
IMLReg reg;
IMLPhysRegisterSet physRegSet;
};
boost::container::small_vector<Entry, 4> listInput; // fixed registers for instruction input edge
boost::container::small_vector<Entry, 4> listOutput; // fixed registers for instruction output edge
boost::container::small_vector<Entry, 4> listInput; // fixed register requirements for instruction input edge
boost::container::small_vector<Entry, 4> listOutput; // fixed register requirements for instruction output edge
};
static void SetupCallingConvention(const IMLInstruction* instruction, IMLFixedRegisters& fixedRegs, const IMLPhysReg intParamToPhysReg[3], const IMLPhysReg floatParamToPhysReg[3], const IMLPhysReg intReturnPhysReg, const IMLPhysReg floatReturnPhysReg, IMLPhysRegisterSet volatileRegisters)
{
sint32 numIntParams = 0, numFloatParams = 0;
auto AddParameterMapping = [&](IMLReg reg) {
if (!reg.IsValid())
return;
if (reg.GetBaseFormat() == IMLRegFormat::I64)
{
IMLPhysRegisterSet ps;
ps.SetAvailable(intParamToPhysReg[numIntParams]);
fixedRegs.listInput.emplace_back(reg, ps);
numIntParams++;
}
else if (reg.GetBaseFormat() == IMLRegFormat::F64)
{
IMLPhysRegisterSet ps;
ps.SetAvailable(floatParamToPhysReg[numFloatParams]);
fixedRegs.listInput.emplace_back(reg, ps);
numFloatParams++;
}
else
{
cemu_assert_suspicious();
}
};
AddParameterMapping(instruction->op_call_imm.regParam0);
AddParameterMapping(instruction->op_call_imm.regParam1);
AddParameterMapping(instruction->op_call_imm.regParam2);
// return value
if (instruction->op_call_imm.regReturn.IsValid())
{
IMLRegFormat returnFormat = instruction->op_call_imm.regReturn.GetBaseFormat();
bool isIntegerFormat = returnFormat == IMLRegFormat::I64 || returnFormat == IMLRegFormat::I32 || returnFormat == IMLRegFormat::I16 || returnFormat == IMLRegFormat::I8;
IMLPhysRegisterSet ps;
if (isIntegerFormat)
{
ps.SetAvailable(intReturnPhysReg);
volatileRegisters.SetReserved(intReturnPhysReg);
}
else
{
ps.SetAvailable(floatReturnPhysReg);
volatileRegisters.SetReserved(floatReturnPhysReg);
}
fixedRegs.listOutput.emplace_back(instruction->op_call_imm.regReturn, ps);
}
// block volatile registers from being used on the output edge, this makes the register allocator store them during the call
fixedRegs.listOutput.emplace_back(IMLREG_INVALID, volatileRegisters);
}
#if defined(__aarch64__)
// aarch64
// Determine the fixed physical register constraints for the given instruction
// and store them in fixedRegs (input-edge and output-edge lists).
// On aarch64 the only anticipated use is modeling the calling convention for
// CALL_IMM; the implementation is untested scaffolding and currently disabled,
// so this always trips cemu_assert_unimplemented().
static void GetInstructionFixedRegisters(IMLInstruction* instruction, IMLFixedRegisters& fixedRegs)
{
fixedRegs.listInput.clear();
fixedRegs.listOutput.clear();
// code below for aarch64 has not been tested
// The purpose of GetInstructionFixedRegisters() is to constrain virtual registers to specific physical registers for instructions which need it
// on x86 this is used for instructions like SHL <reg>, CL where the CL register is hardwired. On aarch it's probably only necessary for setting up the calling convention
cemu_assert_unimplemented();
#if 0 // fixed: was "#ifdef 0", which is ill-formed (#ifdef takes an identifier, not a literal). Untested reference code, intentionally disabled.
if (instruction->type == PPCREC_IML_TYPE_CALL_IMM)
{
const IMLPhysReg intParamToPhysReg[3] = {IMLArchAArch64::PHYSREG_GPR_BASE + 0, IMLArchAArch64::PHYSREG_GPR_BASE + 1, IMLArchAArch64::PHYSREG_GPR_BASE + 2};
const IMLPhysReg floatParamToPhysReg[3] = {IMLArchAArch64::PHYSREG_FPR_BASE + 0, IMLArchAArch64::PHYSREG_FPR_BASE + 1, IMLArchAArch64::PHYSREG_FPR_BASE + 2};
IMLPhysRegisterSet volatileRegs;
for (int i=0; i<19; i++) // x0 to x18 are volatile
volatileRegs.SetAvailable(IMLArchAArch64::PHYSREG_GPR_BASE + i);
for (int i = 0; i <= 31; i++) // which float registers are volatile?
volatileRegs.SetAvailable(IMLArchAArch64::PHYSREG_FPR_BASE + i);
SetupCallingConvention(instruction, fixedRegs, intParamToPhysReg, floatParamToPhysReg, IMLArchAArch64::PHYSREG_GPR_BASE + 0, IMLArchAArch64::PHYSREG_FPR_BASE + 0, volatileRegs);
}
#endif
}
#else
// x86-64
static void GetInstructionFixedRegisters(IMLInstruction* instruction, IMLFixedRegisters& fixedRegs)
{
fixedRegs.listInput.clear();
fixedRegs.listOutput.clear();
// x86 specific logic is hardcoded for now
if (instruction->type == PPCREC_IML_TYPE_R_R_R)
{
if (instruction->operation == PPCREC_IML_OP_LEFT_SHIFT || instruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S || instruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U)
{
// todo: We can skip this if g_CPUFeatures.x86.bmi2 is set, but for now we just assume it's not so we can properly test increased register pressure
if(!g_CPUFeatures.x86.bmi2)
{
IMLPhysRegisterSet ps;
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_ECX);
fixedRegs.listInput.emplace_back(instruction->op_r_r_r.regB, ps);
}
}
}
else if (instruction->type == PPCREC_IML_TYPE_ATOMIC_CMP_STORE)
{
IMLPhysRegisterSet ps;
@ -88,38 +173,24 @@ static void GetInstructionFixedRegisters(IMLInstruction* instruction, IMLFixedRe
}
else if (instruction->type == PPCREC_IML_TYPE_CALL_IMM)
{
// parameters (todo)
cemu_assert_debug(!instruction->op_call_imm.regParam0.IsValid());
cemu_assert_debug(!instruction->op_call_imm.regParam1.IsValid());
cemu_assert_debug(!instruction->op_call_imm.regParam2.IsValid());
// return value
if(instruction->op_call_imm.regReturn.IsValid())
{
IMLRegFormat returnFormat = instruction->op_call_imm.regReturn.GetBaseFormat();
bool isIntegerFormat = returnFormat == IMLRegFormat::I64 || returnFormat == IMLRegFormat::I32 || returnFormat == IMLRegFormat::I16 || returnFormat == IMLRegFormat::I8;
cemu_assert_debug(isIntegerFormat); // float return values are still todo
IMLPhysRegisterSet ps;
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_EAX);
fixedRegs.listOutput.emplace_back(instruction->op_call_imm.regReturn, ps);
}
// block volatile registers from being used on the output edge, this makes the RegAlloc store them during the call
IMLPhysRegisterSet ps;
if(!instruction->op_call_imm.regReturn.IsValid())
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_RAX);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_RCX);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_RDX);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_R8);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_R9);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_R10);
ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_R11);
const IMLPhysReg intParamToPhysReg[3] = {IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RCX, IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RDX, IMLArchX86::PHYSREG_GPR_BASE + X86_REG_R8};
const IMLPhysReg floatParamToPhysReg[3] = {IMLArchX86::PHYSREG_FPR_BASE + 0, IMLArchX86::PHYSREG_FPR_BASE + 1, IMLArchX86::PHYSREG_FPR_BASE + 2};
IMLPhysRegisterSet volatileRegs;
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RAX);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RCX);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RDX);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_R8);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_R9);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_R10);
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_R11);
// YMM0-YMM5 are volatile
for (int i = 0; i <= 5; i++)
ps.SetAvailable(IMLArchX86::PHYSREG_FPR_BASE+i); // YMM0-YMM5 are volatile
volatileRegs.SetAvailable(IMLArchX86::PHYSREG_FPR_BASE + i);
// for YMM6-YMM15 only the upper 128 bits are volatile which we dont use
fixedRegs.listOutput.emplace_back(IMLREG_INVALID, ps);
SetupCallingConvention(instruction, fixedRegs, intParamToPhysReg, floatParamToPhysReg, IMLArchX86::PHYSREG_GPR_BASE + X86_REG_EAX, IMLArchX86::PHYSREG_FPR_BASE + 0, volatileRegs);
}
}
#endif
uint32 PPCRecRA_getNextIterationIndex()
{
@ -315,32 +386,6 @@ struct IMLRALivenessTimeline
activeRanges.emplace_back(subrange);
}
// remove all ranges from activeRanges with end <= instructionIndex
void ExpireRanges(sint32 instructionIndex)
{
__debugbreak(); // maybe replace calls with raInstructionEdge variant?
expiredRanges.clear();
size_t count = activeRanges.size();
for (size_t f = 0; f < count; f++)
{
raLivenessRange* liverange = activeRanges[f];
if (liverange->interval2.end.GetInstructionIndex() < instructionIndex) // <= to < since end is now inclusive
{
#ifdef CEMU_DEBUG_ASSERT
if (instructionIndex != RA_INTER_RANGE_END && (liverange->subrangeBranchTaken || liverange->subrangeBranchNotTaken))
assert_dbg(); // infinite subranges should not expire
#endif
expiredRanges.emplace_back(liverange);
// remove entry
activeRanges[f] = activeRanges[count-1];
f--;
count--;
}
}
if(count != activeRanges.size())
activeRanges.resize(count);
}
void ExpireRanges(raInstructionEdge expireUpTo)
{
expiredRanges.clear();
@ -425,7 +470,10 @@ void PPCRecRA_MaskOverlappingPhysRegForGlobalRange(raLivenessRange* range2, IMLP
}
}
bool _livenessRangeStartCompare(raLivenessRange* lhs, raLivenessRange* rhs) { return lhs->interval2.start < rhs->interval2.start; }
bool _livenessRangeStartCompare(raLivenessRange* lhs, raLivenessRange* rhs)
{
return lhs->interval2.start < rhs->interval2.start;
}
void _sortSegmentAllSubrangesLinkedList(IMLSegment* imlSegment)
{
@ -460,7 +508,7 @@ void _sortSegmentAllSubrangesLinkedList(IMLSegment* imlSegment)
subrangeList[i]->link_allSegmentRanges.next = subrangeList[i + 1];
}
// validate list
#ifdef CEMU_DEBUG_ASSERT
#if DEBUG_RA_EXTRA_VALIDATION
sint32 count2 = 0;
subrangeItr = imlSegment->raInfo.linkedList_allSubranges;
raInstructionEdge currentStartPosition;
@ -544,7 +592,7 @@ boost::container::small_vector<raLivenessRange*, 8> IMLRA_GetRangeWithFixedRegRe
void IMLRA_HandleFixedRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment)
{
// first pass - iterate over all ranges with fixed register requirements and split them if they cross the segment border
// todo - this can be optimized. Ranges only need to be split if there are conflicts with other segments. Note that below passes rely on the fact that this pass currently splits all ranges with fixed register requirements
// todo - this pass currently creates suboptimal results by splitting all ranges that cross the segment border if they have any fixed register requirement. This isn't always necessary
for (raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange;)
{
IMLPhysRegisterSet allowedRegs;
@ -608,7 +656,6 @@ void IMLRA_HandleFixedRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLSegment
PPCRecRA_splitLocalSubrange2(ppcImlGenContext, range, entry.pos, true);
}
}
}
}
// finally iterate ranges and assign fixed registers
@ -626,7 +673,7 @@ void IMLRA_HandleFixedRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLSegment
currentRange->SetPhysicalRegister(allowedRegs.GetFirstAvailableReg());
}
// DEBUG - check for collisions and make sure all ranges with fixed register requirements got their physical register assigned
#ifdef CEMU_DEBUG_ASSERT
#if DEBUG_RA_EXTRA_VALIDATION
for (raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next)
{
IMLPhysRegisterSet allowedRegs;
@ -964,7 +1011,6 @@ private:
} explodeRange;
};
class RASpillStrategy_ExplodeRangeInter : public RASpillStrategy
{
public:
@ -1108,15 +1154,13 @@ bool IMLRA_AssignSegmentRegisters(IMLRegisterAllocatorContext& ctx, ppcImlGenCon
raInstructionEdge currentRangeStart = subrangeItr->interval2.start; // used to be currentIndex before refactor
PPCRecRA_debugValidateSubrange(subrangeItr);
// below used to be: std::min<sint32>(currentIndex, RA_INTER_RANGE_END-1)
livenessTimeline.ExpireRanges((currentRangeStart > lastInstructionEdge) ? lastInstructionEdge : currentRangeStart); // expire up to currentIndex (inclusive), but exclude infinite ranges
// note: The logic here is complicated in regards to whether the instruction index should be inclusive or exclusive. Find a way to simplify?
// if subrange already has register assigned then add it to the active list and continue
if (subrangeItr->GetPhysicalRegister() >= 0)
{
// verify if register is actually available
#ifdef CEMU_DEBUG_ASSERT
#if DEBUG_RA_EXTRA_VALIDATION
for (auto& liverangeItr : livenessTimeline.activeRanges)
{
// check for register mismatch
@ -1176,8 +1220,7 @@ bool IMLRA_AssignSegmentRegisters(IMLRegisterAllocatorContext& ctx, ppcImlGenCon
// cant assign register
// there might be registers available, we just can't use them due to range conflicts
RASpillStrategy* selectedStrategy = nullptr;
auto SelectStrategyIfBetter = [&selectedStrategy](RASpillStrategy& newStrategy)
{
auto SelectStrategyIfBetter = [&selectedStrategy](RASpillStrategy& newStrategy) {
if (newStrategy.GetCost() == INT_MAX)
return;
if (selectedStrategy == nullptr || newStrategy.GetCost() < selectedStrategy->GetCost())
@ -1366,9 +1409,7 @@ raLivenessRange* PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx
}
abstractRange->isProcessed = true;
// create subrange
#ifdef CEMU_DEBUG_ASSERT
cemu_assert_debug(IMLRA_GetSubrange(imlSegment, vGPR) == nullptr);
#endif
cemu_assert_debug(
(abstractRange->usageStart == abstractRange->usageEnd && (abstractRange->usageStart == RA_INTER_RANGE_START || abstractRange->usageStart == RA_INTER_RANGE_END)) ||
abstractRange->usageStart < abstractRange->usageEnd); // usageEnd is exclusive so it should always be larger
@ -1414,16 +1455,6 @@ raLivenessRange* PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx
PPCRecRA_convertToMappedRanges(ctx, it, vGPR, name);
}
}
// for subranges which exit the segment at the end there is a hard requirement that they cover the suffix instruction
// this is due to range load instructions being inserted before the suffix instruction
// todo - currently later steps might break this assumption, look into this
// if (subrange->interval2.ExtendsIntoNextSegment())
// {
// if (imlSegment->HasSuffixInstruction())
// {
// cemu_assert_debug(subrange->interval2.start.GetInstructionIndex() <= imlSegment->GetSuffixInstructionIndex());
// }
// }
return subrange;
}
@ -1432,8 +1463,7 @@ void IMLRA_ConvertAbstractToLivenessRanges(IMLRegisterAllocatorContext& ctx, IML
{
const std::unordered_map<IMLRegID, raLivenessRange*>& regToSubrange = IMLRA_GetSubrangeMap(imlSegment);
auto AddOrUpdateFixedRegRequirement = [&](IMLRegID regId, sint32 instructionIndex, bool isInput, const IMLPhysRegisterSet& physRegSet)
{
auto AddOrUpdateFixedRegRequirement = [&](IMLRegID regId, sint32 instructionIndex, bool isInput, const IMLPhysRegisterSet& physRegSet) {
raLivenessRange* subrange = regToSubrange.find(regId)->second;
cemu_assert_debug(subrange);
raFixedRegRequirement tmp;
@ -1624,8 +1654,12 @@ void PPCRecRA_followFlowAndExtendRanges(IMLRegisterAllocatorContext& ctx, IMLSeg
list_segments.reserve(segmentCount + 1);
list_processedSegment.resize(segmentCount);
auto markSegProcessed = [&list_processedSegment](IMLSegment* seg) {list_processedSegment[seg->momentaryIndex] = true; };
auto isSegProcessed = [&list_processedSegment](IMLSegment* seg) -> bool { return list_processedSegment[seg->momentaryIndex]; };
auto markSegProcessed = [&list_processedSegment](IMLSegment* seg) {
list_processedSegment[seg->momentaryIndex] = true;
};
auto isSegProcessed = [&list_processedSegment](IMLSegment* seg) -> bool {
return list_processedSegment[seg->momentaryIndex];
};
markSegProcessed(imlSegment);
sint32 index = 0;
@ -1730,10 +1764,8 @@ void IMLRA_AnalyzeSubrangeDataDependency(raLivenessRange* subrange)
subrange->_noLoad = true;
}
struct subrangeEndingInfo_t
{
//boost::container::small_vector<raLivenessSubrange_t*, 32> subrangeList2;
raLivenessRange* subrangeList[SUBRANGE_LIST_SIZE];
sint32 subrangeCount;
@ -1870,8 +1902,6 @@ inline IMLReg _MakeNativeReg(IMLRegFormat baseFormat, IMLRegID regId)
return IMLReg(baseFormat, baseFormat, 0, regId);
}
#define DEBUG_RA_INSTRUCTION_GEN 0
// prepass for IMLRA_GenerateSegmentMoveInstructions which updates all virtual registers to their physical counterparts
void IMLRA_RewriteRegisters(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment)
{
@ -2036,7 +2066,6 @@ void IMLRA_GenerateSegmentMoveInstructions2(IMLRegisterAllocatorContext& ctx, IM
// range expires
// we cant erase it from virtId2PhysReg right away because a store might happen before the last use (the +1 thing above)
// todo - check hasStore
raLivenessRange* storedRange = *it;
if (storedRange->hasStore)
@ -2095,8 +2124,34 @@ void IMLRA_GenerateMoveInstructions(IMLRegisterAllocatorContext& ctx)
}
}
void DbgVerifyAllRanges(IMLRegisterAllocatorContext& ctx)
// Debug-only sanity check (compiled in when DEBUG_RA_EXTRA_VALIDATION is set):
// verify that every fixed register requirement recorded for the segment is
// honored by the liveness ranges overlapping that position.
static void DbgVerifyFixedRegRequirements(IMLSegment* imlSegment)
{
#if DEBUG_RA_EXTRA_VALIDATION
	const std::vector<raFixedRegRequirementWithVGPR> requirementList = IMLRA_BuildSegmentInstructionFixedRegList(imlSegment);
	for (const auto& requirement : requirementList)
	{
		for (raLivenessRange* liveRange = imlSegment->raInfo.linkedList_allSubranges; liveRange; liveRange = liveRange->link_allSegmentRanges.next)
		{
			// only ranges that are live at the requirement's position are relevant
			if (!liveRange->interval2.ContainsEdge(requirement.pos))
				continue;
			if (liveRange->GetVirtualRegister() == requirement.regId)
			{
				// the constrained virtual register must have been assigned one of the allowed physical registers
				cemu_assert(liveRange->HasPhysicalRegister());
				cemu_assert(requirement.allowedReg.IsAvailable(liveRange->GetPhysicalRegister())); // virtual register matches, but not assigned the right physical register
			}
			else
			{
				// any other overlapping range must stay out of the reserved physical register(s)
				cemu_assert(!requirement.allowedReg.IsAvailable(liveRange->GetPhysicalRegister())); // virtual register does not match, but using the reserved physical register
			}
		}
	}
#endif
}
static void DbgVerifyAllRanges(IMLRegisterAllocatorContext& ctx)
{
#if DEBUG_RA_EXTRA_VALIDATION
for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++)
{
IMLSegment* imlSegment = ctx.deprGenContext->segmentList2[s];
@ -2107,6 +2162,12 @@ void DbgVerifyAllRanges(IMLRegisterAllocatorContext& ctx)
subrangeItr = subrangeItr->link_allSegmentRanges.next;
}
}
// check that no range violates register requirements
for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++)
{
DbgVerifyFixedRegRequirements(ctx.deprGenContext->segmentList2[s]);
}
#endif
}
void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLRegisterAllocatorParameters& raParam)
@ -2121,7 +2182,7 @@ void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext
IMLRA_CalculateLivenessRanges(ctx);
IMLRA_ProcessFlowAndCalculateLivenessRanges(ctx);
IMLRA_AssignRegisters(ctx, ppcImlGenContext);
DbgVerifyAllRanges(ctx); // DEBUG
DbgVerifyAllRanges(ctx);
IMLRA_AnalyzeRangeDataFlow(ppcImlGenContext);
IMLRA_GenerateMoveInstructions(ctx);

View file

@ -17,9 +17,19 @@ public:
m_regBitmask &= ~((uint64)1 << index);
}
void SetAllAvailable()
{
m_regBitmask = ~0ull;
}
bool HasAllAvailable() const
{
return m_regBitmask == ~0ull;
}
bool IsAvailable(uint32 index) const
{
return (m_regBitmask & (1 << index)) != 0;
return (m_regBitmask & ((uint64)1 << index)) != 0;
}
IMLPhysRegisterSet& operator&=(const IMLPhysRegisterSet& other)

View file

@ -67,38 +67,30 @@ boost::container::small_vector<raLivenessRange*, 128> raLivenessRange::GetAllSub
return subranges;
}
// Recursively intersect allowedRegs with the fixed register requirements of
// the given range and of every range reachable from it in the same cluster
// (successor ranges via the branch-taken / branch-not-taken links, and all
// predecessor ranges).
// iterationIndex is a fresh marker value; it is stored in each visited range's
// lastIterationIndex so every range is processed at most once per traversal
// and cycles in the range graph terminate.
void raLivenessRange::GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs)
{
// mark as visited for this traversal
range->lastIterationIndex = iterationIndex;
// apply this range's own fixed register requirements
for (auto& it : range->list_fixedRegRequirements)
allowedRegs &= it.allowedReg;
// check successors
if (range->subrangeBranchTaken && range->subrangeBranchTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchTaken, iterationIndex, allowedRegs);
if (range->subrangeBranchNotTaken && range->subrangeBranchNotTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchNotTaken, iterationIndex, allowedRegs);
// check predecessors
for (auto& prev : range->previousRanges)
{
if (prev->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(prev, iterationIndex, allowedRegs);
}
};
bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters)
{
if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment())
{
auto clusterRanges = GetAllSubrangesInCluster();
bool hasAnyRequirement = false;
for(auto& subrange : clusterRanges)
{
if(subrange->list_fixedRegRequirements.empty())
continue;
allowedRegisters = subrange->list_fixedRegRequirements.front().allowedReg;
hasAnyRequirement = true;
break;
}
if(!hasAnyRequirement)
return false;
for(auto& subrange : clusterRanges)
{
for(auto& fixedRegLoc : subrange->list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
}
else
{
// local check only, slightly faster
if(list_fixedRegRequirements.empty())
return false;
allowedRegisters = list_fixedRegRequirements.front().allowedReg;
for(auto& fixedRegLoc : list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
return true;
uint32 iterationIndex = PPCRecRA_getNextIterationIndex();
allowedRegisters.SetAllAvailable();
GetAllowedRegistersExRecursive(this, iterationIndex, allowedRegisters);
return !allowedRegisters.HasAllAvailable();
}
IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool)
@ -424,6 +416,14 @@ void PPCRecRA_debugValidateSubrange(raLivenessRange* range)
cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx());
cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx());
}
// validate fixed reg requirements
if (!range->list_fixedRegRequirements.empty())
{
cemu_assert_debug(range->list_fixedRegRequirements.front().pos >= range->interval2.start);
cemu_assert_debug(range->list_fixedRegRequirements.back().pos <= range->interval2.end);
for(sint32 i = 0; i < (sint32)range->list_fixedRegRequirements.size()-1; i++)
cemu_assert_debug(range->list_fixedRegRequirements[i].pos < range->list_fixedRegRequirements[i+1].pos);
}
}
#else
@ -563,7 +563,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (tailInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
if (tailInterval.ContainsEdge(fixedReg->pos))
{
tailSubrange->list_fixedRegRequirements.push_back(*fixedReg);
}
@ -572,7 +572,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (!headInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex()))
if (!headInterval.ContainsEdge(fixedReg->pos))
{
subrange->list_fixedRegRequirements.resize(i);
break;

View file

@ -335,6 +335,9 @@ struct raLivenessRange
void SetPhysicalRegister(sint32 physicalRegister);
void SetPhysicalRegisterForCluster(sint32 physicalRegister);
void UnsetPhysicalRegister() { physicalRegister = -1; }
private:
void GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs);
};
raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition);

View file

@ -181,9 +181,6 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
}
}
// if(range.startAddress < 0x0202fa3C || range.startAddress > 0x0202FA7C)
// return nullptr; // DEBUG
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length;
@ -340,15 +337,6 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
//PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
//PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
// if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8 && ppcImlGenContext.debug_entryPPCAddress < 0x0240C0AC)
// {
// IMLDebug_Dump(&ppcImlGenContext);
// __debugbreak();
// }
// else if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8)
// return false;
return true;
}

View file

@ -1513,7 +1513,7 @@ bool PPCRecompilerImlGen_DCBZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
ppcImlGenContext->emitInst().make_r_r_r(PPCREC_IML_OP_ADD, regMemResEA, regA, regB);
else
ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_ASSIGN, regMemResEA, regB);
ppcImlGenContext->emitInst().make_r_s32(PPCREC_IML_OP_AND, regMemResEA, ~31);
ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, regMemResEA, regMemResEA, ~31);
// zero out the cacheline
for(sint32 i = 0; i < 32; i += 4)
ppcImlGenContext->emitInst().make_memory_r(regZero, regMemResEA, i, 32, false);

View file

@ -4,6 +4,9 @@
#include "PPCRecompilerIml.h"
#include "Cafe/GameProfile/GameProfile.h"
ATTR_MS_ABI double frsqrte_espresso(double input);
ATTR_MS_ABI double fres_espresso(double input);
IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit);
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
@ -1007,9 +1010,12 @@ bool PPCRecompilerImlGen_FRES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
// load registers
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterB);
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)fres_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
// copy result to top
if( ppcImlGenContext->PSE )
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
return true;
}
@ -1026,9 +1032,7 @@ bool PPCRecompilerImlGen_FRSP(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
}
PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, NULL,PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, fprRegisterD);
if( ppcImlGenContext->PSE )
{
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
}
return true;
}
@ -1075,7 +1079,7 @@ bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 op
// hCPU->fpr[frD].fpr = 1.0 / sqrt(hCPU->fpr[frB].fpr);
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT, fprRegisterD, fprRegisterB);
ppcImlGenContext->emitInst().make_call_imm((uintptr_t)frsqrte_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
return true;