PPCRec: Fixes and optimizations + rework FRES/FRSQRTE

This commit is contained in:
Exzap 2024-10-23 08:36:12 +02:00
parent 89f8f9bd2a
commit f94f99546d
13 changed files with 408 additions and 354 deletions

View file

@ -32,7 +32,7 @@ espresso_frsqrte_entry_t frsqrteLookupTable[32] =
{0x20c1000, 0x35e},{0x1f12000, 0x332},{0x1d79000, 0x30a},{0x1bf4000, 0x2e6}, {0x20c1000, 0x35e},{0x1f12000, 0x332},{0x1d79000, 0x30a},{0x1bf4000, 0x2e6},
}; };
double frsqrte_espresso(double input) ATTR_MS_ABI double frsqrte_espresso(double input)
{ {
unsigned long long x = *(unsigned long long*)&input; unsigned long long x = *(unsigned long long*)&input;
@ -111,7 +111,7 @@ espresso_fres_entry_t fresLookupTable[32] =
{0x88400, 0x11a}, {0x65000, 0x11a}, {0x41c00, 0x108}, {0x20c00, 0x106} {0x88400, 0x11a}, {0x65000, 0x11a}, {0x41c00, 0x108}, {0x20c00, 0x106}
}; };
double fres_espresso(double input) ATTR_MS_ABI double fres_espresso(double input)
{ {
// based on testing we know that fres uses only the first 15 bits of the mantissa // based on testing we know that fres uses only the first 15 bits of the mantissa
// seee eeee eeee mmmm mmmm mmmm mmmx xxxx .... (s = sign, e = exponent, m = mantissa, x = not used) // seee eeee eeee mmmm mmmm mmmm mmmx xxxx .... (s = sign, e = exponent, m = mantissa, x = not used)

View file

@ -191,8 +191,8 @@ inline double roundTo25BitAccuracy(double d)
return *(double*)&v; return *(double*)&v;
} }
double fres_espresso(double input); ATTR_MS_ABI double fres_espresso(double input);
double frsqrte_espresso(double input); ATTR_MS_ABI double frsqrte_espresso(double input);
void fcmpu_espresso(PPCInterpreter_t* hCPU, int crfD, double a, double b); void fcmpu_espresso(PPCInterpreter_t* hCPU, int crfD, double a, double b);

View file

@ -601,8 +601,10 @@ void PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRe
void PPCRecompilerX64Gen_imlInstruction_call_imm(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) void PPCRecompilerX64Gen_imlInstruction_call_imm(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{ {
// the register allocator takes care of spilling volatile registers and moving parameters to the right registers, so we don't need to do any special handling here // the register allocator takes care of spilling volatile registers and moving parameters to the right registers, so we don't need to do any special handling here
x64GenContext->emitter->SUB_qi8(X86_REG_RSP, 0x28); // reserve enough space for any parameters while keeping stack alignment of 16 intact
x64GenContext->emitter->MOV_qi64(X86_REG_RAX, imlInstruction->op_call_imm.callAddress); x64GenContext->emitter->MOV_qi64(X86_REG_RAX, imlInstruction->op_call_imm.callAddress);
x64GenContext->emitter->CALL_q(X86_REG_RAX); x64GenContext->emitter->CALL_q(X86_REG_RAX);
x64GenContext->emitter->ADD_qi8(X86_REG_RSP, 0x28);
} }
bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)

View file

@ -780,18 +780,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// move to FPR register // move to FPR register
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP); x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
} }
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP )
{
// move register to XMM15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// call assembly routine to calculate accurate FRES result in XMM15
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
// copy result to bottom and top half of result register
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT) else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
{ {
// move register to XMM15 // move register to XMM15

View file

@ -363,7 +363,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP || operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED || operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
operation == PPCREC_IML_OP_ASSIGN || operation == PPCREC_IML_OP_ASSIGN ||
operation == PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR || operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
operation == PPCREC_IML_OP_FPR_ABS_PAIR || operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
operation == PPCREC_IML_OP_FPR_FRES_PAIR || operation == PPCREC_IML_OP_FPR_FRES_PAIR ||

View file

@ -143,7 +143,6 @@ enum
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED, PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, // calculate reciprocal with Espresso accuracy of source bottom half and write result to destination bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated

File diff suppressed because it is too large Load diff

View file

@ -17,9 +17,19 @@ public:
m_regBitmask &= ~((uint64)1 << index); m_regBitmask &= ~((uint64)1 << index);
} }
void SetAllAvailable()
{
m_regBitmask = ~0ull;
}
bool HasAllAvailable() const
{
return m_regBitmask == ~0ull;
}
bool IsAvailable(uint32 index) const bool IsAvailable(uint32 index) const
{ {
return (m_regBitmask & (1 << index)) != 0; return (m_regBitmask & ((uint64)1 << index)) != 0;
} }
IMLPhysRegisterSet& operator&=(const IMLPhysRegisterSet& other) IMLPhysRegisterSet& operator&=(const IMLPhysRegisterSet& other)

View file

@ -67,38 +67,30 @@ boost::container::small_vector<raLivenessRange*, 128> raLivenessRange::GetAllSub
return subranges; return subranges;
} }
void raLivenessRange::GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs)
{
range->lastIterationIndex = iterationIndex;
for (auto& it : range->list_fixedRegRequirements)
allowedRegs &= it.allowedReg;
// check successors
if (range->subrangeBranchTaken && range->subrangeBranchTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchTaken, iterationIndex, allowedRegs);
if (range->subrangeBranchNotTaken && range->subrangeBranchNotTaken->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(range->subrangeBranchNotTaken, iterationIndex, allowedRegs);
// check predecessors
for (auto& prev : range->previousRanges)
{
if (prev->lastIterationIndex != iterationIndex)
GetAllowedRegistersExRecursive(prev, iterationIndex, allowedRegs);
}
};
bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters) bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters)
{ {
if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment()) uint32 iterationIndex = PPCRecRA_getNextIterationIndex();
{ allowedRegisters.SetAllAvailable();
auto clusterRanges = GetAllSubrangesInCluster(); GetAllowedRegistersExRecursive(this, iterationIndex, allowedRegisters);
bool hasAnyRequirement = false; return !allowedRegisters.HasAllAvailable();
for(auto& subrange : clusterRanges)
{
if(subrange->list_fixedRegRequirements.empty())
continue;
allowedRegisters = subrange->list_fixedRegRequirements.front().allowedReg;
hasAnyRequirement = true;
break;
}
if(!hasAnyRequirement)
return false;
for(auto& subrange : clusterRanges)
{
for(auto& fixedRegLoc : subrange->list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
}
else
{
// local check only, slightly faster
if(list_fixedRegRequirements.empty())
return false;
allowedRegisters = list_fixedRegRequirements.front().allowedReg;
for(auto& fixedRegLoc : list_fixedRegRequirements)
allowedRegisters &= fixedRegLoc.allowedReg;
}
return true;
} }
IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool) IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool)
@ -424,6 +416,14 @@ void PPCRecRA_debugValidateSubrange(raLivenessRange* range)
cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx()); cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx());
cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx()); cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx());
} }
// validate fixed reg requirements
if (!range->list_fixedRegRequirements.empty())
{
cemu_assert_debug(range->list_fixedRegRequirements.front().pos >= range->interval2.start);
cemu_assert_debug(range->list_fixedRegRequirements.back().pos <= range->interval2.end);
for(sint32 i = 0; i < (sint32)range->list_fixedRegRequirements.size()-1; i++)
cemu_assert_debug(range->list_fixedRegRequirements[i].pos < range->list_fixedRegRequirements[i+1].pos);
}
} }
#else #else
@ -563,7 +563,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++) for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{ {
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i; raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (tailInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex())) if (tailInterval.ContainsEdge(fixedReg->pos))
{ {
tailSubrange->list_fixedRegRequirements.push_back(*fixedReg); tailSubrange->list_fixedRegRequirements.push_back(*fixedReg);
} }
@ -572,7 +572,7 @@ raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenConte
for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++) for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++)
{ {
raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i; raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i;
if (!headInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex())) if (!headInterval.ContainsEdge(fixedReg->pos))
{ {
subrange->list_fixedRegRequirements.resize(i); subrange->list_fixedRegRequirements.resize(i);
break; break;

View file

@ -335,6 +335,9 @@ struct raLivenessRange
void SetPhysicalRegister(sint32 physicalRegister); void SetPhysicalRegister(sint32 physicalRegister);
void SetPhysicalRegisterForCluster(sint32 physicalRegister); void SetPhysicalRegisterForCluster(sint32 physicalRegister);
void UnsetPhysicalRegister() { physicalRegister = -1; } void UnsetPhysicalRegister() { physicalRegister = -1; }
private:
void GetAllowedRegistersExRecursive(raLivenessRange* range, uint32 iterationIndex, IMLPhysRegisterSet& allowedRegs);
}; };
raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition); raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition);

View file

@ -181,9 +181,6 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
} }
} }
// if(range.startAddress < 0x0202fa3C || range.startAddress > 0x0202FA7C)
// return nullptr; // DEBUG
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t(); PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress; ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length; ppcRecFunc->ppcSize = range.length;
@ -340,15 +337,6 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
//PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext); //PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext);
//PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext); //PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext);
// if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8 && ppcImlGenContext.debug_entryPPCAddress < 0x0240C0AC)
// {
// IMLDebug_Dump(&ppcImlGenContext);
// __debugbreak();
// }
// else if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8)
// return false;
return true; return true;
} }

View file

@ -1513,7 +1513,7 @@ bool PPCRecompilerImlGen_DCBZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
ppcImlGenContext->emitInst().make_r_r_r(PPCREC_IML_OP_ADD, regMemResEA, regA, regB); ppcImlGenContext->emitInst().make_r_r_r(PPCREC_IML_OP_ADD, regMemResEA, regA, regB);
else else
ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_ASSIGN, regMemResEA, regB); ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_ASSIGN, regMemResEA, regB);
ppcImlGenContext->emitInst().make_r_s32(PPCREC_IML_OP_AND, regMemResEA, ~31); ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, regMemResEA, regMemResEA, ~31);
// zero out the cacheline // zero out the cacheline
for(sint32 i = 0; i < 32; i += 4) for(sint32 i = 0; i < 32; i += 4)
ppcImlGenContext->emitInst().make_memory_r(regZero, regMemResEA, i, 32, false); ppcImlGenContext->emitInst().make_memory_r(regZero, regMemResEA, i, 32, false);

View file

@ -4,6 +4,9 @@
#include "PPCRecompilerIml.h" #include "PPCRecompilerIml.h"
#include "Cafe/GameProfile/GameProfile.h" #include "Cafe/GameProfile/GameProfile.h"
ATTR_MS_ABI double frsqrte_espresso(double input);
ATTR_MS_ABI double fres_espresso(double input);
IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit); IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit);
void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID) void PPCRecompilerImlGen_generateNewInstruction_fpr_r_memory(ppcImlGenContext_t* ppcImlGenContext, IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
@ -1007,9 +1010,12 @@ bool PPCRecompilerImlGen_FRES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
// load registers // load registers
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD); IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterB); ppcImlGenContext->emitInst().make_call_imm((uintptr_t)fres_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy // adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD); PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
// copy result to top
if( ppcImlGenContext->PSE )
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
return true; return true;
} }
@ -1026,9 +1032,7 @@ bool PPCRecompilerImlGen_FRSP(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
} }
PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, NULL,PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, fprRegisterD); PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, NULL,PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, fprRegisterD);
if( ppcImlGenContext->PSE ) if( ppcImlGenContext->PSE )
{
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD); PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP, fprRegisterD, fprRegisterD);
}
return true; return true;
} }
@ -1075,7 +1079,7 @@ bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 op
// hCPU->fpr[frD].fpr = 1.0 / sqrt(hCPU->fpr[frB].fpr); // hCPU->fpr[frD].fpr = 1.0 / sqrt(hCPU->fpr[frB].fpr);
IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB); IMLReg fprRegisterB = PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frB);
IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD); IMLReg fprRegisterD = PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext, PPCREC_NAME_FPR0+frD);
PPCRecompilerImlGen_generateNewInstruction_fpr_r_r(ppcImlGenContext, PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT, fprRegisterD, fprRegisterB); ppcImlGenContext->emitInst().make_call_imm((uintptr_t)frsqrte_espresso, fprRegisterB, IMLREG_INVALID, IMLREG_INVALID, fprRegisterD);
// adjust accuracy // adjust accuracy
PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD); PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext, fprRegisterD);
return true; return true;