diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IML.h b/src/Cafe/HW/Espresso/Recompiler/IML/IML.h index 98c48a84..bc0c27c5 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IML.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IML.h @@ -3,9 +3,6 @@ #include "IMLInstruction.h" #include "IMLSegment.h" -// analyzer -bool IMLAnalyzer_IsTightFiniteLoop(IMLSegment* imlSegment); - // optimizer passes void IMLOptimizer_OptimizeDirectFloatCopies(struct ppcImlGenContext_t* ppcImlGenContext); void IMLOptimizer_OptimizeDirectIntegerCopies(struct ppcImlGenContext_t* ppcImlGenContext); diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp index 77403e1b..6ae4b591 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLAnalyzer.cpp @@ -3,53 +3,3 @@ #include "util/helpers/fixedSizeList.h" #include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h" - -/* - * Analyzes a single segment and returns true if it is a finite loop - */ -bool IMLAnalyzer_IsTightFiniteLoop(IMLSegment* imlSegment) -{ - return false; // !!! DISABLED !!! - - bool isTightFiniteLoop = false; - // base criteria, must jump to beginning of same segment - if (imlSegment->nextSegmentBranchTaken != imlSegment) - return false; - // loops using BDNZ are assumed to always be finite - for(const IMLInstruction& instIt : imlSegment->imlList) - { - if (instIt.type == PPCREC_IML_TYPE_R_S32 && instIt.operation == PPCREC_IML_OP_SUB) - { - return true; - } - } - // for non-BDNZ loops, check for common patterns - // risky approach, look for ADD/SUB operations and assume that potential overflow means finite (does not include r_r_s32 ADD/SUB) - // this catches most loops with load-update and store-update instructions, but also those with decrementing counters - FixedSizeList list_modifiedRegisters; - for (const IMLInstruction& instIt : imlSegment->imlList) - { - if (instIt.type == PPCREC_IML_TYPE_R_S32 && (instIt.operation == PPCREC_IML_OP_ADD || instIt.operation == PPCREC_IML_OP_SUB) ) - { - list_modifiedRegisters.addUnique(instIt.op_r_immS32.regR); - } - } - if (list_modifiedRegisters.count > 0) - { - // remove all registers from the list that are modified by non-ADD/SUB instructions - // todo: We should also cover the case where ADD+SUB on the same register cancel the effect out - IMLUsedRegisters registersUsed; - for (const IMLInstruction& instIt : imlSegment->imlList) - { - if (instIt.type == PPCREC_IML_TYPE_R_S32 && (instIt.operation == PPCREC_IML_OP_ADD || instIt.operation == PPCREC_IML_OP_SUB)) - continue; - instIt.CheckRegisterUsage(®istersUsed); - registersUsed.ForEachWrittenGPR([&](IMLReg r) { list_modifiedRegisters.remove(r); }); - } - if (list_modifiedRegisters.count > 0) - { - return true; - } - } - return false; -} \ No newline at end of file diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp index 4ce1ffd5..b75c389c 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp @@ -15,7 +15,6 @@ #define DEBUG_RA_EXTRA_VALIDATION 0 // if set to non-zero, additional expensive validation checks will be performed #define DEBUG_RA_INSTRUCTION_GEN 0 - struct IMLRARegAbstractLiveness // preliminary liveness info. One entry per register and segment { IMLRARegAbstractLiveness(IMLRegFormat regBaseFormat, sint32 usageStart, sint32 usageEnd) @@ -38,7 +37,7 @@ struct IMLRegisterAllocatorContext IMLRegisterAllocatorParameters* raParam; ppcImlGenContext_t* deprGenContext; // deprecated. Try to decouple IMLRA from other parts of IML/PPCRec - std::unordered_map regIdToBaseFormat; // a vector would be more efficient but it also means that reg ids have to be continuous and not completely arbitrary + std::unordered_map regIdToBaseFormat; // first pass std::vector> perSegmentAbstractRanges; @@ -781,11 +780,11 @@ class RASpillStrategy_LocalRangeHoleCutting : public RASpillStrategy cemu_assert_debug(currentRangeStart.IsInstructionIndex()); distance2 = std::min(distance2, imlSegment->imlList.size() * 2 - currentRangeStart.GetRaw()); // limit distance to end of segment // calculate split cost of candidate - sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(candidate, currentRangeStart + distance2); + sint32 cost = IMLRA_CalculateAdditionalCostAfterSplit(candidate, currentRangeStart + distance2); // calculate additional split cost of currentRange if hole is not large enough if (distance2 < requiredSize2) { - cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance2); + cost += IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance2); // we also slightly increase cost in relation to the remaining length (in order to make the algorithm prefer larger holes) cost += (requiredSize2 - distance2) / 10; } @@ -889,7 +888,7 @@ class RASpillStrategy_AvailableRegisterHole : public RASpillStrategy continue; // calculate additional cost due to split cemu_assert_debug(distance < requiredSize2); // should always be true otherwise previous step would have selected this register? - sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); + sint32 cost = IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); // add small additional cost for the remaining range (prefer larger holes) cost += ((requiredSize2 - distance) / 2) / 10; if (cost < strategyCost) @@ -959,11 +958,11 @@ class RASpillStrategy_ExplodeRange : public RASpillStrategy IMLRA_MakeSafeSplitDistance(imlSegment, currentRangeStart, distance); if (distance < 2) continue; - sint32 cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); + sint32 cost = IMLRA_CalculateAdditionalCostOfRangeExplode(candidate); // if the hole is not large enough, add cost of splitting current subrange if (distance < requiredSize2) { - cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); + cost += IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); // add small additional cost for the remaining range (prefer larger holes) cost += ((requiredSize2 - distance) / 2) / 10; } @@ -1032,7 +1031,7 @@ class RASpillStrategy_ExplodeRangeInter : public RASpillStrategy if (!allowedRegs.IsAvailable(candidate->GetPhysicalRegister())) continue; sint32 cost; - cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); + cost = IMLRA_CalculateAdditionalCostOfRangeExplode(candidate); // compare with current best candidate for this strategy if (cost < strategyCost) { @@ -1043,7 +1042,7 @@ class RASpillStrategy_ExplodeRangeInter : public RASpillStrategy } // add current range as a candidate too sint32 ownCost; - ownCost = PPCRecRARange_estimateCostAfterRangeExplode(currentRange); + ownCost = IMLRA_CalculateAdditionalCostOfRangeExplode(currentRange); if (ownCost < strategyCost) { strategyCost = ownCost; @@ -1859,7 +1858,7 @@ static void IMLRA_AnalyzeRangeDataFlow(raLivenessRange* subrange) if (subrangeItr->hasStore) continue; // this ending already stores, no extra cost alreadyStoredInAllEndings = false; - sint32 storeCost = PPCRecRARange_getReadWriteCost(subrangeItr->imlSegment); + sint32 storeCost = IMLRA_GetSegmentReadWriteCost(subrangeItr->imlSegment); delayStoreCost = std::max(storeCost, delayStoreCost); } if (alreadyStoredInAllEndings) @@ -1867,7 +1866,7 @@ static void IMLRA_AnalyzeRangeDataFlow(raLivenessRange* subrange) subrange->hasStore = false; subrange->hasStoreDelayed = true; } - else if (delayStoreCost <= PPCRecRARange_getReadWriteCost(subrange->imlSegment)) + else if (delayStoreCost <= IMLRA_GetSegmentReadWriteCost(subrange->imlSegment)) { subrange->hasStore = false; subrange->hasStoreDelayed = true; diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp index 45d01608..2f4581ee 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp @@ -642,7 +642,7 @@ void PPCRecRA_updateOrAddSubrangeLocation(raLivenessRange* subrange, sint32 inde subrange->list_locations.emplace_back(index, isRead, isWrite); } -sint32 PPCRecRARange_getReadWriteCost(IMLSegment* imlSegment) +sint32 IMLRA_GetSegmentReadWriteCost(IMLSegment* imlSegment) { sint32 v = imlSegment->loopDepth + 1; v *= 5; @@ -668,13 +668,13 @@ sint32 PPCRecRARange_estimateTotalCost(std::span ranges) if (!subrange->interval2.ExtendsPreviousSegment()) { //cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment); - mostExpensiveRead = std::max(mostExpensiveRead, PPCRecRARange_getReadWriteCost(subrange->imlSegment)); + mostExpensiveRead = std::max(mostExpensiveRead, IMLRA_GetSegmentReadWriteCost(subrange->imlSegment)); readCount++; } if (!subrange->interval2.ExtendsIntoNextSegment()) { //cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment); - mostExpensiveWrite = std::max(mostExpensiveWrite, PPCRecRARange_getReadWriteCost(subrange->imlSegment)); + mostExpensiveWrite = std::max(mostExpensiveWrite, IMLRA_GetSegmentReadWriteCost(subrange->imlSegment)); writeCount++; } } @@ -683,21 +683,34 @@ sint32 PPCRecRARange_estimateTotalCost(std::span ranges) return cost; } -// calculate cost of range that it would have after calling PPCRecRA_explodeRange() on it -sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange) +// calculate additional cost of range that it would have after calling _ExplodeRange() on it +sint32 IMLRA_CalculateAdditionalCostOfRangeExplode(raLivenessRange* subrange) { auto ranges = subrange->GetAllSubrangesInCluster(); - sint32 cost = -PPCRecRARange_estimateTotalCost(ranges); + sint32 cost = 0;//-PPCRecRARange_estimateTotalCost(ranges); for (auto& subrange : ranges) { if (subrange->list_locations.empty()) - continue; - cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // we assume a read and a store + continue; // this range would be deleted and thus has no cost + sint32 segmentLoadStoreCost = IMLRA_GetSegmentReadWriteCost(subrange->imlSegment); + bool hasAdditionalLoad = subrange->interval2.ExtendsPreviousSegment(); + bool hasAdditionalStore = subrange->interval2.ExtendsIntoNextSegment(); + if(hasAdditionalLoad && !subrange->list_locations.front().isRead && subrange->list_locations.front().isWrite) // if written before read, then a load isn't necessary + { + cost += segmentLoadStoreCost; + } + if(hasAdditionalStore) + { + bool hasWrite = std::find_if(subrange->list_locations.begin(), subrange->list_locations.end(), [](const raLivenessLocation_t& loc) { return loc.isWrite; }) != subrange->list_locations.end(); + if(!hasWrite) // ranges which don't modify their value do not need to be stored + cost += segmentLoadStoreCost; + } } + // todo - properly calculating all the data-flow dependency based costs is more complex so this currently is an approximation return cost; } -sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition) +sint32 IMLRA_CalculateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition) { // validation #ifdef CEMU_DEBUG_ASSERT @@ -719,9 +732,53 @@ sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, if (splitInstructionIndex > subrange->list_locations.back().index) return 0; - // todo - determine exact cost of split subranges + // this can be optimized, but we should change list_locations to track instruction edges instead of instruction indices + std::vector headLocations; + std::vector tailLocations; + for (auto& location : subrange->list_locations) + { + if(location.GetReadPos() < splitPosition || location.GetWritePos() < splitPosition) + headLocations.push_back(location); + if(location.GetReadPos() >= splitPosition || location.GetWritePos() >= splitPosition) + tailLocations.push_back(location); + } + // fixup locations + if(!headLocations.empty() && headLocations.back().GetWritePos() >= splitPosition) + { + headLocations.back().isWrite = false; + if(!headLocations.back().isRead && !headLocations.back().isWrite) + headLocations.pop_back(); + } + if(!tailLocations.empty() && tailLocations.front().GetReadPos() < splitPosition) + { + tailLocations.front().isRead = false; + if(!tailLocations.front().isRead && !tailLocations.front().isWrite) + tailLocations.erase(tailLocations.begin()); + } - cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // currently we assume that the additional region will require a read and a store + // based on + sint32 segmentLoadStoreCost = IMLRA_GetSegmentReadWriteCost(subrange->imlSegment); + + auto CalculateCostFromLocationRange = [segmentLoadStoreCost](const std::vector& locations, bool trackLoadCost = true, bool trackStoreCost = true) -> sint32 + { + if(locations.empty()) + return 0; + sint32 cost = 0; + if(locations.front().isRead && trackLoadCost) + cost += segmentLoadStoreCost; // not overwritten, so there is a load cost + bool hasWrite = std::find_if(locations.begin(), locations.end(), [](const raLivenessLocation_t& loc) { return loc.isWrite; }) != locations.end(); + if(hasWrite && trackStoreCost) + cost += segmentLoadStoreCost; // modified, so there is a store cost + return cost; + }; + + sint32 baseCost = CalculateCostFromLocationRange(subrange->list_locations); + + bool tailOverwritesValue = !tailLocations.empty() && !tailLocations.front().isRead && tailLocations.front().isWrite; + + sint32 newCost = CalculateCostFromLocationRange(headLocations) + CalculateCostFromLocationRange(tailLocations, !tailOverwritesValue, true); + cemu_assert_debug(newCost >= baseCost); + cost = newCost - baseCost; return cost; } \ No newline at end of file diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h index 4d928a26..5173031e 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h @@ -1,18 +1,6 @@ #pragma once #include "IMLRegisterAllocator.h" -struct raLivenessLocation_t -{ - sint32 index; - bool isRead; - bool isWrite; - - raLivenessLocation_t() = default; - - raLivenessLocation_t(sint32 index, bool isRead, bool isWrite) - : index(index), isRead(isRead), isWrite(isWrite) {}; -}; - struct raLivenessSubrangeLink { struct raLivenessRange* prev; @@ -167,6 +155,28 @@ private: }; +struct raLivenessLocation_t +{ + sint32 index; + bool isRead; + bool isWrite; + + raLivenessLocation_t() = default; + + raLivenessLocation_t(sint32 index, bool isRead, bool isWrite) + : index(index), isRead(isRead), isWrite(isWrite) {}; + + raInstructionEdge GetReadPos() + { + return raInstructionEdge(index, true); + } + + raInstructionEdge GetWritePos() + { + return raInstructionEdge(index, false); + } +}; + struct raInterval { raInterval() @@ -354,7 +364,7 @@ void PPCRecRA_updateOrAddSubrangeLocation(raLivenessRange* subrange, sint32 inde void PPCRecRA_debugValidateSubrange(raLivenessRange* subrange); // cost estimation -sint32 PPCRecRARange_getReadWriteCost(IMLSegment* imlSegment); -sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange); +sint32 IMLRA_GetSegmentReadWriteCost(IMLSegment* imlSegment); +sint32 IMLRA_CalculateAdditionalCostOfRangeExplode(raLivenessRange* subrange); //sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, sint32 splitIndex); -sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition); \ No newline at end of file +sint32 IMLRA_CalculateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition); \ No newline at end of file