diff --git a/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64.cpp b/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64.cpp index aca6e988..8305aa19 100644 --- a/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64.cpp @@ -579,31 +579,23 @@ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction, return true; } -bool PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) +void PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regBoolOut = _reg32_from_reg8(_reg8(imlInstruction->op_atomic_compare_store.regBoolOut)); auto regEA = _reg32(imlInstruction->op_atomic_compare_store.regEA); auto regVal = _reg32(imlInstruction->op_atomic_compare_store.regWriteValue); auto regCmp = _reg32(imlInstruction->op_atomic_compare_store.regCompareValue); - // make sure non of the regs are in EAX - if (regEA == X86_REG_EAX || - regBoolOut == X86_REG_EAX || - regVal == X86_REG_EAX || - regCmp == X86_REG_EAX) - { - printf("x86: atomic_cmp_store cannot emit due to EAX already being in use\n"); - return false; - } + cemu_assert_debug(regBoolOut == X86_REG_EAX); + cemu_assert_debug(regEA != X86_REG_EAX); + cemu_assert_debug(regVal != X86_REG_EAX); + cemu_assert_debug(regCmp != X86_REG_EAX); - x64GenContext->emitter->XCHG_qq(REG_RESV_TEMP, X86_REG_RAX); x64GenContext->emitter->MOV_dd(X86_REG_EAX, regCmp); - x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regBoolOut), _reg32_from_reg8(regBoolOut)); // zero bytes unaffected by SETcc x64GenContext->emitter->LockPrefix(); x64GenContext->emitter->CMPXCHG_dd_l(REG_RESV_MEMBASE, 0, _reg64_from_reg32(regEA), 1, regVal); x64GenContext->emitter->SETcc_b(X86Cond::X86_CONDITION_Z, 
regBoolOut); - x64GenContext->emitter->XCHG_qq(REG_RESV_TEMP, X86_REG_RAX); - return true; + x64GenContext->emitter->AND_di32(regBoolOut, 1); // SETcc doesn't clear the upper bits so we do it manually here } bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) @@ -908,78 +900,29 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction, imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U || imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) { - // x86's shift and rotate instruction have the shift amount hardwired to the CL register - // since our register allocator doesn't support instruction based fixed phys registers yet - // we'll instead have to temporarily shuffle registers around - - // we use BMI2's shift instructions until the RA can assign fixed registers - if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) + if(g_CPUFeatures.x86.bmi2) { - x64Gen_sarx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); + if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) + x64Gen_sarx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); + else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) + x64Gen_shrx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); + else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) + x64Gen_shlx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } - else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) + else { - x64Gen_shrx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); + cemu_assert_debug(rRegResult != rRegOperand2); + cemu_assert_debug(rRegResult != X86_REG_RCX); + cemu_assert_debug(rRegOperand2 == X86_REG_RCX); + if(rRegOperand1 != rRegResult) + x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); + 
if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) + x64GenContext->emitter->SAR_d_CL(rRegResult); + else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) + x64GenContext->emitter->SHR_d_CL(rRegResult); + else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) + x64GenContext->emitter->SHL_d_CL(rRegResult); } - else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) - { - x64Gen_shlx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); - } - - //auto rResult = _reg32(rRegResult); - //auto rOp2 = _reg8_from_reg32(_reg32(rRegOperand2)); - - //if (rRegResult == rRegOperand2) - //{ - // if (rRegResult != rRegOperand1) - // DEBUG_BREAK; // cannot handle yet (we use rRegResult as a temporary reg, but its not possible if it is shared with op2) - //} - - //if(rRegOperand1 != rRegResult) - // x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); - - //cemu_assert_debug(rRegOperand1 != X86_REG_ECX); - - //if (rRegOperand2 == X86_REG_ECX) - //{ - // if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) - // x64GenContext->emitter->SAR_d_CL(rResult); - // else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) - // x64GenContext->emitter->SHR_d_CL(rResult); - // else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) - // x64GenContext->emitter->SHL_d_CL(rResult); - // else - // cemu_assert_unimplemented(); - //} - //else - //{ - // auto rRegResultOrg = rRegResult; - // if (rRegResult == X86_REG_ECX) - // { - // x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegResult); - // rRegResult = REG_RESV_TEMP; - // rResult = _reg32(rRegResult); - // } - // - // x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2); - // - // if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) - // x64GenContext->emitter->SAR_d_CL(rResult); - // else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) - // x64GenContext->emitter->SHR_d_CL(rResult); - // else if 
(imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) - // x64GenContext->emitter->SHL_d_CL(rResult); - // else - // cemu_assert_unimplemented(); - - // x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2); - - // // move result back if it was in ECX - // if (rRegResultOrg == X86_REG_ECX) - // { - // x64Gen_mov_reg64_reg64(x64GenContext, rRegResultOrg, REG_RESV_TEMP); - // } - //} } else if( imlInstruction->operation == PPCREC_IML_OP_DIVIDE_SIGNED || imlInstruction->operation == PPCREC_IML_OP_DIVIDE_UNSIGNED ) { @@ -1093,9 +1036,19 @@ bool PPCRecompilerX64Gen_imlInstruction_compare(PPCRecFunction_t* PPCRecFunction auto regA = _reg32(imlInstruction->op_compare.regA); auto regB = _reg32(imlInstruction->op_compare.regB); X86Cond cond = _x86Cond(imlInstruction->op_compare.cond); - x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc - x64GenContext->emitter->CMP_dd(regA, regB); - x64GenContext->emitter->SETcc_b(cond, regR); + bool keepR = regR == regA || regR == regB; + if(!keepR) + { + x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc + x64GenContext->emitter->CMP_dd(regA, regB); + x64GenContext->emitter->SETcc_b(cond, regR); + } + else + { + x64GenContext->emitter->CMP_dd(regA, regB); + x64GenContext->emitter->MOV_di32(_reg32_from_reg8(regR), 0); + x64GenContext->emitter->SETcc_b(cond, regR); + } return true; } @@ -1105,9 +1058,19 @@ bool PPCRecompilerX64Gen_imlInstruction_compare_s32(PPCRecFunction_t* PPCRecFunc auto regA = _reg32(imlInstruction->op_compare_s32.regA); sint32 imm = imlInstruction->op_compare_s32.immS32; X86Cond cond = _x86Cond(imlInstruction->op_compare_s32.cond); - x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc - x64GenContext->emitter->CMP_di32(regA, imm); - x64GenContext->emitter->SETcc_b(cond, regR); + bool keepR = regR == regA; + if(!keepR) + 
{ + x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc + x64GenContext->emitter->CMP_di32(regA, imm); + x64GenContext->emitter->SETcc_b(cond, regR); + } + else + { + x64GenContext->emitter->CMP_di32(regA, imm); + x64GenContext->emitter->MOV_di32(_reg32_from_reg8(regR), 0); + x64GenContext->emitter->SETcc_b(cond, regR); + } return true; } @@ -1202,7 +1165,6 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_s32(PPCRecFunction_t* PPCRecFunction { if( regA != regR ) x64Gen_mov_reg64_reg64(x64GenContext, regR, regA); - if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) x64Gen_shl_reg64Low32_imm8(x64GenContext, regR, imlInstruction->op_r_r_s32.immS32); else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) @@ -1224,19 +1186,25 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_s32_carry(PPCRecFunction_t* PPCRecFu auto regA = _reg32(imlInstruction->op_r_r_s32_carry.regA); sint32 immS32 = imlInstruction->op_r_r_s32_carry.immS32; auto regCarry = _reg32(imlInstruction->op_r_r_s32_carry.regCarry); - cemu_assert_debug(regCarry != regR && regCarry != regA); + cemu_assert_debug(regCarry != regR); // we dont allow two different outputs sharing the same register + + bool delayCarryInit = regCarry == regA; switch (imlInstruction->operation) { case PPCREC_IML_OP_ADD: - x64GenContext->emitter->XOR_dd(regCarry, regCarry); + if(!delayCarryInit) + x64GenContext->emitter->XOR_dd(regCarry, regCarry); if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->ADD_di32(regR, immS32); + if(delayCarryInit) + x64GenContext->emitter->MOV_di32(regCarry, 0); x64GenContext->emitter->SETcc_b(X86_CONDITION_B, _reg8_from_reg32(regCarry)); break; case PPCREC_IML_OP_ADD_WITH_CARRY: // assumes that carry is already correctly initialized as 0 or 1 + cemu_assert_debug(regCarry != regR); if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->BT_du8(regCarry, 0); // copy 
carry register to x86 carry flag @@ -1600,8 +1568,7 @@ bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenCo } else if (imlInstruction->type == PPCREC_IML_TYPE_ATOMIC_CMP_STORE) { - if (!PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction)) - codeGenerationFailed = true; + PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction); } else if( imlInstruction->type == PPCREC_IML_TYPE_NO_OP ) { diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp index cca8b61e..4850ed81 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp @@ -91,25 +91,37 @@ void IMLDebug_PrintLivenessRangeInfo(StringBuf& currentLineText, IMLSegment* iml sint32 index = currentLineText.getLen(); while (index < 70) { - debug_printf(" "); + currentLineText.add(" "); index++; } raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; while (subrangeItr) { - if (offset == subrangeItr->start.index) + if (subrangeItr->interval2.start.GetInstructionIndexEx() == offset) { - debug_printf("|%-2d", subrangeItr->GetVirtualRegister()); + if(subrangeItr->interval2.start.IsInstructionIndex() && !subrangeItr->interval2.start.IsOnInputEdge()) + currentLineText.add("."); + else + currentLineText.add("|"); + + currentLineText.addFmt("{:<4}", subrangeItr->GetVirtualRegister()); } - else if (offset >= subrangeItr->start.index && offset < subrangeItr->end.index) + else if (subrangeItr->interval2.end.GetInstructionIndexEx() == offset) { - debug_printf("| "); + if(subrangeItr->interval2.end.IsInstructionIndex() && !subrangeItr->interval2.end.IsOnOutputEdge()) + currentLineText.add("* "); + else + currentLineText.add("| "); + } + else if (subrangeItr->interval2.ContainsInstructionIndexEx(offset)) + { + currentLineText.add("| "); } else { 
- debug_printf(" "); + currentLineText.add(" "); } - index += 3; + index += 5; // next subrangeItr = subrangeItr->link_allSegmentRanges.next; } @@ -446,7 +458,7 @@ void IMLDebug_DisassembleInstruction(const IMLInstruction& inst, std::string& di void IMLDebug_DumpSegment(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, bool printLivenessRangeInfo) { - StringBuf strOutput(1024); + StringBuf strOutput(4096); strOutput.addFmt("SEGMENT {} | PPC=0x{:08x} Loop-depth {}", IMLDebug_GetSegmentName(ctx, imlSegment), imlSegment->ppcAddress, imlSegment->loopDepth); if (imlSegment->isEnterable) @@ -457,13 +469,13 @@ void IMLDebug_DumpSegment(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, bool { strOutput.addFmt(" InheritOverwrite: {}", IMLDebug_GetSegmentName(ctx, imlSegment->deadCodeEliminationHintSeg)); } - debug_printf("%s\n", strOutput.c_str()); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); if (printLivenessRangeInfo) { strOutput.reset(); IMLDebug_PrintLivenessRangeInfo(strOutput, imlSegment, RA_INTER_RANGE_START); - debug_printf("%s\n", strOutput.c_str()); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); } //debug_printf("\n"); strOutput.reset(); @@ -475,53 +487,56 @@ void IMLDebug_DumpSegment(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, bool // don't log NOP instructions if (inst.type == PPCREC_IML_TYPE_NO_OP) continue; - //strOutput.addFmt("{:02x} ", i); - debug_printf(fmt::format("{:02x} ", i).c_str()); + strOutput.reset(); + strOutput.addFmt("{:02x} ", i); + //cemuLog_log(LogType::Force, "{:02x} ", i); disassemblyLine.clear(); IMLDebug_DisassembleInstruction(inst, disassemblyLine); - debug_printf("%s", disassemblyLine.c_str()); + strOutput.add(disassemblyLine); if (printLivenessRangeInfo) { IMLDebug_PrintLivenessRangeInfo(strOutput, imlSegment, i); } - debug_printf("\n"); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); } // all ranges if (printLivenessRangeInfo) { - debug_printf("Ranges-VirtReg "); + strOutput.reset(); + 
strOutput.add("Ranges-VirtReg "); raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; while (subrangeItr) { - debug_printf("v%-2d", subrangeItr->GetVirtualRegister()); + strOutput.addFmt("v{:<4}", (uint32)subrangeItr->GetVirtualRegister()); subrangeItr = subrangeItr->link_allSegmentRanges.next; } - debug_printf("\n"); - debug_printf("Ranges-PhysReg "); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); + strOutput.reset(); + strOutput.add("Ranges-PhysReg "); subrangeItr = imlSegment->raInfo.linkedList_allSubranges; while (subrangeItr) { - debug_printf("p%-2d", subrangeItr->GetPhysicalRegister()); + strOutput.addFmt("p{:<4}", subrangeItr->GetPhysicalRegister()); subrangeItr = subrangeItr->link_allSegmentRanges.next; } - debug_printf("\n"); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); } // branch info - debug_printf("Links from: "); + strOutput.reset(); + strOutput.add("Links from: "); for (sint32 i = 0; i < imlSegment->list_prevSegments.size(); i++) { if (i) - debug_printf(", "); - debug_printf("%s", IMLDebug_GetSegmentName(ctx, imlSegment->list_prevSegments[i]).c_str()); + strOutput.add(", "); + strOutput.addFmt("{}", IMLDebug_GetSegmentName(ctx, imlSegment->list_prevSegments[i]).c_str()); } - debug_printf("\n"); + cemuLog_log(LogType::Force, "{}", strOutput.c_str()); if (imlSegment->nextSegmentBranchNotTaken) - debug_printf("BranchNotTaken: %s\n", IMLDebug_GetSegmentName(ctx, imlSegment->nextSegmentBranchNotTaken).c_str()); + cemuLog_log(LogType::Force, "BranchNotTaken: {}", IMLDebug_GetSegmentName(ctx, imlSegment->nextSegmentBranchNotTaken).c_str()); if (imlSegment->nextSegmentBranchTaken) - debug_printf("BranchTaken: %s\n", IMLDebug_GetSegmentName(ctx, imlSegment->nextSegmentBranchTaken).c_str()); + cemuLog_log(LogType::Force, "BranchTaken: {}", IMLDebug_GetSegmentName(ctx, imlSegment->nextSegmentBranchTaken).c_str()); if (imlSegment->nextSegmentIsUncertain) - debug_printf("Dynamic target\n"); - debug_printf("\n"); + 
cemuLog_log(LogType::Force, "Dynamic target"); } void IMLDebug_Dump(ppcImlGenContext_t* ppcImlGenContext, bool printLivenessRangeInfo) @@ -529,6 +544,6 @@ void IMLDebug_Dump(ppcImlGenContext_t* ppcImlGenContext, bool printLivenessRange for (size_t i = 0; i < ppcImlGenContext->segmentList2.size(); i++) { IMLDebug_DumpSegment(ppcImlGenContext, ppcImlGenContext->segmentList2[i], printLivenessRangeInfo); - debug_printf("\n"); + cemuLog_log(LogType::Force, ""); } } diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h index 78863931..7594bc9f 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h @@ -1,6 +1,7 @@ #pragma once using IMLRegID = uint16; // 16 bit ID +using IMLPhysReg = sint32; // arbitrary value that is up to the architecture backend, usually this will be the register index. A value of -1 is reserved and means not assigned // format of IMLReg: // 0-15 (16 bit) IMLRegID diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp index a59b88bd..9b9ce15f 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp @@ -7,6 +7,7 @@ #include "../BackendX64/BackendX64.h" +#include #include struct IMLRARegAbstractLiveness // preliminary liveness info. 
One entry per register and segment @@ -50,6 +51,45 @@ struct IMLRegisterAllocatorContext }; +struct IMLFixedRegisters +{ + struct Entry + { + Entry(IMLReg reg, IMLPhysRegisterSet physRegSet) : reg(reg), physRegSet(physRegSet) {} + + IMLReg reg; + IMLPhysRegisterSet physRegSet; + }; + boost::container::static_vector listInput; // fixed registers for input edge + boost::container::static_vector listOutput; // fixed registers for output edge +}; + +static void GetInstructionFixedRegisters(IMLInstruction* instruction, IMLFixedRegisters& fixedRegs) +{ + fixedRegs.listInput.clear(); + fixedRegs.listOutput.clear(); + + // x86 specific logic is hardcoded for now + if(instruction->type == PPCREC_IML_TYPE_R_R_R) + { + if(instruction->operation == PPCREC_IML_OP_LEFT_SHIFT || instruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S || instruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) + { + // todo: We can skip this if g_CPUFeatures.x86.bmi2 is set, but for now we just assume it's not so we can properly test increased register pressure + IMLPhysRegisterSet ps; + ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_ECX); + fixedRegs.listInput.emplace_back(instruction->op_r_r_r.regB, ps); + } + } + else if(instruction->type == PPCREC_IML_TYPE_ATOMIC_CMP_STORE) + { + IMLPhysRegisterSet ps; + ps.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE+X86_REG_EAX); + fixedRegs.listInput.emplace_back(instruction->op_atomic_compare_store.regBoolOut, ps); + } + // todo - for volatile registers during call, we can emit a bunch of ranges that cover the output edge of the CALL instruction and use a special vGPR to indicate its not an actually mapped register +} + + uint32 PPCRecRA_getNextIterationIndex() { static uint32 recRACurrentIterationIndex = 0; @@ -119,20 +159,95 @@ void PPCRecRA_identifyLoop(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* iml #define SUBRANGE_LIST_SIZE (128) -sint32 PPCRecRA_countInstructionsUntilNextUse(raLivenessRange* subrange, sint32 startIndex) +sint32 
PPCRecRA_countDistanceUntilNextUse2(raLivenessRange* subrange, raInstructionEdge startPosition) { + sint32 startInstructionIndex; + if(startPosition.ConnectsToPreviousSegment()) + startInstructionIndex = 0; + else + startInstructionIndex = startPosition.GetInstructionIndex(); for (sint32 i = 0; i < subrange->list_locations.size(); i++) { - if (subrange->list_locations.data()[i].index >= startIndex) - return subrange->list_locations.data()[i].index - startIndex; + if (subrange->list_locations[i].index >= startInstructionIndex) + { + sint32 preciseIndex = subrange->list_locations[i].index * 2; + cemu_assert_debug(subrange->list_locations[i].isRead || subrange->list_locations[i].isWrite); // locations must have any access + // check read edge + if(subrange->list_locations[i].isRead) + { + if(preciseIndex >= startPosition.GetRaw()) + return preciseIndex - startPosition.GetRaw(); + } + // check write edge + if(subrange->list_locations[i].isWrite) + { + preciseIndex++; + if(preciseIndex >= startPosition.GetRaw()) + return preciseIndex - startPosition.GetRaw(); + } + } } - return INT_MAX; + cemu_assert_debug(subrange->imlSegment->imlList.size() < 10000); + return 10001*2; } -// count how many instructions there are until physRegister is used by any subrange (returns 0 if register is in use at startIndex, and INT_MAX if not used for the remainder of the segment) -sint32 PPCRecRA_countInstructionsUntilNextLocalPhysRegisterUse(IMLSegment* imlSegment, sint32 startIndex, sint32 physRegister) +// returns -1 if there is no fixed register requirement on or after startPosition +sint32 IMLRA_CountDistanceUntilFixedRegUsageInRange(IMLSegment* imlSegment, raLivenessRange* range, raInstructionEdge startPosition, sint32 physRegister, bool& hasFixedAccess) { - sint32 minDistance = INT_MAX; + hasFixedAccess = false; + cemu_assert_debug(startPosition.IsInstructionIndex()); + for(auto& fixedReqEntry : range->list_fixedRegRequirements) + { + if(fixedReqEntry.pos < startPosition) + continue; 
+ if(fixedReqEntry.allowedReg.IsAvailable(physRegister)) + { + hasFixedAccess = true; + return fixedReqEntry.pos.GetRaw() - startPosition.GetRaw(); + } + } + cemu_assert_debug(range->interval2.end.IsInstructionIndex()); + return range->interval2.end.GetRaw() - startPosition.GetRaw(); +} + +sint32 IMLRA_CountDistanceUntilFixedRegUsage(IMLSegment* imlSegment, raInstructionEdge startPosition, sint32 maxDistance, IMLRegID ourRegId, sint32 physRegister) +{ + cemu_assert_debug(startPosition.IsInstructionIndex()); + raInstructionEdge lastPos2; + lastPos2.Set(imlSegment->imlList.size(), false); + + raInstructionEdge endPos; + endPos = startPosition + maxDistance; + if(endPos > lastPos2) + endPos = lastPos2; + IMLFixedRegisters fixedRegs; + if(startPosition.IsOnOutputEdge()) + GetInstructionFixedRegisters(imlSegment->imlList.data()+startPosition.GetInstructionIndex(), fixedRegs); + for(raInstructionEdge currentPos = startPosition; currentPos <= endPos; ++currentPos) + { + if(currentPos.IsOnInputEdge()) + { + GetInstructionFixedRegisters(imlSegment->imlList.data()+currentPos.GetInstructionIndex(), fixedRegs); + } + auto& fixedRegAccess = currentPos.IsOnInputEdge() ? 
fixedRegs.listInput : fixedRegs.listOutput; + for(auto& fixedRegLoc : fixedRegAccess) + { + if(fixedRegLoc.reg.GetRegID() != ourRegId) + { + cemu_assert_debug(fixedRegLoc.physRegSet.HasExactlyOneAvailable()); // this whole function only makes sense when there is only one fixed register, otherwise there are extra permutations to consider + if(fixedRegLoc.physRegSet.IsAvailable(physRegister)) + return currentPos.GetRaw() - startPosition.GetRaw(); + } + } + } + return endPos.GetRaw() - startPosition.GetRaw(); +} + +// count how many instructions there are until physRegister is used by any subrange or reserved for any fixed register requirement (returns 0 if register is in use at startIndex) +sint32 PPCRecRA_countDistanceUntilNextLocalPhysRegisterUse(IMLSegment* imlSegment, raInstructionEdge startPosition, sint32 physRegister) +{ + cemu_assert_debug(startPosition.IsInstructionIndex()); + sint32 minDistance = (sint32)imlSegment->imlList.size()*2 - startPosition.GetRaw(); // next raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; while(subrangeItr) @@ -142,12 +257,16 @@ sint32 PPCRecRA_countInstructionsUntilNextLocalPhysRegisterUse(IMLSegment* imlSe subrangeItr = subrangeItr->link_allSegmentRanges.next; continue; } - if (startIndex >= subrangeItr->start.index && startIndex < subrangeItr->end.index) + if(subrangeItr->interval2.ContainsEdge(startPosition)) return 0; - if (subrangeItr->start.index >= startIndex) + if (subrangeItr->interval2.end < startPosition) { - minDistance = std::min(minDistance, (subrangeItr->start.index - startIndex)); + subrangeItr = subrangeItr->link_allSegmentRanges.next; + continue; } + cemu_assert_debug(startPosition <= subrangeItr->interval2.start); + sint32 currentDist = subrangeItr->interval2.start.GetRaw() - startPosition.GetRaw(); + minDistance = std::min(minDistance, currentDist); subrangeItr = subrangeItr->link_allSegmentRanges.next; } return minDistance; @@ -155,20 +274,6 @@ sint32 
PPCRecRA_countInstructionsUntilNextLocalPhysRegisterUse(IMLSegment* imlSe struct IMLRALivenessTimeline { -// IMLRALivenessTimeline(raLivenessSubrange_t* subrangeChain) -// { -//#ifdef CEMU_DEBUG_ASSERT -// raLivenessSubrange_t* it = subrangeChain; -// raLivenessSubrange_t* prevIt = it; -// while (it) -// { -// cemu_assert_debug(prevIt->start.index <= it->start.index); -// prevIt = it; -// it = it->link_segmentSubrangesGPR.next; -// } -//#endif -// } - IMLRALivenessTimeline() { } @@ -182,12 +287,13 @@ struct IMLRALivenessTimeline // remove all ranges from activeRanges with end <= instructionIndex void ExpireRanges(sint32 instructionIndex) { + __debugbreak(); // maybe replace calls with raInstructionEdge variant? expiredRanges.clear(); size_t count = activeRanges.size(); for (size_t f = 0; f < count; f++) { raLivenessRange* liverange = activeRanges[f]; - if (liverange->end.index <= instructionIndex) + if (liverange->interval2.end.GetInstructionIndex() < instructionIndex) // <= to < since end is now inclusive { #ifdef CEMU_DEBUG_ASSERT if (instructionIndex != RA_INTER_RANGE_END && (liverange->subrangeBranchTaken || liverange->subrangeBranchNotTaken)) @@ -204,28 +310,63 @@ struct IMLRALivenessTimeline activeRanges.resize(count); } + void ExpireRanges(raInstructionEdge expireUpTo) + { + expiredRanges.clear(); + size_t count = activeRanges.size(); + for (size_t f = 0; f < count; f++) + { + raLivenessRange* liverange = activeRanges[f]; + if (liverange->interval2.end < expireUpTo) // this was <= but since end is not inclusive we need to use < + { +#ifdef CEMU_DEBUG_ASSERT + if (!expireUpTo.ConnectsToNextSegment() && (liverange->subrangeBranchTaken || liverange->subrangeBranchNotTaken)) + assert_dbg(); // infinite subranges should not expire +#endif + expiredRanges.emplace_back(liverange); + // remove entry + activeRanges[f] = activeRanges[count-1]; + f--; + count--; + } + } + if(count != activeRanges.size()) + activeRanges.resize(count); + } + std::span GetExpiredRanges() 
{ return { expiredRanges.data(), expiredRanges.size() }; } + std::span GetActiveRanges() + { + return { activeRanges.data(), activeRanges.size() }; + } + + raLivenessRange* GetActiveRangeByVirtualRegId(IMLRegID regId) + { + for(auto& it : activeRanges) + if(it->virtualRegister == regId) + return it; + return nullptr; + } + + raLivenessRange* GetActiveRangeByPhysicalReg(sint32 physReg) + { + cemu_assert_debug(physReg >= 0); + for(auto& it : activeRanges) + if(it->physicalRegister == physReg) + return it; + return nullptr; + } + boost::container::small_vector activeRanges; private: boost::container::small_vector expiredRanges; }; -bool IsRangeOverlapping(raLivenessRange* rangeA, raLivenessRange* rangeB) -{ - if (rangeA->start.index < rangeB->end.index && rangeA->end.index > rangeB->start.index) - return true; - if ((rangeA->start.index == RA_INTER_RANGE_START && rangeA->start.index == rangeB->start.index)) - return true; - if (rangeA->end.index == RA_INTER_RANGE_END && rangeA->end.index == rangeB->end.index) - return true; - return false; -} - // mark occupied registers by any overlapping range as unavailable in physRegSet void PPCRecRA_MaskOverlappingPhysRegForGlobalRange(raLivenessRange* range2, IMLPhysRegisterSet& physRegSet) { @@ -242,7 +383,7 @@ void PPCRecRA_MaskOverlappingPhysRegForGlobalRange(raLivenessRange* range2, IMLP subrangeItr = subrangeItr->link_allSegmentRanges.next; continue; } - if(IsRangeOverlapping(subrange, subrangeItr)) + if(subrange->interval2.IsOverlapping(subrangeItr->interval2)) { if (subrangeItr->GetPhysicalRegister() >= 0) physRegSet.SetReserved(subrangeItr->GetPhysicalRegister()); @@ -253,7 +394,7 @@ void PPCRecRA_MaskOverlappingPhysRegForGlobalRange(raLivenessRange* range2, IMLP } } -bool _livenessRangeStartCompare(raLivenessRange* lhs, raLivenessRange* rhs) { return lhs->start.index < rhs->start.index; } +bool _livenessRangeStartCompare(raLivenessRange* lhs, raLivenessRange* rhs) { return lhs->interval2.start < rhs->interval2.start; } 
void _sortSegmentAllSubrangesLinkedList(IMLSegment* imlSegment) { @@ -291,13 +432,14 @@ void _sortSegmentAllSubrangesLinkedList(IMLSegment* imlSegment) #ifdef CEMU_DEBUG_ASSERT sint32 count2 = 0; subrangeItr = imlSegment->raInfo.linkedList_allSubranges; - sint32 currentStartIndex = RA_INTER_RANGE_START; + raInstructionEdge currentStartPosition; + currentStartPosition.SetRaw(RA_INTER_RANGE_START); while (subrangeItr) { count2++; - if (subrangeItr->start.index < currentStartIndex) + if (subrangeItr->interval2.start < currentStartPosition) assert_dbg(); - currentStartIndex = subrangeItr->start.index; + currentStartPosition = subrangeItr->interval2.start; // next subrangeItr = subrangeItr->link_allSegmentRanges.next; } @@ -319,74 +461,626 @@ raLivenessRange* IMLRA_GetSubrange(IMLSegment* imlSegment, IMLRegID regId) return it->second; } -raLivenessRange* _GetSubrangeByInstructionIndexAndVirtualReg(IMLSegment* imlSegment, IMLReg regToSearch, sint32 instructionIndex) +struct raFixedRegRequirementWithVGPR { - uint32 regId = regToSearch.GetRegID(); - raLivenessRange* subrangeItr = IMLRA_GetSubrange(imlSegment, regId); - while (subrangeItr) + raInstructionEdge pos; + IMLPhysRegisterSet allowedReg; + IMLRegID regId; +}; + +std::vector<raFixedRegRequirementWithVGPR> IMLRA_BuildSegmentInstructionFixedRegList(IMLSegment* imlSegment) +{ + std::vector<raFixedRegRequirementWithVGPR> frrList; + + size_t index = 0; + IMLUsedRegisters gprTracking; + while (index < imlSegment->imlList.size()) { - if (subrangeItr->start.index <= instructionIndex && subrangeItr->end.index > instructionIndex) - return subrangeItr; - subrangeItr = subrangeItr->link_sameVirtualRegister.next; + IMLFixedRegisters fixedRegs; + GetInstructionFixedRegisters(&imlSegment->imlList[index], fixedRegs); + raInstructionEdge pos; + pos.Set(index, true); + for(auto& fixedRegAccess : fixedRegs.listInput) + { + frrList.emplace_back(pos, fixedRegAccess.physRegSet, fixedRegAccess.reg.GetRegID()); + } + pos = pos + 1; + for(auto& fixedRegAccess : fixedRegs.listOutput) + { + 
frrList.emplace_back(pos, fixedRegAccess.physRegSet, fixedRegAccess.reg.GetRegID()); + } + index++; } - return nullptr; + return frrList; } -void IMLRA_IsolateRangeOnInstruction(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, raLivenessRange* subrange, sint32 instructionIndex) +boost::container::small_vector<raLivenessRange*, 8> IMLRA_GetRangeWithFixedRegReservationOverlappingPos(IMLSegment* imlSegment, raInstructionEdge pos, IMLPhysReg physReg) { - DEBUG_BREAK; + boost::container::small_vector<raLivenessRange*, 8> rangeList; + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + if(!currentRange->interval2.ContainsEdge(pos)) + continue; + IMLPhysRegisterSet allowedRegs; + if(!currentRange->GetAllowedRegistersEx(allowedRegs)) + continue; + if(allowedRegs.IsAvailable(physReg)) + rangeList.emplace_back(currentRange); + } + return rangeList; } void IMLRA_HandleFixedRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { - // this works as a pre-pass to actual register allocation. Assigning registers in advance based on fixed requirements (e.g. calling conventions and operations with fixed-reg input/output like x86 DIV/MUL) - // algorithm goes as follows: - // 1) Iterate all instructions in the function from beginning to end and keep a list of active ranges for the currently iterated instruction - // 2) If we encounter an instruction with a fixed register requirement we: - // 2.0) Check if there are any other ranges already using the same fixed-register and if yes, we split them and unassign the register for any follow-up instructions just prior to the current instruction - // 2.1) For inputs: Split the range that needs to be assigned a phys reg on the current instruction. Basically creating a 1-instruction long subrange that we can assign the physical register. 
RA will then schedule register allocation around that and avoid moves - // 2.2) For outputs: Split the range that needs to be assigned a phys reg on the current instruction - // Q: What if a specific fixed-register is used both for input and output and thus is destructive? A: Create temporary range - // Q: What if we have 3 different inputs that are all the same virtual register? A: Create temporary range - // Q: Assuming the above is implemented, do we even support overlapping two ranges of separate virtual regs on the same phys register? In theory the RA shouldn't care + // first pass - iterate over all ranges with fixed register requirements and split them if they cross the segment border (we can later optimize this) + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + IMLPhysRegisterSet allowedRegs; + if(!currentRange->GetAllowedRegistersEx(allowedRegs)) + continue; + if(currentRange->interval2.ExtendsPreviousSegment() || currentRange->interval2.ExtendsIntoNextSegment()) + { + PPCRecRA_explodeRange(ppcImlGenContext, currentRange); + // currentRange may be invalidated, therefore iterate from the beginning again (todo - can be optimized) + currentRange = imlSegment->raInfo.linkedList_allSubranges; + } + } + // second pass - look for ranges with conflicting fixed register requirements and split these too (locally) + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + IMLPhysRegisterSet allowedRegs; + if(!currentRange->GetAllowedRegistersEx(allowedRegs)) + continue; + if(allowedRegs.HasAnyAvailable()) + continue; + cemu_assert_unimplemented(); + } + // third pass - assign fixed registers, split ranges if needed + std::vector<raFixedRegRequirementWithVGPR> frr = IMLRA_BuildSegmentInstructionFixedRegList(imlSegment); + std::unordered_map<IMLPhysReg, IMLRegID> lastVGPR; + for(size_t i=0; i<frr.size(); i++) + { + raFixedRegRequirementWithVGPR& entry = frr[i]; + IMLPhysReg physReg = entry.allowedReg.GetFirstAvailableReg(); + // determine whether the virtual register mapped to this physical register changed since its last use + bool vgprHasChanged = false; + auto it = lastVGPR.find(physReg); + if(it != lastVGPR.end()) + vgprHasChanged = it->second != entry.regId; + 
else + vgprHasChanged = true; + lastVGPR[physReg] = entry.regId; - // experimental code - //for (size_t i = 0; i < imlSegment->imlList.size(); i++) - //{ - // IMLInstruction& inst = imlSegment->imlList[i]; - // if (inst.type == PPCREC_IML_TYPE_R_R_R) - // { - // if (inst.operation == PPCREC_IML_OP_LEFT_SHIFT) - // { - // // get the virtual reg which needs to be assigned a fixed register - // //IMLUsedRegisters usedReg; - // //inst.CheckRegisterUsage(&usedReg); - // IMLReg rB = inst.op_r_r_r.regB; - // // rB needs to use RCX/ECX - // raLivenessSubrange_t* subrange = _GetSubrangeByInstructionIndexAndVirtualReg(imlSegment, rB, i); - // cemu_assert_debug(subrange->range->physicalRegister < 0); // already has a phys reg assigned - // // make sure RCX/ECX is free - // // split before (if needed) and after instruction so that we get a new 1-instruction long range for which we can assign the physical register - // raLivenessSubrange_t* instructionRange = subrange->start.index < i ? PPCRecRA_splitLocalSubrange(ppcImlGenContext, subrange, i, false) : subrange; - // raLivenessSubrange_t* tailRange = PPCRecRA_splitLocalSubrange(ppcImlGenContext, instructionRange, i+1, false); + if(!vgprHasChanged) + continue; - // } - // } - //} + boost::container::small_vector<raLivenessRange*, 8> overlappingRanges = IMLRA_GetRangeWithFixedRegReservationOverlappingPos(imlSegment, entry.pos, physReg); + cemu_assert_debug(!overlappingRanges.empty()); // there should always be at least one range that overlaps corresponding to the fixed register requirement + + for(auto& range : overlappingRanges) + { + if(range->interval2.start < entry.pos) + { + PPCRecRA_splitLocalSubrange2(ppcImlGenContext, range, entry.pos, true); + } + } + } + // finally iterate ranges and assign fixed registers + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + IMLPhysRegisterSet allowedRegs; + 
if(!currentRange->GetAllowedRegistersEx(allowedRegs)) + { + cemu_assert_debug(currentRange->list_fixedRegRequirements.empty()); + continue; + } + cemu_assert_debug(allowedRegs.HasExactlyOneAvailable()); + currentRange->SetPhysicalRegister(allowedRegs.GetFirstAvailableReg()); + } + // DEBUG - check for collisions and make sure all ranges with fixed register requirements got their physical register assigned +#ifdef CEMU_DEBUG_ASSERT + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + IMLPhysRegisterSet allowedRegs; + if(!currentRange->HasPhysicalRegister()) + continue; + for(raLivenessRange* currentRange2 = imlSegment->raInfo.linkedList_allSubranges; currentRange2; currentRange2 = currentRange2->link_allSegmentRanges.next) + { + if(currentRange == currentRange2) + continue; + if(currentRange->interval2.IsOverlapping(currentRange2->interval2)) + { + cemu_assert_debug(currentRange->GetPhysicalRegister() != currentRange2->GetPhysicalRegister()); + } + } + } + for(raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; currentRange; currentRange = currentRange->link_allSegmentRanges.next) + { + IMLPhysRegisterSet allowedRegs; + if(!currentRange->GetAllowedRegistersEx(allowedRegs)) + { + cemu_assert_debug(currentRange->list_fixedRegRequirements.empty()); + continue; + } + cemu_assert_debug(currentRange->HasPhysicalRegister() && allowedRegs.IsAvailable(currentRange->GetPhysicalRegister())); + } +#endif +} + +// we should not split ranges on instructions with tied registers (i.e. 
where a register encoded as a single parameter is both input and output) +// otherwise the RA algorithm has to assign both ranges the same physical register (not supported yet) and the point of splitting to fit another range is nullified +void IMLRA_MakeSafeSplitPosition(IMLSegment* imlSegment, raInstructionEdge& pos) +{ + // we ignore the instruction for now and just always make it a safe split position + cemu_assert_debug(pos.IsInstructionIndex()); + if(pos.IsOnOutputEdge()) + pos = pos - 1; +} + +// convenience wrapper for IMLRA_MakeSafeSplitPosition +void IMLRA_MakeSafeSplitDistance(IMLSegment* imlSegment, raInstructionEdge startPos, sint32& distance) +{ + cemu_assert_debug(startPos.IsInstructionIndex()); + cemu_assert_debug(distance >= 0); + raInstructionEdge endPos = startPos + distance; + IMLRA_MakeSafeSplitPosition(imlSegment, endPos); + if(endPos < startPos) + { + distance = 0; + return; + } + distance = endPos.GetRaw() - startPos.GetRaw(); +} + +void DbgVerifyAllRanges(IMLRegisterAllocatorContext& ctx); + +class RASpillStrategy +{ +public: + virtual void Apply(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, raLivenessRange* currentRange) = 0; + + sint32 GetCost() + { + return strategyCost; + } + +protected: + void ResetCost() + { + strategyCost = INT_MAX; + } + + sint32 strategyCost; +}; + +class RASpillStrategy_LocalRangeHoleCutting : public RASpillStrategy +{ +public: + void Reset() + { + localRangeHoleCutting.distance = -1; + localRangeHoleCutting.largestHoleSubrange = nullptr; + ResetCost(); + } + + void Evaluate(IMLSegment* imlSegment, raLivenessRange* currentRange, const IMLRALivenessTimeline& timeline, const IMLPhysRegisterSet& allowedRegs) + { + raInstructionEdge currentRangeStart = currentRange->interval2.start; + sint32 requiredSize2 = currentRange->interval2.GetPreciseDistance(); + cemu_assert_debug(localRangeHoleCutting.distance == -1); + cemu_assert_debug(strategyCost == INT_MAX); + if(!currentRangeStart.ConnectsToPreviousSegment()) + { + 
cemu_assert_debug(currentRangeStart.GetRaw() >= 0); + for (auto candidate : timeline.activeRanges) + { + if (candidate->interval2.ExtendsIntoNextSegment()) + continue; + // new checks (Oct 2024): + if(candidate == currentRange) + continue; + if(candidate->GetPhysicalRegister() < 0) + continue; + if(!allowedRegs.IsAvailable(candidate->GetPhysicalRegister())) + continue; + + sint32 distance2 = PPCRecRA_countDistanceUntilNextUse2(candidate, currentRangeStart); + IMLRA_MakeSafeSplitDistance(imlSegment, currentRangeStart, distance2); + if (distance2 < 2) + continue; + cemu_assert_debug(currentRangeStart.IsInstructionIndex()); + distance2 = std::min(distance2, imlSegment->imlList.size()*2 - currentRangeStart.GetRaw()); // limit distance to end of segment + // calculate split cost of candidate + sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(candidate, currentRangeStart + distance2); + // calculate additional split cost of currentRange if hole is not large enough + if (distance2 < requiredSize2) + { + cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance2); + // we also slightly increase cost in relation to the remaining length (in order to make the algorithm prefer larger holes) + cost += (requiredSize2 - distance2) / 10; + } + // compare cost with previous candidates + if (cost < strategyCost) + { + strategyCost = cost; + localRangeHoleCutting.distance = distance2; + localRangeHoleCutting.largestHoleSubrange = candidate; + } + } + } + } + + void Apply(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, raLivenessRange* currentRange) override + { + cemu_assert_debug(strategyCost != INT_MAX); + sint32 requiredSize2 = currentRange->interval2.GetPreciseDistance(); + raInstructionEdge currentRangeStart = currentRange->interval2.start; + + raInstructionEdge holeStartPosition = currentRangeStart; + raInstructionEdge holeEndPosition = currentRangeStart + localRangeHoleCutting.distance; + raLivenessRange* collisionRange = 
localRangeHoleCutting.largestHoleSubrange; + + if(collisionRange->interval2.start < holeStartPosition) + { + collisionRange = PPCRecRA_splitLocalSubrange2(nullptr, collisionRange, holeStartPosition, true); + cemu_assert_debug(!collisionRange || collisionRange->interval2.start >= holeStartPosition); // verify if splitting worked at all, tail must be on or after the split point + cemu_assert_debug(!collisionRange || collisionRange->interval2.start >= holeEndPosition); // also verify that the trimmed hole is actually big enough + } + else + { + cemu_assert_unimplemented(); // we still need to trim? + } + // we may also have to cut the current range to fit partially into the hole + if (requiredSize2 > localRangeHoleCutting.distance) + { + raLivenessRange* tailRange = PPCRecRA_splitLocalSubrange2(nullptr, currentRange, currentRangeStart + localRangeHoleCutting.distance, true); + if(tailRange) + { + cemu_assert_debug(tailRange->list_fixedRegRequirements.empty()); // we are not allowed to unassign fixed registers + tailRange->UnsetPhysicalRegister(); + } + } + // verify that the hole is large enough + if(collisionRange) + { + cemu_assert_debug(!collisionRange->interval2.IsOverlapping(currentRange->interval2)); + } + } + +private: + struct + { + sint32 distance; + raLivenessRange* largestHoleSubrange; + }localRangeHoleCutting; +}; + +class RASpillStrategy_AvailableRegisterHole : public RASpillStrategy +{ + // split current range (this is generally only a good choice when the current range is long but has few usages) + public: + void Reset() + { + ResetCost(); + availableRegisterHole.distance = -1; + availableRegisterHole.physRegister = -1; + } + + void Evaluate(IMLSegment* imlSegment, raLivenessRange* currentRange, const IMLRALivenessTimeline& timeline, const IMLPhysRegisterSet& localAvailableRegsMask, const IMLPhysRegisterSet& allowedRegs) + { + sint32 requiredSize2 = currentRange->interval2.GetPreciseDistance(); + + raInstructionEdge currentRangeStart = 
currentRange->interval2.start; + cemu_assert_debug(strategyCost == INT_MAX); + availableRegisterHole.distance = -1; + availableRegisterHole.physRegister = -1; + if (currentRangeStart.GetRaw() >= 0) + { + if (localAvailableRegsMask.HasAnyAvailable()) + { + sint32 physRegItr = -1; + while (true) + { + physRegItr = localAvailableRegsMask.GetNextAvailableReg(physRegItr + 1); + if (physRegItr < 0) + break; + if(!allowedRegs.IsAvailable(physRegItr)) + continue; + // get size of potential hole for this register + sint32 distance = PPCRecRA_countDistanceUntilNextLocalPhysRegisterUse(imlSegment, currentRangeStart, physRegItr); + + // some instructions may require the same register for another range, check the distance here + sint32 distUntilFixedReg = IMLRA_CountDistanceUntilFixedRegUsage(imlSegment, currentRangeStart, distance, currentRange->GetVirtualRegister(), physRegItr); + if(distUntilFixedReg < distance) + distance = distUntilFixedReg; + + IMLRA_MakeSafeSplitDistance(imlSegment, currentRangeStart, distance); + if (distance < 2) + continue; + // calculate additional cost due to split + cemu_assert_debug(distance < requiredSize2); // should always be true otherwise previous step would have selected this register? 
+ sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); + // add small additional cost for the remaining range (prefer larger holes) + cost += ((requiredSize2 - distance) / 2) / 10; + if (cost < strategyCost) + { + strategyCost = cost; + availableRegisterHole.distance = distance; + availableRegisterHole.physRegister = physRegItr; + } + } + } + } + } + + void Apply(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, raLivenessRange* currentRange) override + { + cemu_assert_debug(strategyCost != INT_MAX); + raInstructionEdge currentRangeStart = currentRange->interval2.start; + // use available register + raLivenessRange* tailRange = PPCRecRA_splitLocalSubrange2(nullptr, currentRange, currentRangeStart + availableRegisterHole.distance, true); + if(tailRange) + { + cemu_assert_debug(tailRange->list_fixedRegRequirements.empty()); // we are not allowed to unassign fixed registers + tailRange->UnsetPhysicalRegister(); + } + } + + private: + struct + { + sint32 physRegister; + sint32 distance; // size of hole + }availableRegisterHole; +}; + +class RASpillStrategy_ExplodeRange : public RASpillStrategy +{ +public: + void Reset() + { + ResetCost(); + explodeRange.range = nullptr; + explodeRange.distance = -1; + } + + void Evaluate(IMLSegment* imlSegment, raLivenessRange* currentRange, const IMLRALivenessTimeline& timeline, const IMLPhysRegisterSet& allowedRegs) + { + raInstructionEdge currentRangeStart = currentRange->interval2.start; + if(currentRangeStart.ConnectsToPreviousSegment()) + currentRangeStart.Set(0, true); + sint32 requiredSize2 = currentRange->interval2.GetPreciseDistance(); + cemu_assert_debug(strategyCost == INT_MAX); + explodeRange.range = nullptr; + explodeRange.distance = -1; + for (auto candidate : timeline.activeRanges) + { + if (!candidate->interval2.ExtendsIntoNextSegment()) + continue; + // new checks (Oct 2024): + if(candidate == currentRange) + continue; + if(candidate->GetPhysicalRegister() < 0) + 
continue; + if(!allowedRegs.IsAvailable(candidate->GetPhysicalRegister())) + continue; + + sint32 distance = PPCRecRA_countDistanceUntilNextUse2(candidate, currentRangeStart); + IMLRA_MakeSafeSplitDistance(imlSegment, currentRangeStart, distance); + if( distance < 2) + continue; + sint32 cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); + // if the hole is not large enough, add cost of splitting current subrange + if (distance < requiredSize2) + { + cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance); + // add small additional cost for the remaining range (prefer larger holes) + cost += ((requiredSize2 - distance) / 2) / 10; + } + // compare with current best candidate for this strategy + if (cost < strategyCost) + { + strategyCost = cost; + explodeRange.distance = distance; + explodeRange.range = candidate; + } + } + } + + void Apply(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, raLivenessRange* currentRange) override + { + raInstructionEdge currentRangeStart = currentRange->interval2.start; + if(currentRangeStart.ConnectsToPreviousSegment()) + currentRangeStart.Set(0, true); + sint32 requiredSize2 = currentRange->interval2.GetPreciseDistance(); + // explode range + PPCRecRA_explodeRange(nullptr, explodeRange.range); + // split current subrange if necessary + if( requiredSize2 > explodeRange.distance) + { + raLivenessRange* tailRange = PPCRecRA_splitLocalSubrange2(nullptr, currentRange, currentRangeStart+explodeRange.distance, true); + if(tailRange) + { + cemu_assert_debug(tailRange->list_fixedRegRequirements.empty()); // we are not allowed to unassign fixed registers + tailRange->UnsetPhysicalRegister(); + } + } + } + +private: + struct + { + raLivenessRange* range; + sint32 distance; // size of hole + // note: If we explode a range, we still have to check the size of the hole that becomes available, if too small then we need to add cost of splitting local subrange + }explodeRange; +}; + + +class 
RASpillStrategy_ExplodeRangeInter : public RASpillStrategy +{ +public: + void Reset() + { + ResetCost(); + explodeRange.range = nullptr; + explodeRange.distance = -1; + } + + void Evaluate(IMLSegment* imlSegment, raLivenessRange* currentRange, const IMLRALivenessTimeline& timeline, const IMLPhysRegisterSet& allowedRegs) + { + // explode the range with the least cost + cemu_assert_debug(strategyCost == INT_MAX); + cemu_assert_debug(explodeRange.range == nullptr && explodeRange.distance == -1); + for(auto candidate : timeline.activeRanges) + { + if (!candidate->interval2.ExtendsIntoNextSegment()) + continue; + // only select candidates that clash with current subrange + if (candidate->GetPhysicalRegister() < 0 && candidate != currentRange) + continue; + // and also filter any that dont meet fixed register requirements + if(!allowedRegs.IsAvailable(candidate->GetPhysicalRegister())) + continue; + sint32 cost; + cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); + // compare with current best candidate for this strategy + if (cost < strategyCost) + { + strategyCost = cost; + explodeRange.distance = INT_MAX; + explodeRange.range = candidate; + } + } + // add current range as a candidate too + sint32 ownCost; + ownCost = PPCRecRARange_estimateCostAfterRangeExplode(currentRange); + if (ownCost < strategyCost) + { + strategyCost = ownCost; + explodeRange.distance = INT_MAX; + explodeRange.range = currentRange; + } + } + + void Apply(ppcImlGenContext_t* ctx, IMLSegment* imlSegment, raLivenessRange* currentRange) override + { + cemu_assert_debug(strategyCost != INT_MAX); + PPCRecRA_explodeRange(ctx, explodeRange.range); + } + +private: + struct + { + raLivenessRange* range; + sint32 distance; // size of hole + // note: If we explode a range, we still have to check the size of the hole that becomes available, if too small then we need to add cost of splitting local subrange + }explodeRange; +}; + +// filter any registers from candidatePhysRegSet which cannot be 
used by currentRange due to fixed register requirements within the range that it occupies +void IMLRA_FilterReservedFixedRegisterRequirementsForSegment(IMLRegisterAllocatorContext& ctx, raLivenessRange* currentRange, IMLPhysRegisterSet& candidatePhysRegSet) +{ + IMLSegment* seg = currentRange->imlSegment; + if(seg->imlList.empty()) + return; // there can be no fixed register requirements if there are no instructions + + raInstructionEdge firstPos = currentRange->interval2.start; + if(currentRange->interval2.start.ConnectsToPreviousSegment()) + firstPos.SetRaw(0); + else if(currentRange->interval2.start.ConnectsToNextSegment()) + firstPos.Set(seg->imlList.size()-1, false); + + raInstructionEdge lastPos = currentRange->interval2.end; + if(currentRange->interval2.end.ConnectsToPreviousSegment()) + lastPos.SetRaw(0); + else if(currentRange->interval2.end.ConnectsToNextSegment()) + lastPos.Set(seg->imlList.size()-1, false); + cemu_assert_debug(firstPos <= lastPos); + + IMLRegID ourRegId = currentRange->GetVirtualRegister(); + + IMLFixedRegisters fixedRegs; + if(firstPos.IsOnOutputEdge()) + GetInstructionFixedRegisters(seg->imlList.data()+firstPos.GetInstructionIndex(), fixedRegs); + for(raInstructionEdge currentPos = firstPos; currentPos <= lastPos; ++currentPos) + { + if(currentPos.IsOnInputEdge()) + { + GetInstructionFixedRegisters(seg->imlList.data()+currentPos.GetInstructionIndex(), fixedRegs); + } + auto& fixedRegAccess = currentPos.IsOnInputEdge() ? 
fixedRegs.listInput : fixedRegs.listOutput; + for(auto& fixedRegLoc : fixedRegAccess) + { + if(fixedRegLoc.reg.GetRegID() != ourRegId) + candidatePhysRegSet.RemoveRegisters(fixedRegLoc.physRegSet); + } + } +} + +// filter out any registers along the range cluster +void IMLRA_FilterReservedFixedRegisterRequirementsForCluster(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment, raLivenessRange* currentRange, IMLPhysRegisterSet& candidatePhysRegSet) +{ + cemu_assert_debug(currentRange->imlSegment == imlSegment); + if(currentRange->interval2.ExtendsPreviousSegment() || currentRange->interval2.ExtendsIntoNextSegment()) + { + auto clusterRanges = currentRange->GetAllSubrangesInCluster(); + for(auto& rangeIt : clusterRanges) + { + IMLRA_FilterReservedFixedRegisterRequirementsForSegment(ctx, rangeIt, candidatePhysRegSet); + if(!candidatePhysRegSet.HasAnyAvailable()) + break; + } + return; + } + IMLRA_FilterReservedFixedRegisterRequirementsForSegment(ctx, currentRange, candidatePhysRegSet); +} + +void __DebugTestA(IMLSegment* imlSegment) +{ + // iterate all ranges + raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; + while(subrangeItr) + { + if(!subrangeItr->list_fixedRegRequirements.empty()) + { + cemu_assert_debug(subrangeItr->HasPhysicalRegister()); + } + subrangeItr = subrangeItr->link_allSegmentRanges.next; + } } bool IMLRA_AssignSegmentRegisters(IMLRegisterAllocatorContext& ctx, ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment) { + DbgVerifyAllRanges(ctx); // sort subranges ascending by start index _sortSegmentAllSubrangesLinkedList(imlSegment); IMLRALivenessTimeline livenessTimeline; raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; + raInstructionEdge lastInstructionEdge; + lastInstructionEdge.SetRaw(RA_INTER_RANGE_END); + + struct + { + RASpillStrategy_LocalRangeHoleCutting localRangeHoleCutting; + RASpillStrategy_AvailableRegisterHole availableRegisterHole; + RASpillStrategy_ExplodeRange 
explodeRange; + // for ranges that connect to follow up segments: + RASpillStrategy_ExplodeRangeInter explodeRangeInter; + }strategy; + + sint32 dbgIndex = 0; while(subrangeItr) { - sint32 currentIndex = subrangeItr->start.index; + raInstructionEdge currentRangeStart = subrangeItr->interval2.start; // used to be currentIndex before refactor PPCRecRA_debugValidateSubrange(subrangeItr); - livenessTimeline.ExpireRanges(std::min(currentIndex, RA_INTER_RANGE_END-1)); // expire up to currentIndex (inclusive), but exclude infinite ranges + + // below used to be: std::min(currentIndex, RA_INTER_RANGE_END-1) + livenessTimeline.ExpireRanges((currentRangeStart > lastInstructionEdge) ? lastInstructionEdge : currentRangeStart); // expire up to currentIndex (inclusive), but exclude infinite ranges + // note: The logic here is complicated in regards to whether the instruction index should be inclusive or exclusive. Find a way to simplify? + // if subrange already has register assigned then add it to the active list and continue if (subrangeItr->GetPhysicalRegister() >= 0) { @@ -402,241 +1096,106 @@ bool IMLRA_AssignSegmentRegisters(IMLRegisterAllocatorContext& ctx, ppcImlGenCon subrangeItr = subrangeItr->link_allSegmentRanges.next; continue; } + // ranges with fixed register requirements should already have a phys register assigned + if(!subrangeItr->list_fixedRegRequirements.empty()) + { + cemu_assert_debug(subrangeItr->HasPhysicalRegister()); + } // find free register for current subrangeItr and segment IMLRegFormat regBaseFormat = ctx.GetBaseFormatByRegId(subrangeItr->GetVirtualRegister()); - IMLPhysRegisterSet physRegSet = ctx.raParam->GetPhysRegPool(regBaseFormat); - cemu_assert_debug(physRegSet.HasAnyAvailable()); // register uses type with no valid pool + IMLPhysRegisterSet candidatePhysRegSet = ctx.raParam->GetPhysRegPool(regBaseFormat); + cemu_assert_debug(candidatePhysRegSet.HasAnyAvailable()); // no valid pool provided for this register type + + IMLPhysRegisterSet 
allowedRegs = subrangeItr->GetAllowedRegisters(candidatePhysRegSet); + cemu_assert_debug(allowedRegs.HasAnyAvailable()); // if zero regs are available, then this range needs to be split to avoid mismatching register requirements (do this in the initial pass to keep the code here simpler) + candidatePhysRegSet &= allowedRegs; + + __DebugTestA(imlSegment); for (auto& liverangeItr : livenessTimeline.activeRanges) { cemu_assert_debug(liverangeItr->GetPhysicalRegister() >= 0); - physRegSet.SetReserved(liverangeItr->GetPhysicalRegister()); + candidatePhysRegSet.SetReserved(liverangeItr->GetPhysicalRegister()); } // check intersections with other ranges and determine allowed registers - IMLPhysRegisterSet localAvailableRegsMask = physRegSet; // mask of registers that are currently not used (does not include range checks in other segments) - if(physRegSet.HasAnyAvailable()) + IMLPhysRegisterSet localAvailableRegsMask = candidatePhysRegSet; // mask of registers that are currently not used (does not include range checks in other segments) + if(candidatePhysRegSet.HasAnyAvailable()) { - // check globally in all segments - PPCRecRA_MaskOverlappingPhysRegForGlobalRange(subrangeItr, physRegSet); + // check for overlaps on a global scale (subrangeItr can be part of a larger range cluster across multiple segments) + PPCRecRA_MaskOverlappingPhysRegForGlobalRange(subrangeItr, candidatePhysRegSet); } - if (!physRegSet.HasAnyAvailable()) + // some target instructions may enforce specific registers (e.g. 
common on X86 where something like SHL , CL forces CL as the count register) + // we determine the list of allowed registers here + // this really only works if we assume single-register requirements (otherwise its better not to filter out early and instead allow register corrections later but we don't support this yet) + if (candidatePhysRegSet.HasAnyAvailable()) { - struct - { - // estimated costs and chosen candidates for the different spill strategies - // hole cutting into a local range - struct - { - sint32 distance; - raLivenessRange* largestHoleSubrange; - sint32 cost; // additional cost of choosing this candidate - }localRangeHoleCutting; - // split current range (this is generally only a good choice when the current range is long but rarely used) - struct - { - sint32 cost; - sint32 physRegister; - sint32 distance; // size of hole - }availableRegisterHole; - // explode a inter-segment range (prefer ranges that are not read/written in this segment) - struct - { - raLivenessRange* range; - sint32 cost; - sint32 distance; // size of hole - // note: If we explode a range, we still have to check the size of the hole that becomes available, if too small then we need to add cost of splitting local subrange - }explodeRange; - // todo - add more strategies, make cost estimation smarter (for example, in some cases splitting can have reduced or no cost if read/store can be avoided due to data flow) - }spillStrategies; - // cant assign register - // there might be registers available, we just can't use them due to range conflicts - if (subrangeItr->end.index != RA_INTER_RANGE_END) - { - // range ends in current segment - - // Current algo looks like this: - // 1) Get the size of the largest possible hole that we can cut into any of the live local subranges - // 1.1) Check if the hole is large enough to hold the current subrange - // 2) If yes, cut hole and return false (full retry) - // 3) If no, try to reuse free register (need to determine how large the region is 
we can use) - // 4) If there is no free register or the range is extremely short go back to step 1+2 but additionally split the current subrange at where the hole ends - - cemu_assert_debug(currentIndex == subrangeItr->start.index); - - sint32 requiredSize = subrangeItr->end.index - subrangeItr->start.index; - // evaluate strategy: Cut hole into local subrange - spillStrategies.localRangeHoleCutting.distance = -1; - spillStrategies.localRangeHoleCutting.largestHoleSubrange = nullptr; - spillStrategies.localRangeHoleCutting.cost = INT_MAX; - if (currentIndex >= 0) - { - for (auto candidate : livenessTimeline.activeRanges) - { - if (candidate->end.index == RA_INTER_RANGE_END) - continue; - sint32 distance = PPCRecRA_countInstructionsUntilNextUse(candidate, currentIndex); - if (distance < 2) - continue; // not even worth the consideration - // calculate split cost of candidate - sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(candidate, currentIndex + distance); - // calculate additional split cost of currentRange if hole is not large enough - if (distance < requiredSize) - { - cost += PPCRecRARange_estimateAdditionalCostAfterSplit(subrangeItr, currentIndex + distance); - // we also slightly increase cost in relation to the remaining length (in order to make the algorithm prefer larger holes) - cost += (requiredSize - distance) / 10; - } - // compare cost with previous candidates - if (cost < spillStrategies.localRangeHoleCutting.cost) - { - spillStrategies.localRangeHoleCutting.cost = cost; - spillStrategies.localRangeHoleCutting.distance = distance; - spillStrategies.localRangeHoleCutting.largestHoleSubrange = candidate; - } - } - } - // evaluate strategy: Split current range to fit in available holes - // todo - are checks required to avoid splitting on the suffix instruction? 
- spillStrategies.availableRegisterHole.cost = INT_MAX; - spillStrategies.availableRegisterHole.distance = -1; - spillStrategies.availableRegisterHole.physRegister = -1; - if (currentIndex >= 0) - { - if (localAvailableRegsMask.HasAnyAvailable()) - { - sint32 physRegItr = -1; - while (true) - { - physRegItr = localAvailableRegsMask.GetNextAvailableReg(physRegItr + 1); - if (physRegItr < 0) - break; - // get size of potential hole for this register - sint32 distance = PPCRecRA_countInstructionsUntilNextLocalPhysRegisterUse(imlSegment, currentIndex, physRegItr); - if (distance < 2) - continue; // not worth consideration - // calculate additional cost due to split - if (distance >= requiredSize) - assert_dbg(); // should not happen or else we would have selected this register - sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(subrangeItr, currentIndex + distance); - // add small additional cost for the remaining range (prefer larger holes) - cost += (requiredSize - distance) / 10; - if (cost < spillStrategies.availableRegisterHole.cost) - { - spillStrategies.availableRegisterHole.cost = cost; - spillStrategies.availableRegisterHole.distance = distance; - spillStrategies.availableRegisterHole.physRegister = physRegItr; - } - } - } - } - // evaluate strategy: Explode inter-segment ranges - spillStrategies.explodeRange.cost = INT_MAX; - spillStrategies.explodeRange.range = nullptr; - spillStrategies.explodeRange.distance = -1; - for (auto candidate : livenessTimeline.activeRanges) - { - if (candidate->end.index != RA_INTER_RANGE_END) - continue; - sint32 distance = PPCRecRA_countInstructionsUntilNextUse(candidate, currentIndex); - if( distance < 2) - continue; - sint32 cost; - cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); - // if the hole is not large enough, add cost of splitting current subrange - if (distance < requiredSize) - { - cost += PPCRecRARange_estimateAdditionalCostAfterSplit(subrangeItr, currentIndex + distance); - // add small 
additional cost for the remaining range (prefer larger holes) - cost += (requiredSize - distance) / 10; - } - // compare with current best candidate for this strategy - if (cost < spillStrategies.explodeRange.cost) - { - spillStrategies.explodeRange.cost = cost; - spillStrategies.explodeRange.distance = distance; - spillStrategies.explodeRange.range = candidate; - } - } - // choose strategy - if (spillStrategies.explodeRange.cost != INT_MAX && spillStrategies.explodeRange.cost <= spillStrategies.localRangeHoleCutting.cost && spillStrategies.explodeRange.cost <= spillStrategies.availableRegisterHole.cost) - { - // explode range - PPCRecRA_explodeRange(ppcImlGenContext, spillStrategies.explodeRange.range); - // split current subrange if necessary - if( requiredSize > spillStrategies.explodeRange.distance) - PPCRecRA_splitLocalSubrange(ppcImlGenContext, subrangeItr, currentIndex+spillStrategies.explodeRange.distance, true); - } - else if (spillStrategies.availableRegisterHole.cost != INT_MAX && spillStrategies.availableRegisterHole.cost <= spillStrategies.explodeRange.cost && spillStrategies.availableRegisterHole.cost <= spillStrategies.localRangeHoleCutting.cost) - { - // use available register - PPCRecRA_splitLocalSubrange(ppcImlGenContext, subrangeItr, currentIndex + spillStrategies.availableRegisterHole.distance, true); - } - else if (spillStrategies.localRangeHoleCutting.cost != INT_MAX && spillStrategies.localRangeHoleCutting.cost <= spillStrategies.explodeRange.cost && spillStrategies.localRangeHoleCutting.cost <= spillStrategies.availableRegisterHole.cost) - { - // cut hole - PPCRecRA_splitLocalSubrange(ppcImlGenContext, spillStrategies.localRangeHoleCutting.largestHoleSubrange, currentIndex + spillStrategies.localRangeHoleCutting.distance, true); - // split current subrange if necessary - if (requiredSize > spillStrategies.localRangeHoleCutting.distance) - PPCRecRA_splitLocalSubrange(ppcImlGenContext, subrangeItr, currentIndex + 
spillStrategies.localRangeHoleCutting.distance, true); - } - else if (subrangeItr->start.index == RA_INTER_RANGE_START) - { - // alternative strategy if we have no other choice: explode current range - PPCRecRA_explodeRange(ppcImlGenContext, subrangeItr); - } - else - assert_dbg(); - - return false; - } - else - { - // range exceeds segment border - // simple but bad solution -> explode the entire range (no longer allow it to cross segment boundaries) - // better solutions: 1) Depending on the situation, we can explode other ranges to resolve the conflict. Thus we should explode the range with the lowest extra cost - // 2) Or we explode the range only partially - // explode the range with the least cost - spillStrategies.explodeRange.cost = INT_MAX; - spillStrategies.explodeRange.range = nullptr; - spillStrategies.explodeRange.distance = -1; - for(auto candidate : livenessTimeline.activeRanges) - { - if (candidate->end.index != RA_INTER_RANGE_END) - continue; - // only select candidates that clash with current subrange - if (candidate->GetPhysicalRegister() < 0 && candidate != subrangeItr) - continue; - - sint32 cost; - cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate); - // compare with current best candidate for this strategy - if (cost < spillStrategies.explodeRange.cost) - { - spillStrategies.explodeRange.cost = cost; - spillStrategies.explodeRange.distance = INT_MAX; - spillStrategies.explodeRange.range = candidate; - } - } - // add current range as a candidate too - sint32 ownCost; - ownCost = PPCRecRARange_estimateCostAfterRangeExplode(subrangeItr); - if (ownCost < spillStrategies.explodeRange.cost) - { - spillStrategies.explodeRange.cost = ownCost; - spillStrategies.explodeRange.distance = INT_MAX; - spillStrategies.explodeRange.range = subrangeItr; - } - if (spillStrategies.explodeRange.cost == INT_MAX) - assert_dbg(); // should not happen - PPCRecRA_explodeRange(ppcImlGenContext, spillStrategies.explodeRange.range); - } - return false; + 
IMLRA_FilterReservedFixedRegisterRequirementsForCluster(ctx, imlSegment, subrangeItr, candidatePhysRegSet); } - // assign register to range - //subrangeItr->SetPhysicalRegister(physRegSet.GetFirstAvailableReg()); - subrangeItr->SetPhysicalRegisterForCluster(physRegSet.GetFirstAvailableReg()); - livenessTimeline.AddActiveRange(subrangeItr); - // next - subrangeItr = subrangeItr->link_allSegmentRanges.next; + if(candidatePhysRegSet.HasAnyAvailable()) + { + // use free register + subrangeItr->SetPhysicalRegisterForCluster(candidatePhysRegSet.GetFirstAvailableReg()); + livenessTimeline.AddActiveRange(subrangeItr); + subrangeItr = subrangeItr->link_allSegmentRanges.next; // next + continue; + } + __DebugTestA(imlSegment); + // there is no free register for the entire range + // evaluate different strategies of splitting ranges to free up another register or shorten the current range + strategy.localRangeHoleCutting.Reset(); + strategy.availableRegisterHole.Reset(); + strategy.explodeRange.Reset(); + // cant assign register + // there might be registers available, we just can't use them due to range conflicts + RASpillStrategy* selectedStrategy = nullptr; + auto SelectStrategyIfBetter = [&selectedStrategy](RASpillStrategy& newStrategy) + { + if(newStrategy.GetCost() == INT_MAX) + return; + if(selectedStrategy == nullptr || newStrategy.GetCost() < selectedStrategy->GetCost()) + selectedStrategy = &newStrategy; + }; + + if (!subrangeItr->interval2.ExtendsIntoNextSegment()) + { + // range ends in current segment, use local strategies + // evaluate strategy: Cut hole into local subrange + strategy.localRangeHoleCutting.Evaluate(imlSegment, subrangeItr, livenessTimeline, allowedRegs); + SelectStrategyIfBetter(strategy.localRangeHoleCutting); + // evaluate strategy: Split current range to fit in available holes + // todo - are checks required to avoid splitting on the suffix instruction? 
+ strategy.availableRegisterHole.Evaluate(imlSegment, subrangeItr, livenessTimeline, localAvailableRegsMask, allowedRegs); + SelectStrategyIfBetter(strategy.availableRegisterHole); + // evaluate strategy: Explode inter-segment ranges + strategy.explodeRange.Evaluate(imlSegment, subrangeItr, livenessTimeline, allowedRegs); + SelectStrategyIfBetter(strategy.explodeRange); + __DebugTestA(imlSegment); + } + else // if subrangeItr->interval2.ExtendsIntoNextSegment() + { + strategy.explodeRangeInter.Reset(); + strategy.explodeRangeInter.Evaluate(imlSegment, subrangeItr, livenessTimeline, allowedRegs); + SelectStrategyIfBetter(strategy.explodeRangeInter); + __DebugTestA(imlSegment); + } + // choose strategy + if(selectedStrategy) + { + selectedStrategy->Apply(ppcImlGenContext, imlSegment, subrangeItr); + __DebugTestA(imlSegment); + } + else + { + // none of the evaluated strategies can be applied, this should only happen if the segment extends into the next segment(s) for which we have no good strategy + cemu_assert_debug(subrangeItr->interval2.ExtendsPreviousSegment()); + // alternative strategy if we have no other choice: explode current range + PPCRecRA_explodeRange(ppcImlGenContext, subrangeItr); + __DebugTestA(imlSegment); + } + // DEBUG BEGIN + DbgVerifyAllRanges(ctx); + dbgIndex++; + // DEBUG END + return false; } return true; } @@ -674,154 +1233,6 @@ void IMLRA_AssignRegisters(IMLRegisterAllocatorContext& ctx, ppcImlGenContext_t* } } -inline IMLReg _MakeNativeReg(IMLRegFormat baseFormat, IMLRegID regId) -{ - return IMLReg(baseFormat, baseFormat, 0, regId); -} - -void PPCRecRA_insertGPRLoadInstructions(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment, sint32 insertIndex, std::span loadList) -{ - PPCRecompiler_pushBackIMLInstructions(imlSegment, insertIndex, loadList.size()); - for (sint32 i = 0; i < loadList.size(); i++) - { - IMLRegFormat baseFormat = ctx.regIdToBaseFormat[loadList[i]->GetVirtualRegister()]; - cemu_assert_debug(baseFormat != 
IMLRegFormat::INVALID_FORMAT); - imlSegment->imlList[insertIndex + i].make_r_name(_MakeNativeReg(baseFormat, loadList[i]->GetPhysicalRegister()), loadList[i]->GetName()); - } -} - -void PPCRecRA_insertGPRStoreInstructions(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment, sint32 insertIndex, std::span storeList) -{ - PPCRecompiler_pushBackIMLInstructions(imlSegment, insertIndex, storeList.size()); - for (size_t i = 0; i < storeList.size(); i++) - { - IMLRegFormat baseFormat = ctx.regIdToBaseFormat[storeList[i]->GetVirtualRegister()]; - cemu_assert_debug(baseFormat != IMLRegFormat::INVALID_FORMAT); - imlSegment->imlList[insertIndex + i].make_name_r(storeList[i]->GetName(), _MakeNativeReg(baseFormat, storeList[i]->GetPhysicalRegister())); - } -} - -void IMLRA_GenerateSegmentMoveInstructions(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment) -{ - std::unordered_map virtId2PhysRegIdMap; // key = virtual register, value = physical register - IMLRALivenessTimeline livenessTimeline; - sint32 index = 0; - sint32 suffixInstructionCount = imlSegment->HasSuffixInstruction() ? 
1 : 0; - // load register ranges that are supplied from previous segments - raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; - while(subrangeItr) - { - if (subrangeItr->start.index == RA_INTER_RANGE_START) - { - livenessTimeline.AddActiveRange(subrangeItr); -#ifdef CEMU_DEBUG_ASSERT - // load GPR - if (subrangeItr->_noLoad == false) - { - assert_dbg(); - } - // update translation table - cemu_assert_debug(!virtId2PhysRegIdMap.contains(subrangeItr->GetVirtualRegister())); -#endif - virtId2PhysRegIdMap.try_emplace(subrangeItr->GetVirtualRegister(), subrangeItr->GetPhysicalRegister()); - } - // next - subrangeItr = subrangeItr->link_allSegmentRanges.next; - } - // process instructions - while(index < imlSegment->imlList.size() + 1) - { - // expire ranges - livenessTimeline.ExpireRanges(index); - for (auto& expiredRange : livenessTimeline.GetExpiredRanges()) - { - // update translation table - virtId2PhysRegIdMap.erase(expiredRange->GetVirtualRegister()); - // store GPR if required - // special care has to be taken to execute any stores before the suffix instruction since trailing instructions may not get executed - if (expiredRange->hasStore) - { - PPCRecRA_insertGPRStoreInstructions(ctx, imlSegment, std::min(index, imlSegment->imlList.size() - suffixInstructionCount), {&expiredRange, 1}); - index++; - } - } - - // load new ranges - subrangeItr = imlSegment->raInfo.linkedList_allSubranges; - while(subrangeItr) - { - if (subrangeItr->start.index == index) - { - livenessTimeline.AddActiveRange(subrangeItr); - // load GPR - // similar to stores, any loads for the next segment need to happen before the suffix instruction - // however, ranges that exit the segment at the end but do not cover the suffix instruction are illegal (e.g. 
RA_INTER_RANGE_END to RA_INTER_RANGE_END subrange) - // this is to prevent the RA from inserting store/load instructions after the suffix instruction - if (imlSegment->HasSuffixInstruction()) - { - cemu_assert_debug(subrangeItr->start.index <= imlSegment->GetSuffixInstructionIndex()); - } - if (subrangeItr->_noLoad == false) - { - PPCRecRA_insertGPRLoadInstructions(ctx, imlSegment, std::min(index, imlSegment->imlList.size() - suffixInstructionCount), {&subrangeItr , 1}); - index++; - subrangeItr->start.index--; - } - // update translation table - virtId2PhysRegIdMap.insert_or_assign(subrangeItr->GetVirtualRegister(), subrangeItr->GetPhysicalRegister()); - } - subrangeItr = subrangeItr->link_allSegmentRanges.next; - } - // rewrite registers - if (index < imlSegment->imlList.size()) - imlSegment->imlList[index].RewriteGPR(virtId2PhysRegIdMap); - // next iml instruction - index++; - } - // expire infinite subranges (subranges which cross the segment border) - std::vector loadStoreList; - livenessTimeline.ExpireRanges(RA_INTER_RANGE_END); - for (auto liverange : livenessTimeline.GetExpiredRanges()) - { - // update translation table - virtId2PhysRegIdMap.erase(liverange->GetVirtualRegister()); - // store GPR - if (liverange->hasStore) - loadStoreList.emplace_back(liverange); - } - cemu_assert_debug(livenessTimeline.activeRanges.empty()); - if (!loadStoreList.empty()) - PPCRecRA_insertGPRStoreInstructions(ctx, imlSegment, imlSegment->imlList.size() - suffixInstructionCount, loadStoreList); - // load subranges for next segments - subrangeItr = imlSegment->raInfo.linkedList_allSubranges; - loadStoreList.clear(); - while(subrangeItr) - { - if (subrangeItr->start.index == RA_INTER_RANGE_END) - { - livenessTimeline.AddActiveRange(subrangeItr); - // load GPR - if (subrangeItr->_noLoad == false) - loadStoreList.emplace_back(subrangeItr); - // update translation table - virtId2PhysRegIdMap.try_emplace(subrangeItr->GetVirtualRegister(), subrangeItr->GetPhysicalRegister()); - } - 
// next - subrangeItr = subrangeItr->link_allSegmentRanges.next; - } - if (!loadStoreList.empty()) - PPCRecRA_insertGPRLoadInstructions(ctx, imlSegment, imlSegment->imlList.size() - suffixInstructionCount, loadStoreList); -} - -void IMLRA_GenerateMoveInstructions(IMLRegisterAllocatorContext& ctx) -{ - for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++) - { - IMLSegment* imlSegment = ctx.deprGenContext->segmentList2[s]; - IMLRA_GenerateSegmentMoveInstructions(ctx, imlSegment); - } -} - void IMLRA_ReshapeForRegisterAllocation(ppcImlGenContext_t* ppcImlGenContext) { // insert empty segments after every non-taken branch if the linked segment has more than one input @@ -937,7 +1348,15 @@ raLivenessRange* PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx #ifdef CEMU_DEBUG_ASSERT cemu_assert_debug(IMLRA_GetSubrange(imlSegment, vGPR) == nullptr); #endif - raLivenessRange* subrange = PPCRecRA_createSubrange(ctx.deprGenContext, imlSegment, vGPR, name, abstractRange->usageStart, abstractRange->usageEnd); + cemu_assert_debug( + (abstractRange->usageStart == abstractRange->usageEnd && (abstractRange->usageStart == RA_INTER_RANGE_START || abstractRange->usageStart == RA_INTER_RANGE_END)) || + abstractRange->usageStart < abstractRange->usageEnd); // usageEnd is exclusive so it should always be larger + sint32 inclusiveEnd = abstractRange->usageEnd; + if(inclusiveEnd != RA_INTER_RANGE_START && inclusiveEnd != RA_INTER_RANGE_END) + inclusiveEnd--; // subtract one, because usageEnd is exclusive, but the end value of the interval passed to createSubrange is inclusive + raInterval interval; + interval.SetInterval(abstractRange->usageStart, true, inclusiveEnd, true); + raLivenessRange* subrange = PPCRecRA_createSubrange2(ctx.deprGenContext, imlSegment, vGPR, name, interval.start, interval.end); // traverse forward if (abstractRange->usageEnd == RA_INTER_RANGE_END) { @@ -948,7 +1367,7 @@ raLivenessRange* 
PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx { subrange->subrangeBranchTaken = PPCRecRA_convertToMappedRanges(ctx, imlSegment->nextSegmentBranchTaken, vGPR, name); subrange->subrangeBranchTaken->previousRanges.push_back(subrange); - cemu_assert_debug(subrange->subrangeBranchTaken->start.index == RA_INTER_RANGE_START); + cemu_assert_debug(subrange->subrangeBranchTaken->interval2.ExtendsPreviousSegment()); } } if (imlSegment->nextSegmentBranchNotTaken) @@ -958,7 +1377,7 @@ raLivenessRange* PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx { subrange->subrangeBranchNotTaken = PPCRecRA_convertToMappedRanges(ctx, imlSegment->nextSegmentBranchNotTaken, vGPR, name); subrange->subrangeBranchNotTaken->previousRanges.push_back(subrange); - cemu_assert_debug(subrange->subrangeBranchNotTaken->start.index == RA_INTER_RANGE_START); + cemu_assert_debug(subrange->subrangeBranchNotTaken->interval2.ExtendsPreviousSegment()); } } } @@ -976,19 +1395,33 @@ raLivenessRange* PPCRecRA_convertToMappedRanges(IMLRegisterAllocatorContext& ctx } // for subranges which exit the segment at the end there is a hard requirement that they cover the suffix instruction // this is due to range load instructions being inserted before the suffix instruction - if (subrange->end.index == RA_INTER_RANGE_END) - { - if (imlSegment->HasSuffixInstruction()) - { - cemu_assert_debug(subrange->start.index <= imlSegment->GetSuffixInstructionIndex()); - } - } + // todo - currently later steps might break this assumption, look into this + // if (subrange->interval2.ExtendsIntoNextSegment()) + // { + // if (imlSegment->HasSuffixInstruction()) + // { + // cemu_assert_debug(subrange->interval2.start.GetInstructionIndex() <= imlSegment->GetSuffixInstructionIndex()); + // } + // } return subrange; } // take abstract range data and create LivenessRanges void IMLRA_ConvertAbstractToLivenessRanges(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment) { + const std::unordered_map& 
regToSubrange = IMLRA_GetSubrangeMap(imlSegment); + + auto AddOrUpdateFixedRegRequirement = [&](IMLRegID regId, sint32 instructionIndex, bool isInput, const IMLPhysRegisterSet& physRegSet) + { + raLivenessRange* subrange = regToSubrange.find(regId)->second; + cemu_assert_debug(subrange); + raFixedRegRequirement tmp; + tmp.pos.Set(instructionIndex, isInput); + tmp.allowedReg = physRegSet; + if(subrange->list_fixedRegRequirements.empty() || subrange->list_fixedRegRequirements.back().pos != tmp.pos) + subrange->list_fixedRegRequirements.push_back(tmp); + }; + // convert abstract min-max ranges to liveness range objects auto& segMap = ctx.GetSegmentAbstractRangeMap(imlSegment); for (auto& it : segMap) @@ -1001,7 +1434,6 @@ void IMLRA_ConvertAbstractToLivenessRanges(IMLRegisterAllocatorContext& ctx, IML // fill created ranges with read/write location indices // note that at this point there is only one range per register per segment // and the algorithm below relies on this - const std::unordered_map& regToSubrange = IMLRA_GetSubrangeMap(imlSegment); size_t index = 0; IMLUsedRegisters gprTracking; while (index < imlSegment->imlList.size()) @@ -1011,16 +1443,20 @@ void IMLRA_ConvertAbstractToLivenessRanges(IMLRegisterAllocatorContext& ctx, IML IMLRegID gprId = gprReg.GetRegID(); raLivenessRange* subrange = regToSubrange.find(gprId)->second; PPCRecRA_updateOrAddSubrangeLocation(subrange, index, !isWritten, isWritten); -#ifdef CEMU_DEBUG_ASSERT - if ((sint32)index < subrange->start.index) - { - IMLRARegAbstractLiveness* dbgAbstractRange = _GetAbstractRange(ctx, imlSegment, gprId); - assert_dbg(); - } - if ((sint32)index + 1 > subrange->end.index) - assert_dbg(); -#endif + cemu_assert_debug(!subrange->interval2.start.IsInstructionIndex() || subrange->interval2.start.GetInstructionIndex() <= index); + cemu_assert_debug(!subrange->interval2.end.IsInstructionIndex() || subrange->interval2.end.GetInstructionIndex() >= index); }); + // check fixed register requirements + 
IMLFixedRegisters fixedRegs; + GetInstructionFixedRegisters(&imlSegment->imlList[index], fixedRegs); + for(auto& fixedRegAccess : fixedRegs.listInput) + { + AddOrUpdateFixedRegRequirement(fixedRegAccess.reg.GetRegID(), index, true, fixedRegAccess.physRegSet); + } + for(auto& fixedRegAccess : fixedRegs.listOutput) + { + AddOrUpdateFixedRegRequirement(fixedRegAccess.reg.GetRegID(), index, false, fixedRegAccess.physRegSet); + } index++; } } @@ -1190,7 +1626,7 @@ void PPCRecRA_followFlowAndExtendRanges(IMLRegisterAllocatorContext& ctx, IMLSeg } } -void IMLRA_mergeCloseAbstractRanges(IMLRegisterAllocatorContext& ctx) +void IMLRA_MergeCloseAbstractRanges(IMLRegisterAllocatorContext& ctx) { for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++) { @@ -1201,7 +1637,7 @@ void IMLRA_mergeCloseAbstractRanges(IMLRegisterAllocatorContext& ctx) } } -void IMLRA_extendAbstracRangesOutOfLoops(IMLRegisterAllocatorContext& ctx) +void IMLRA_ExtendAbstractRangesOutOfLoops(IMLRegisterAllocatorContext& ctx) { for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++) { @@ -1238,15 +1674,15 @@ void IMLRA_extendAbstracRangesOutOfLoops(IMLRegisterAllocatorContext& ctx) void IMLRA_ProcessFlowAndCalculateLivenessRanges(IMLRegisterAllocatorContext& ctx) { - IMLRA_mergeCloseAbstractRanges(ctx); - // extra pass to move register stores out of loops - IMLRA_extendAbstracRangesOutOfLoops(ctx); + IMLRA_MergeCloseAbstractRanges(ctx); + // extra pass to move register loads and stores out of loops + IMLRA_ExtendAbstractRangesOutOfLoops(ctx); // calculate liveness ranges for (auto& segIt : ctx.deprGenContext->segmentList2) IMLRA_ConvertAbstractToLivenessRanges(ctx, segIt); } -void PPCRecRA_analyzeSubrangeDataDependencyV2(raLivenessRange* subrange) +void IMLRA_AnalyzeSubrangeDataDependency(raLivenessRange* subrange) { bool isRead = false; bool isWritten = false; @@ -1267,7 +1703,7 @@ void PPCRecRA_analyzeSubrangeDataDependencyV2(raLivenessRange* subrange) subrange->_noLoad = 
isOverwritten; subrange->hasStore = isWritten; - if (subrange->start.index == RA_INTER_RANGE_START) + if (subrange->interval2.ExtendsPreviousSegment()) subrange->_noLoad = true; } @@ -1294,7 +1730,7 @@ void _findSubrangeWriteEndings(raLivenessRange* subrange, uint32 iterationIndex, if (subrange->hasStoreDelayed) return; // no need to traverse this subrange IMLSegment* imlSegment = subrange->imlSegment; - if (subrange->end.index != RA_INTER_RANGE_END) + if (!subrange->interval2.ExtendsIntoNextSegment()) { // ending segment if (info->subrangeCount >= SUBRANGE_LIST_SIZE) @@ -1335,9 +1771,9 @@ void _findSubrangeWriteEndings(raLivenessRange* subrange, uint32 iterationIndex, } } -static void _analyzeRangeDataFlow(raLivenessRange* subrange) +static void IMLRA_AnalyzeRangeDataFlow(raLivenessRange* subrange) { - if (subrange->end.index != RA_INTER_RANGE_END) + if (!subrange->interval2.ExtendsIntoNextSegment()) return; // analyze data flow across segments (if this segment has writes) if (subrange->hasStore) @@ -1381,47 +1817,312 @@ static void _analyzeRangeDataFlow(raLivenessRange* subrange) void IMLRA_AnalyzeRangeDataFlow(ppcImlGenContext_t* ppcImlGenContext) { - // this function is called after _assignRegisters(), which means that all liveness ranges are already final and must not be changed anymore - // in the first pass we track read/write dependencies + // this function is called after _AssignRegisters(), which means that all liveness ranges are already final and must not be modified anymore + // track read/write dependencies per segment for(auto& seg : ppcImlGenContext->segmentList2) { raLivenessRange* subrange = seg->raInfo.linkedList_allSubranges; while(subrange) { - PPCRecRA_analyzeSubrangeDataDependencyV2(subrange); + IMLRA_AnalyzeSubrangeDataDependency(subrange); subrange = subrange->link_allSegmentRanges.next; } } - // then we do a second pass where we scan along subrange flow + // propagate information across segment boundaries for(auto& seg : 
ppcImlGenContext->segmentList2) { raLivenessRange* subrange = seg->raInfo.linkedList_allSubranges; while(subrange) { - _analyzeRangeDataFlow(subrange); + IMLRA_AnalyzeRangeDataFlow(subrange); subrange = subrange->link_allSegmentRanges.next; } } } +/* Generate move instructions */ + +inline IMLReg _MakeNativeReg(IMLRegFormat baseFormat, IMLRegID regId) +{ + return IMLReg(baseFormat, baseFormat, 0, regId); +} + +#define DEBUG_RA_INSTRUCTION_GEN 0 + +// prepass for IMLRA_GenerateSegmentMoveInstructions which updates all virtual registers to their physical counterparts +void IMLRA_RewriteRegisters(IMLRegisterAllocatorContext& ctx, IMLSegment* imlSegment) +{ + std::unordered_map virtId2PhysReg; + boost::container::small_vector activeRanges; + raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; + raInstructionEdge currentEdge; + for(size_t i=0; iimlList.size(); i++) + { + currentEdge.Set(i, false); // set to instruction index on output edge + // activate ranges which begin before or during this instruction + while(currentRange && currentRange->interval2.start <= currentEdge) + { + cemu_assert_debug(virtId2PhysReg.find(currentRange->GetVirtualRegister()) == virtId2PhysReg.end() || virtId2PhysReg[currentRange->GetVirtualRegister()] == currentRange->GetPhysicalRegister()); // check for register conflict + + virtId2PhysReg[currentRange->GetVirtualRegister()] = currentRange->GetPhysicalRegister(); + activeRanges.push_back(currentRange); + currentRange = currentRange->link_allSegmentRanges.next; + } + // rewrite registers + imlSegment->imlList[i].RewriteGPR(virtId2PhysReg); + // deactivate ranges which end during this instruction + auto it = activeRanges.begin(); + while(it != activeRanges.end()) + { + if((*it)->interval2.end <= currentEdge) + { + virtId2PhysReg.erase((*it)->GetVirtualRegister()); + it = activeRanges.erase(it); + } + else + ++it; + } + } +} + +void IMLRA_GenerateSegmentMoveInstructions2(IMLRegisterAllocatorContext& ctx, IMLSegment* 
imlSegment) +{ + IMLRA_RewriteRegisters(ctx, imlSegment); + +#if DEBUG_RA_INSTRUCTION_GEN + cemuLog_log(LogType::Force, ""); + cemuLog_log(LogType::Force, "[Seg before RA]"); + IMLDebug_DumpSegment(nullptr, imlSegment, true); +#endif + + bool hadSuffixInstruction = imlSegment->HasSuffixInstruction(); + + std::vector rebuiltInstructions; + sint32 numInstructionsWithoutSuffix = (sint32)imlSegment->imlList.size() - (imlSegment->HasSuffixInstruction() ? 1 : 0); + + if(imlSegment->imlList.empty()) + { + // empty segments need special handling (todo - look into merging this with the core logic below eventually) + // store all ranges + raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; + while(currentRange) + { + if(currentRange->hasStore) + rebuiltInstructions.emplace_back().make_name_r(currentRange->GetName(), _MakeNativeReg(ctx.regIdToBaseFormat[currentRange->GetVirtualRegister()], currentRange->GetPhysicalRegister())); + currentRange = currentRange->link_allSegmentRanges.next; + } + // load ranges + currentRange = imlSegment->raInfo.linkedList_allSubranges; + while(currentRange) + { + if(!currentRange->_noLoad) + { + cemu_assert_debug(currentRange->interval2.ExtendsIntoNextSegment()); + rebuiltInstructions.emplace_back().make_r_name(_MakeNativeReg(ctx.regIdToBaseFormat[currentRange->GetVirtualRegister()], currentRange->GetPhysicalRegister()), currentRange->GetName()); + } + currentRange = currentRange->link_allSegmentRanges.next; + } + imlSegment->imlList = std::move(rebuiltInstructions); + return; + } + + // make sure that no range exceeds the suffix instruction input edge except if they need to be loaded for the next segment (todo - for those, set the start point accordingly?) 
+ { + raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; + raInstructionEdge edge; + if(imlSegment->HasSuffixInstruction()) + edge.Set(numInstructionsWithoutSuffix, true); + else + edge.Set(numInstructionsWithoutSuffix-1, false); + + while(currentRange) + { + if(!currentRange->interval2.IsNextSegmentOnly() && currentRange->interval2.end > edge) + { + currentRange->interval2.SetEnd(edge); + } + currentRange = currentRange->link_allSegmentRanges.next; + } + } + +#if DEBUG_RA_INSTRUCTION_GEN + cemuLog_log(LogType::Force, ""); + cemuLog_log(LogType::Force, "--- Intermediate liveness info ---"); + { + raLivenessRange* dbgRange = imlSegment->raInfo.linkedList_allSubranges; + while(dbgRange) + { + cemuLog_log(LogType::Force, "Range i{}: {}-{}", dbgRange->GetVirtualRegister(), dbgRange->interval2.start.GetDebugString(), dbgRange->interval2.end.GetDebugString()); + dbgRange = dbgRange->link_allSegmentRanges.next; + } + } +#endif + + boost::container::small_vector activeRanges; + // first we add all the ranges that extend from the previous segment, some of these will end immediately at the first instruction so we might need to store them early + raLivenessRange* currentRange = imlSegment->raInfo.linkedList_allSubranges; + + // make all ranges active that start on RA_INTER_RANGE_START + while(currentRange && currentRange->interval2.start.ConnectsToPreviousSegment()) + { + activeRanges.push_back(currentRange); + currentRange = currentRange->link_allSegmentRanges.next; + } + // store all ranges that end before the first output edge (includes RA_INTER_RANGE_START) + auto it = activeRanges.begin(); + raInstructionEdge firstOutputEdge; + firstOutputEdge.Set(0, false); + while(it != activeRanges.end()) + { + if( (*it)->interval2.end < firstOutputEdge) + { + raLivenessRange* storedRange = *it; + if(storedRange->hasStore) + rebuiltInstructions.emplace_back().make_name_r(storedRange->GetName(), 
_MakeNativeReg(ctx.regIdToBaseFormat[storedRange->GetVirtualRegister()], storedRange->GetPhysicalRegister())); + it = activeRanges.erase(it); + continue; + } + ++it; + } + + sint32 numInstructions = (sint32)imlSegment->imlList.size(); + for(sint32 i=0; iinterval2.start <= curEdge) + { + if(!currentRange->_noLoad) + { + rebuiltInstructions.emplace_back().make_r_name(_MakeNativeReg(ctx.regIdToBaseFormat[currentRange->GetVirtualRegister()], currentRange->GetPhysicalRegister()), currentRange->GetName()); + } + activeRanges.push_back(currentRange); + currentRange = currentRange->link_allSegmentRanges.next; + } + // copy instruction + rebuiltInstructions.push_back(imlSegment->imlList[i]); + // output edge + curEdge.SetRaw(i*2+1+1); + // also store ranges that end on the next input edge, we handle this by adding an extra 1 above + auto it = activeRanges.begin(); + while(it != activeRanges.end()) + { + if( (*it)->interval2.end <= curEdge) + { + // range expires + // we cant erase it from virtId2PhysReg right away because a store might happen before the last use (the +1 thing above) + + + // todo - check hasStore + raLivenessRange* storedRange = *it; + if(storedRange->hasStore) + { + cemu_assert_debug(i != numInstructionsWithoutSuffix); // not allowed to emit after suffix + rebuiltInstructions.emplace_back().make_name_r(storedRange->GetName(), _MakeNativeReg(ctx.regIdToBaseFormat[storedRange->GetVirtualRegister()], storedRange->GetPhysicalRegister())); + } + + it = activeRanges.erase(it); + continue; + } + ++it; + } + } + // if there is no suffix instruction we currently need to handle the final loads here + cemu_assert_debug(hadSuffixInstruction == imlSegment->HasSuffixInstruction()); + if(imlSegment->HasSuffixInstruction()) + { + cemu_assert_debug(!currentRange); // currentRange should be NULL? 
+ for(auto& remainingRange : activeRanges) + { + cemu_assert_debug(!remainingRange->hasStore); + } + } + else + { + for(auto& remainingRange : activeRanges) + { + cemu_assert_debug(!remainingRange->hasStore); // this range still needs to be stored + } + while(currentRange) + { + cemu_assert_debug(currentRange->interval2.IsNextSegmentOnly()); + cemu_assert_debug(!currentRange->_noLoad); + rebuiltInstructions.emplace_back().make_r_name(_MakeNativeReg(ctx.regIdToBaseFormat[currentRange->GetVirtualRegister()], currentRange->GetPhysicalRegister()), currentRange->GetName()); + currentRange = currentRange->link_allSegmentRanges.next; + } + } + + imlSegment->imlList = std::move(rebuiltInstructions); + cemu_assert_debug(hadSuffixInstruction == imlSegment->HasSuffixInstruction()); + +#if DEBUG_RA_INSTRUCTION_GEN + cemuLog_log(LogType::Force, ""); + cemuLog_log(LogType::Force, "[Seg after RA]"); + IMLDebug_DumpSegment(nullptr, imlSegment, false); +#endif +} + +void IMLRA_GenerateMoveInstructions(IMLRegisterAllocatorContext& ctx) +{ + for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++) + { + IMLSegment* imlSegment = ctx.deprGenContext->segmentList2[s]; + IMLRA_GenerateSegmentMoveInstructions2(ctx, imlSegment); + } +} + +void DbgVerifyAllRanges(IMLRegisterAllocatorContext& ctx) +{ + for (size_t s = 0; s < ctx.deprGenContext->segmentList2.size(); s++) + { + IMLSegment* imlSegment = ctx.deprGenContext->segmentList2[s]; + raLivenessRange* subrangeItr = imlSegment->raInfo.linkedList_allSubranges; + while(subrangeItr) + { + PPCRecRA_debugValidateSubrange(subrangeItr); + subrangeItr = subrangeItr->link_allSegmentRanges.next; + } + } +} + void IMLRegisterAllocator_AllocateRegisters(ppcImlGenContext_t* ppcImlGenContext, IMLRegisterAllocatorParameters& raParam) { IMLRegisterAllocatorContext ctx; ctx.raParam = &raParam; ctx.deprGenContext = ppcImlGenContext; + DbgVerifyAllRanges(ctx); // DEBUG + IMLRA_ReshapeForRegisterAllocation(ppcImlGenContext); + 
DbgVerifyAllRanges(ctx); // DEBUG + ppcImlGenContext->UpdateSegmentIndices(); // update momentaryIndex of each segment + DbgVerifyAllRanges(ctx); // DEBUG ctx.perSegmentAbstractRanges.resize(ppcImlGenContext->segmentList2.size()); IMLRA_CalculateLivenessRanges(ctx); + DbgVerifyAllRanges(ctx); // DEBUG IMLRA_ProcessFlowAndCalculateLivenessRanges(ctx); + DbgVerifyAllRanges(ctx); // DEBUG IMLRA_AssignRegisters(ctx, ppcImlGenContext); + DbgVerifyAllRanges(ctx); // DEBUG + + // debug print + //IMLDebug_Dump(ppcImlGenContext, true); + + // debug print + // if (ppcImlGenContext->debug_entryPPCAddress == 0x2BDA9F4) + // { + // IMLDebug_Dump(ppcImlGenContext, true); + // __debugbreak(); + // } IMLRA_AnalyzeRangeDataFlow(ppcImlGenContext); IMLRA_GenerateMoveInstructions(ctx); + PPCRecRA_deleteAllRanges(ppcImlGenContext); } diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h index 52b20397..9e5573a6 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.h @@ -1,6 +1,7 @@ +#pragma once // container for storing a set of register indices -// specifically optimized towards storing physical register indices (expected to be below 64) +// specifically optimized towards storing typical range of physical register indices (expected to be below 64) class IMLPhysRegisterSet { public: @@ -33,11 +34,21 @@ public: return *this; } + void RemoveRegisters(const IMLPhysRegisterSet& other) + { + this->m_regBitmask &= ~other.m_regBitmask; + } + bool HasAnyAvailable() const { return m_regBitmask != 0; } + bool HasExactlyOneAvailable() const + { + return m_regBitmask != 0 && (m_regBitmask & (m_regBitmask - 1)) == 0; + } + // returns index of first available register. 
Do not call when HasAnyAvailable() == false uint32 GetFirstAvailableReg() { @@ -59,7 +70,7 @@ public: // returns index of next available register (search includes any register index >= startIndex) // returns -1 if there is no more register - sint32 GetNextAvailableReg(sint32 startIndex) + sint32 GetNextAvailableReg(sint32 startIndex) const { if (startIndex >= 64) return -1; @@ -81,6 +92,11 @@ public: return regIndex; } + sint32 CountAvailableRegs() const + { + return std::popcount(m_regBitmask); + } + private: uint64 m_regBitmask{ 0 }; }; diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp index 602cdfa7..e58b7888 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.cpp @@ -22,7 +22,6 @@ IMLName raLivenessRange::GetName() const void raLivenessRange::SetPhysicalRegister(sint32 physicalRegister) { - cemu_assert_suspicious(); // not used yet this->physicalRegister = physicalRegister; } @@ -68,6 +67,58 @@ boost::container::small_vector raLivenessRange::GetAllSubr return subranges; } +bool raLivenessRange::GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters) +{ + if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment()) + { + auto clusterRanges = GetAllSubrangesInCluster(); + bool hasAnyRequirement = false; + for(auto& subrange : clusterRanges) + { + if(subrange->list_fixedRegRequirements.empty()) + continue; + allowedRegisters = subrange->list_fixedRegRequirements.front().allowedReg; + hasAnyRequirement = true; + break; + } + if(!hasAnyRequirement) + return false; + for(auto& subrange : clusterRanges) + { + for(auto& fixedRegLoc : subrange->list_fixedRegRequirements) + allowedRegisters &= fixedRegLoc.allowedReg; + } + } + else + { + // local check only, slightly faster + if(list_fixedRegRequirements.empty()) + return false; + allowedRegisters = 
list_fixedRegRequirements.front().allowedReg; + for(auto& fixedRegLoc : list_fixedRegRequirements) + allowedRegisters &= fixedRegLoc.allowedReg; + } + return true; +} + +IMLPhysRegisterSet raLivenessRange::GetAllowedRegisters(IMLPhysRegisterSet regPool) +{ + IMLPhysRegisterSet fixedRegRequirements = regPool; + if(interval2.ExtendsPreviousSegment() || interval2.ExtendsIntoNextSegment()) + { + auto clusterRanges = GetAllSubrangesInCluster(); + for(auto& subrange : clusterRanges) + { + for(auto& fixedRegLoc : subrange->list_fixedRegRequirements) + fixedRegRequirements &= fixedRegLoc.allowedReg; + } + return fixedRegRequirements; + } + for(auto& fixedRegLoc : list_fixedRegRequirements) + fixedRegRequirements &= fixedRegLoc.allowedReg; + return fixedRegRequirements; +} + void PPCRecRARange_addLink_perVirtualGPR(std::unordered_map& root, raLivenessRange* subrange) { IMLRegID regId = subrange->GetVirtualRegister(); @@ -142,14 +193,19 @@ void PPCRecRARange_removeLink_allSegmentRanges(raLivenessRange** root, raLivenes MemoryPoolPermanentObjects memPool_livenessSubrange(4096); -raLivenessRange* PPCRecRA_createSubrange(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, sint32 startIndex, sint32 endIndex) +// startPosition and endPosition are inclusive +raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition) { raLivenessRange* range = memPool_livenessSubrange.acquireObj(); range->previousRanges.clear(); - range->list_locations.resize(0); + range->list_locations.clear(); + range->list_fixedRegRequirements.clear(); range->imlSegment = imlSegment; - PPCRecompilerIml_setSegmentPoint(&range->start, imlSegment, startIndex); - PPCRecompilerIml_setSegmentPoint(&range->end, imlSegment, endIndex); + + cemu_assert_debug(startPosition <= endPosition); + range->interval2.start = 
startPosition; + range->interval2.end = endPosition; + // register mapping range->virtualRegister = virtualRegister; range->name = name; @@ -160,6 +216,7 @@ raLivenessRange* PPCRecRA_createSubrange(ppcImlGenContext_t* ppcImlGenContext, I range->lastIterationIndex = 0; range->subrangeBranchNotTaken = nullptr; range->subrangeBranchTaken = nullptr; + cemu_assert_debug(range->previousRanges.empty()); range->_noLoad = false; // add to segment linked lists PPCRecRARange_addLink_perVirtualGPR(imlSegment->raInfo.linkedList_perVirtualRegister, range); @@ -172,6 +229,22 @@ void _unlinkSubrange(raLivenessRange* subrange) IMLSegment* imlSegment = subrange->imlSegment; PPCRecRARange_removeLink_perVirtualGPR(imlSegment->raInfo.linkedList_perVirtualRegister, subrange); PPCRecRARange_removeLink_allSegmentRanges(&imlSegment->raInfo.linkedList_allSubranges, subrange); + // unlink reverse references + if(subrange->subrangeBranchTaken) + subrange->subrangeBranchTaken->previousRanges.erase(std::find(subrange->subrangeBranchTaken->previousRanges.begin(), subrange->subrangeBranchTaken->previousRanges.end(), subrange)); + if(subrange->subrangeBranchNotTaken) + subrange->subrangeBranchNotTaken->previousRanges.erase(std::find(subrange->subrangeBranchNotTaken->previousRanges.begin(), subrange->subrangeBranchNotTaken->previousRanges.end(), subrange)); + subrange->subrangeBranchTaken = (raLivenessRange*)(uintptr_t)-1; + subrange->subrangeBranchNotTaken = (raLivenessRange*)(uintptr_t)-1; + // remove forward references + for(auto& prev : subrange->previousRanges) + { + if(prev->subrangeBranchTaken == subrange) + prev->subrangeBranchTaken = nullptr; + if(prev->subrangeBranchNotTaken == subrange) + prev->subrangeBranchNotTaken = nullptr; + } + subrange->previousRanges.clear(); } void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange) @@ -179,14 +252,9 @@ void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRan _unlinkSubrange(subrange); 
//subrange->range->list_subranges.erase(std::find(subrange->range->list_subranges.begin(), subrange->range->list_subranges.end(), subrange)); subrange->list_locations.clear(); - // unlink reverse references - if(subrange->subrangeBranchTaken) - subrange->subrangeBranchTaken->previousRanges.erase(std::find(subrange->subrangeBranchTaken->previousRanges.begin(), subrange->subrangeBranchTaken->previousRanges.end(), subrange)); - if(subrange->subrangeBranchNotTaken) - subrange->subrangeBranchTaken->previousRanges.erase(std::find(subrange->subrangeBranchNotTaken->previousRanges.begin(), subrange->subrangeBranchNotTaken->previousRanges.end(), subrange)); - PPCRecompilerIml_removeSegmentPoint(&subrange->start); - PPCRecompilerIml_removeSegmentPoint(&subrange->end); + //PPCRecompilerIml_removeSegmentPoint(&subrange->interval.start); + //PPCRecompilerIml_removeSegmentPoint(&subrange->interval.end); memPool_livenessSubrange.releaseObj(subrange); } @@ -194,9 +262,18 @@ void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRan void _PPCRecRA_deleteSubrangeNoUnlink(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange) { _unlinkSubrange(subrange); - PPCRecompilerIml_removeSegmentPoint(&subrange->start); - PPCRecompilerIml_removeSegmentPoint(&subrange->end); + //PPCRecompilerIml_removeSegmentPoint(&subrange->interval.start); + //PPCRecompilerIml_removeSegmentPoint(&subrange->interval.end); memPool_livenessSubrange.releaseObj(subrange); + +// #ifdef CEMU_DEBUG_ASSERT +// // DEBUG BEGIN +// subrange->lastIterationIndex = 0xFFFFFFFE; +// subrange->subrangeBranchTaken = (raLivenessRange*)(uintptr_t)-1; +// subrange->subrangeBranchNotTaken = (raLivenessRange*)(uintptr_t)-1; +// +// // DEBUG END +// #endif } void PPCRecRA_deleteSubrangeCluster(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange) @@ -229,8 +306,8 @@ void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRan 
PPCRecRA_debugValidateSubrange(absorbedSubrange); if (subrange->imlSegment != absorbedSubrange->imlSegment) assert_dbg(); - if (subrange->end.index > absorbedSubrange->start.index) - assert_dbg(); + cemu_assert_debug(subrange->interval2.end == absorbedSubrange->interval2.start); + if (subrange->subrangeBranchTaken || subrange->subrangeBranchNotTaken) assert_dbg(); if (subrange == absorbedSubrange) @@ -238,21 +315,45 @@ void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRan #endif // update references - if(absorbedSubrange->subrangeBranchTaken) - *std::find(absorbedSubrange->subrangeBranchTaken->previousRanges.begin(), absorbedSubrange->subrangeBranchTaken->previousRanges.end(), absorbedSubrange) = subrange; - if(absorbedSubrange->subrangeBranchNotTaken) - *std::find(absorbedSubrange->subrangeBranchNotTaken->previousRanges.begin(), absorbedSubrange->subrangeBranchNotTaken->previousRanges.end(), absorbedSubrange) = subrange; subrange->subrangeBranchTaken = absorbedSubrange->subrangeBranchTaken; subrange->subrangeBranchNotTaken = absorbedSubrange->subrangeBranchNotTaken; + absorbedSubrange->subrangeBranchTaken = nullptr; + absorbedSubrange->subrangeBranchNotTaken = nullptr; + if(subrange->subrangeBranchTaken) + *std::find(subrange->subrangeBranchTaken->previousRanges.begin(), subrange->subrangeBranchTaken->previousRanges.end(), absorbedSubrange) = subrange; + if(subrange->subrangeBranchNotTaken) + *std::find(subrange->subrangeBranchNotTaken->previousRanges.begin(), subrange->subrangeBranchNotTaken->previousRanges.end(), absorbedSubrange) = subrange; // merge usage locations + // at the merge point both ranges might track the same instruction, we handle this by first merging this duplicate location + if(subrange && absorbedSubrange && !subrange->list_locations.empty() && !absorbedSubrange->list_locations.empty()) + { + if(subrange->list_locations.back().index == absorbedSubrange->list_locations.front().index) + { + 
subrange->list_locations.back().isRead |= absorbedSubrange->list_locations.front().isRead; + subrange->list_locations.back().isWrite |= absorbedSubrange->list_locations.front().isWrite; + absorbedSubrange->list_locations.erase(absorbedSubrange->list_locations.begin()); // inefficient + } + } for (auto& location : absorbedSubrange->list_locations) { + cemu_assert_debug(subrange->list_locations.empty() || (subrange->list_locations.back().index < location.index)); // todo - sometimes a subrange can contain the same instruction at the merge point if they are covering half of the instruction edge subrange->list_locations.push_back(location); } absorbedSubrange->list_locations.clear(); + // merge fixed reg locations +#ifdef CEMU_DEBUG_ASSERT + if(!subrange->list_fixedRegRequirements.empty() && !absorbedSubrange->list_fixedRegRequirements.empty()) + { + cemu_assert_debug(subrange->list_fixedRegRequirements.back().pos < absorbedSubrange->list_fixedRegRequirements.front().pos); + } +#endif + for (auto& fixedReg : absorbedSubrange->list_fixedRegRequirements) + { + subrange->list_fixedRegRequirements.push_back(fixedReg); + } - subrange->end.index = absorbedSubrange->end.index; + subrange->interval2.end = absorbedSubrange->interval2.end; PPCRecRA_debugValidateSubrange(subrange); @@ -262,16 +363,21 @@ void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRan // remove all inter-segment connections from the range cluster and split it into local ranges (also removes empty ranges) void PPCRecRA_explodeRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* originRange) { + cemu_assert_debug(originRange->interval2.ExtendsPreviousSegment() || originRange->interval2.ExtendsIntoNextSegment()); // only call this on ranges that span multiple segments auto clusterRanges = originRange->GetAllSubrangesInCluster(); for (auto& subrange : clusterRanges) { if (subrange->list_locations.empty()) continue; - raLivenessRange* newSubrange = 
PPCRecRA_createSubrange(ppcImlGenContext, subrange->imlSegment, subrange->GetVirtualRegister(), subrange->GetName(), subrange->list_locations.data()[0].index, subrange->list_locations.data()[subrange->list_locations.size() - 1].index + 1); - // copy locations - for (auto& location : subrange->list_locations) + raInterval interval; + interval.SetInterval(subrange->list_locations.front().index, true, subrange->list_locations.back().index, true); + raLivenessRange* newSubrange = PPCRecRA_createSubrange2(ppcImlGenContext, subrange->imlSegment, subrange->GetVirtualRegister(), subrange->GetName(), interval.start, interval.end); + // copy locations and fixed reg indices + newSubrange->list_locations = subrange->list_locations; + newSubrange->list_fixedRegRequirements = subrange->list_fixedRegRequirements; + if(originRange->HasPhysicalRegister()) { - newSubrange->list_locations.push_back(location); + cemu_assert_debug(subrange->list_fixedRegRequirements.empty()); // avoid unassigning a register from a range with a fixed register requirement } } // remove subranges @@ -279,82 +385,223 @@ void PPCRecRA_explodeRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange } #ifdef CEMU_DEBUG_ASSERT -void PPCRecRA_debugValidateSubrange(raLivenessRange* subrange) +void PPCRecRA_debugValidateSubrange(raLivenessRange* range) { // validate subrange - if (subrange->subrangeBranchTaken && subrange->subrangeBranchTaken->imlSegment != subrange->imlSegment->nextSegmentBranchTaken) + if (range->subrangeBranchTaken && range->subrangeBranchTaken->imlSegment != range->imlSegment->nextSegmentBranchTaken) assert_dbg(); - if (subrange->subrangeBranchNotTaken && subrange->subrangeBranchNotTaken->imlSegment != subrange->imlSegment->nextSegmentBranchNotTaken) + if (range->subrangeBranchNotTaken && range->subrangeBranchNotTaken->imlSegment != range->imlSegment->nextSegmentBranchNotTaken) assert_dbg(); + + if(range->subrangeBranchTaken || range->subrangeBranchNotTaken) + { + 
cemu_assert_debug(range->interval2.end.ConnectsToNextSegment()); + } + if(!range->previousRanges.empty()) + { + cemu_assert_debug(range->interval2.start.ConnectsToPreviousSegment()); + } + // validate locations + if (!range->list_locations.empty()) + { + cemu_assert_debug(range->list_locations.front().index >= range->interval2.start.GetInstructionIndexEx()); + cemu_assert_debug(range->list_locations.back().index <= range->interval2.end.GetInstructionIndexEx()); + } + } #else -void PPCRecRA_debugValidateSubrange(raLivenessSubrange_t* subrange) {} +void PPCRecRA_debugValidateSubrange(raLivenessRange* range) {} #endif -// split subrange at the given index -// After the split there will be two ranges and subranges: +// since locations are per-instruction, but intervals are per-edge, it's possible that locations track reads/writes outside of the range +// this function will remove any outside read/write locations +void IMLRA_FixLocations(raLivenessRange* range) +{ + if(range->list_locations.empty()) + return; + if(range->interval2.start.IsInstructionIndex() && range->interval2.start.GetInstructionIndex() == range->list_locations.front().index) + { + auto& location = range->list_locations.front(); + if(range->interval2.start.IsOnOutputEdge()) + { + location.isRead = false; + if(!location.isRead && !location.isWrite) + range->list_locations.erase(range->list_locations.begin()); + } + } + if(range->list_locations.empty()) + return; + if(range->interval2.end.IsInstructionIndex() && range->interval2.end.GetInstructionIndex() == range->list_locations.back().index) + { + auto& location = range->list_locations.back(); + if(range->interval2.end.IsOnInputEdge()) + { + location.isWrite = false; + if(!location.isRead && !location.isWrite) + range->list_locations.pop_back(); + } + } +} + +// trim start and end of range to match first and last read/write locations +// does not trim start/endpoints which extend into the next/previous segment +void IMLRA_TrimRangeToUse(raLivenessRange* 
range) +{ + if(range->list_locations.empty()) + { + // special case where we trim ranges extending from other segments to a single instruction edge + cemu_assert_debug(!range->interval2.start.IsInstructionIndex() || !range->interval2.end.IsInstructionIndex()); + if(range->interval2.start.IsInstructionIndex()) + range->interval2.start = range->interval2.end; + if(range->interval2.end.IsInstructionIndex()) + range->interval2.end = range->interval2.start; + return; + } + raInterval prevInterval = range->interval2; + // trim start + if(range->interval2.start.IsInstructionIndex()) + { + bool isInputEdge = range->list_locations.front().isRead; + range->interval2.start.Set(range->list_locations.front().index, isInputEdge); + } + // trim end + if(range->interval2.end.IsInstructionIndex()) + { + bool isOutputEdge = range->list_locations.back().isWrite; + range->interval2.end.Set(range->list_locations.back().index, !isOutputEdge); + } + // extra checks +#ifdef CEMU_DEBUG_ASSERT + cemu_assert_debug(range->interval2.start <= range->interval2.end); + for(auto& loc : range->list_locations) + { + cemu_assert_debug(range->interval2.ContainsInstructionIndex(loc.index)); + } + cemu_assert_debug(prevInterval.ContainsWholeInterval(range->interval2)); +#endif +} + +// split range at the given position +// After the split there will be two ranges: // head -> subrange is shortened to end at splitIndex (exclusive) // tail -> a new subrange that ranges from splitIndex (inclusive) to the end of the original subrange // if head has a physical register assigned it will not carry over to tail -// The return value is the tail subrange -// If trimToHole is true, the end of the head subrange and the start of the tail subrange will be moved to fit the locations -// Ranges that begin at RA_INTER_RANGE_START are allowed and can be split -raLivenessRange* PPCRecRA_splitLocalSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange, sint32 splitIndex, bool trimToHole) +// The return 
value is the tail range +// If trimToHole is true, the end of the head subrange and the start of the tail subrange will be shrunk to fit the read/write locations within them +// the range after the split point does not inherit the physical register +// if trimToHole is true and any of the halfes is empty, it will be deleted +raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange*& subrange, raInstructionEdge splitPosition, bool trimToHole) { - // validation -#ifdef CEMU_DEBUG_ASSERT - //if (subrange->end.index == RA_INTER_RANGE_END || subrange->end.index == RA_INTER_RANGE_START) - // assert_dbg(); - if (subrange->start.index == RA_INTER_RANGE_END || subrange->end.index == RA_INTER_RANGE_START) - assert_dbg(); - if (subrange->start.index >= splitIndex) - assert_dbg(); - if (subrange->end.index <= splitIndex) - assert_dbg(); -#endif + cemu_assert_debug(splitPosition.IsInstructionIndex()); + cemu_assert_debug(!subrange->interval2.IsNextSegmentOnly() && !subrange->interval2.IsPreviousSegmentOnly()); + cemu_assert_debug(subrange->interval2.ContainsEdge(splitPosition)); + // determine new intervals + raInterval headInterval, tailInterval; + headInterval.SetInterval(subrange->interval2.start, splitPosition-1); + tailInterval.SetInterval(splitPosition, subrange->interval2.end); + cemu_assert_debug(headInterval.start <= headInterval.end); + cemu_assert_debug(tailInterval.start <= tailInterval.end); // create tail - raLivenessRange* tailSubrange = PPCRecRA_createSubrange(ppcImlGenContext, subrange->imlSegment, subrange->GetVirtualRegister(), subrange->GetName(), splitIndex, subrange->end.index); - // copy locations + raLivenessRange* tailSubrange = PPCRecRA_createSubrange2(ppcImlGenContext, subrange->imlSegment, subrange->GetVirtualRegister(), subrange->GetName(), tailInterval.start, tailInterval.end); + tailSubrange->SetPhysicalRegister(subrange->GetPhysicalRegister()); + // carry over branch targets and update reverse references 
+ tailSubrange->subrangeBranchTaken = subrange->subrangeBranchTaken; + tailSubrange->subrangeBranchNotTaken = subrange->subrangeBranchNotTaken; + subrange->subrangeBranchTaken = nullptr; + subrange->subrangeBranchNotTaken = nullptr; + if(tailSubrange->subrangeBranchTaken) + *std::find(tailSubrange->subrangeBranchTaken->previousRanges.begin(), tailSubrange->subrangeBranchTaken->previousRanges.end(), subrange) = tailSubrange; + if(tailSubrange->subrangeBranchNotTaken) + *std::find(tailSubrange->subrangeBranchNotTaken->previousRanges.begin(), tailSubrange->subrangeBranchNotTaken->previousRanges.end(), subrange) = tailSubrange; + // we assume that list_locations is ordered by instruction index and contains no duplicate indices, so lets check that here just in case +#ifdef CEMU_DEBUG_ASSERT + if(!subrange->list_locations.empty()) + { + sint32 curIdx = -1; + for(auto& location : subrange->list_locations) + { + cemu_assert_debug(curIdx < location.index); + curIdx = location.index; + } + } +#endif + // split locations + // since there are 2 edges per instruction and locations track both via a single index, locations on the split point might need to be copied into both ranges for (auto& location : subrange->list_locations) { - if (location.index >= splitIndex) + if(tailInterval.ContainsInstructionIndex(location.index)) tailSubrange->list_locations.push_back(location); } // remove tail locations from head for (sint32 i = 0; i < subrange->list_locations.size(); i++) { raLivenessLocation_t* location = subrange->list_locations.data() + i; - if (location->index >= splitIndex) + if (!headInterval.ContainsInstructionIndex(location->index)) { subrange->list_locations.resize(i); break; } } - // adjust start/end - if (trimToHole) + // split fixed reg requirements + for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++) { - if (subrange->list_locations.empty()) + raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i; + if 
(tailInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex())) { - subrange->end.index = subrange->start.index+1; + tailSubrange->list_fixedRegRequirements.push_back(*fixedReg); + } + } + // remove tail fixed reg requirements from head + for (sint32 i = 0; i < subrange->list_fixedRegRequirements.size(); i++) + { + raFixedRegRequirement* fixedReg = subrange->list_fixedRegRequirements.data() + i; + if (!headInterval.ContainsInstructionIndex(fixedReg->pos.GetInstructionIndex())) + { + subrange->list_fixedRegRequirements.resize(i); + break; + } + } + // adjust intervals + subrange->interval2 = headInterval; + tailSubrange->interval2 = tailInterval; + // fix locations to only include read/write edges within the range + if(subrange) + IMLRA_FixLocations(subrange); + if(tailSubrange) + IMLRA_FixLocations(tailSubrange); + // trim to hole + if(trimToHole) + { + if(subrange->list_locations.empty() && (subrange->interval2.start.IsInstructionIndex() && subrange->interval2.end.IsInstructionIndex())) + { + PPCRecRA_deleteSubrange(ppcImlGenContext, subrange); + subrange = nullptr; } else { - subrange->end.index = subrange->list_locations.back().index + 1; + IMLRA_TrimRangeToUse(subrange); } - if (tailSubrange->list_locations.empty()) + if(tailSubrange->list_locations.empty() && (tailSubrange->interval2.start.IsInstructionIndex() && tailSubrange->interval2.end.IsInstructionIndex())) { - assert_dbg(); // should not happen? 
(In this case we can just avoid generating a tail at all) + PPCRecRA_deleteSubrange(ppcImlGenContext, tailSubrange); + tailSubrange = nullptr; } else { - tailSubrange->start.index = tailSubrange->list_locations.front().index; + IMLRA_TrimRangeToUse(tailSubrange); } } - else - { - // set head range to end at split index - subrange->end.index = splitIndex; - } + // validation + cemu_assert_debug(!subrange || subrange->interval2.start <= subrange->interval2.end); + cemu_assert_debug(!tailSubrange || tailSubrange->interval2.start <= tailSubrange->interval2.end); + cemu_assert_debug(!tailSubrange || tailSubrange->interval2.start >= splitPosition); + if (!trimToHole) + cemu_assert_debug(!tailSubrange || tailSubrange->interval2.start == splitPosition); + + if(subrange) + PPCRecRA_debugValidateSubrange(subrange); + if(tailSubrange) + PPCRecRA_debugValidateSubrange(tailSubrange); return tailSubrange; } @@ -401,13 +648,13 @@ sint32 PPCRecRARange_estimateTotalCost(std::span ranges) for (auto& subrange : ranges) { - if (subrange->start.index != RA_INTER_RANGE_START) + if (!subrange->interval2.ExtendsPreviousSegment()) { //cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment); mostExpensiveRead = std::max(mostExpensiveRead, PPCRecRARange_getReadWriteCost(subrange->imlSegment)); readCount++; } - if (subrange->end.index != RA_INTER_RANGE_END) + if (!subrange->interval2.ExtendsIntoNextSegment()) { //cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment); mostExpensiveWrite = std::max(mostExpensiveWrite, PPCRecRARange_getReadWriteCost(subrange->imlSegment)); @@ -433,13 +680,14 @@ sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange) return cost; } -sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, sint32 splitIndex) +sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition) { // validation #ifdef CEMU_DEBUG_ASSERT - if (subrange->end.index == 
RA_INTER_RANGE_END) + if (subrange->interval2.ExtendsIntoNextSegment()) assert_dbg(); #endif + cemu_assert_debug(splitPosition.IsInstructionIndex()); sint32 cost = 0; // find split position in location list @@ -448,25 +696,15 @@ sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, assert_dbg(); // should not happen? return 0; } - if (splitIndex <= subrange->list_locations.front().index) + sint32 splitInstructionIndex = splitPosition.GetInstructionIndex(); + if (splitInstructionIndex <= subrange->list_locations.front().index) return 0; - if (splitIndex > subrange->list_locations.back().index) + if (splitInstructionIndex > subrange->list_locations.back().index) return 0; // todo - determine exact cost of split subranges cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // currently we assume that the additional region will require a read and a store - //for (sint32 f = 0; f < subrange->list_locations.size(); f++) - //{ - // raLivenessLocation_t* location = subrange->list_locations.data() + f; - // if (location->index >= splitIndex) - // { - // ... 
- // return cost; - // } - //} - return cost; -} - +} \ No newline at end of file diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h index 31deaab3..4467d2f0 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocatorRanges.h @@ -1,4 +1,5 @@ #pragma once +#include "IMLRegisterAllocator.h" struct raLivenessLocation_t { @@ -18,11 +19,286 @@ struct raLivenessSubrangeLink struct raLivenessRange* next; }; +struct raInstructionEdge +{ + friend struct raInterval; +public: + raInstructionEdge() + { + index = 0; + } + + raInstructionEdge(sint32 instructionIndex, bool isInputEdge) + { + Set(instructionIndex, isInputEdge); + } + + void Set(sint32 instructionIndex, bool isInputEdge) + { + if(instructionIndex == RA_INTER_RANGE_START || instructionIndex == RA_INTER_RANGE_END) + { + index = instructionIndex; + return; + } + index = instructionIndex * 2 + (isInputEdge ? 
0 : 1); + cemu_assert_debug(index >= 0 && index < 0x100000*2); // make sure index value is sane + } + + void SetRaw(sint32 index) + { + this->index = index; + cemu_assert_debug(index == RA_INTER_RANGE_START || index == RA_INTER_RANGE_END || (index >= 0 && index < 0x100000*2)); // make sure index value is sane + } + + // sint32 GetRaw() + // { + // this->index = index; + // } + + std::string GetDebugString() + { + if(index == RA_INTER_RANGE_START) + return "RA_START"; + else if(index == RA_INTER_RANGE_END) + return "RA_END"; + std::string str = fmt::format("{}", GetInstructionIndex()); + if(IsOnInputEdge()) + str += "i"; + else if(IsOnOutputEdge()) + str += "o"; + return str; + } + + sint32 GetInstructionIndex() const + { + cemu_assert_debug(index != RA_INTER_RANGE_START && index != RA_INTER_RANGE_END); + return index >> 1; + } + + // returns instruction index or RA_INTER_RANGE_START/RA_INTER_RANGE_END + sint32 GetInstructionIndexEx() const + { + if(index == RA_INTER_RANGE_START || index == RA_INTER_RANGE_END) + return index; + return index >> 1; + } + + sint32 GetRaw() const + { + return index; + } + + bool IsOnInputEdge() const + { + cemu_assert_debug(index != RA_INTER_RANGE_START && index != RA_INTER_RANGE_END); + return (index&1) == 0; + } + + bool IsOnOutputEdge() const + { + cemu_assert_debug(index != RA_INTER_RANGE_START && index != RA_INTER_RANGE_END); + return (index&1) != 0; + } + + bool ConnectsToPreviousSegment() const + { + return index == RA_INTER_RANGE_START; + } + + bool ConnectsToNextSegment() const + { + return index == RA_INTER_RANGE_END; + } + + bool IsInstructionIndex() const + { + return index != RA_INTER_RANGE_START && index != RA_INTER_RANGE_END; + } + + // comparison operators + bool operator>(const raInstructionEdge& other) const + { + return index > other.index; + } + bool operator<(const raInstructionEdge& other) const + { + return index < other.index; + } + bool operator<=(const raInstructionEdge& other) const + { + return index <= 
other.index; + } + bool operator>=(const raInstructionEdge& other) const + { + return index >= other.index; + } + bool operator==(const raInstructionEdge& other) const + { + return index == other.index; + } + + raInstructionEdge operator+(sint32 offset) const + { + cemu_assert_debug(IsInstructionIndex()); + cemu_assert_debug(offset >= 0 && offset < RA_INTER_RANGE_END); + raInstructionEdge edge; + edge.index = index + offset; + return edge; + } + + raInstructionEdge operator-(sint32 offset) const + { + cemu_assert_debug(IsInstructionIndex()); + cemu_assert_debug(offset >= 0 && offset < RA_INTER_RANGE_END); + raInstructionEdge edge; + edge.index = index - offset; + return edge; + } + + raInstructionEdge& operator++() + { + cemu_assert_debug(IsInstructionIndex()); + index++; + return *this; + } + +private: + sint32 index; // can also be RA_INTER_RANGE_START or RA_INTER_RANGE_END, otherwise contains instruction index * 2 + +}; + +struct raInterval +{ + raInterval() + { + + } + + raInterval(raInstructionEdge start, raInstructionEdge end) + { + SetInterval(start, end); + } + + // isStartOnInput = Input+Output edge on first instruction. If false then only output + // isEndOnOutput = Input+Output edge on last instruction. 
If false then only input + void SetInterval(sint32 start, bool isStartOnInput, sint32 end, bool isEndOnOutput) + { + this->start.Set(start, isStartOnInput); + this->end.Set(end, !isEndOnOutput); + } + + void SetInterval(raInstructionEdge start, raInstructionEdge end) + { + cemu_assert_debug(start <= end); + this->start = start; + this->end = end; + } + + void SetStart(const raInstructionEdge& edge) + { + start = edge; + } + + void SetEnd(const raInstructionEdge& edge) + { + end = edge; + } + + sint32 GetStartIndex() const + { + return start.GetInstructionIndex(); + } + + sint32 GetEndIndex() const + { + return end.GetInstructionIndex(); + } + + bool ExtendsPreviousSegment() const + { + return start.ConnectsToPreviousSegment(); + } + + bool ExtendsIntoNextSegment() const + { + return end.ConnectsToNextSegment(); + } + + bool IsNextSegmentOnly() const + { + return start.ConnectsToNextSegment() && end.ConnectsToNextSegment(); + } + + bool IsPreviousSegmentOnly() const + { + return start.ConnectsToPreviousSegment() && end.ConnectsToPreviousSegment(); + } + + // returns true if range is contained within a single segment + bool IsLocal() const + { + return start.GetRaw() > RA_INTER_RANGE_START && end.GetRaw() < RA_INTER_RANGE_END; + } + + bool ContainsInstructionIndex(sint32 instructionIndex) const + { + cemu_assert_debug(instructionIndex != RA_INTER_RANGE_START && instructionIndex != RA_INTER_RANGE_END); + return instructionIndex >= start.GetInstructionIndexEx() && instructionIndex <= end.GetInstructionIndexEx(); + } + + // similar to ContainsInstructionIndex, but allows RA_INTER_RANGE_START/END as input + bool ContainsInstructionIndexEx(sint32 instructionIndex) const + { + if(instructionIndex == RA_INTER_RANGE_START) + return start.ConnectsToPreviousSegment(); + if(instructionIndex == RA_INTER_RANGE_END) + return end.ConnectsToNextSegment(); + return instructionIndex >= start.GetInstructionIndexEx() && instructionIndex <= end.GetInstructionIndexEx(); + } + + bool 
ContainsEdge(const raInstructionEdge& edge) const + { + return edge >= start && edge <= end; + } + + bool ContainsWholeInterval(const raInterval& other) const + { + return other.start >= start && other.end <= end; + } + + bool IsOverlapping(const raInterval& other) const + { + return start <= other.end && end >= other.start; + } + + sint32 GetPreciseDistance() + { + cemu_assert_debug(!start.ConnectsToNextSegment()); // how to handle this? + if(start == end) + return 1; + cemu_assert_debug(!end.ConnectsToPreviousSegment() && !end.ConnectsToNextSegment()); + if(start.ConnectsToPreviousSegment()) + return end.GetRaw() + 1; + + return end.GetRaw() - start.GetRaw() + 1; // +1 because end is inclusive + } + +//private: not making these directly accessible only forces us to create loads of verbose getters and setters + raInstructionEdge start; + raInstructionEdge end; +}; + +struct raFixedRegRequirement +{ + raInstructionEdge pos; + IMLPhysRegisterSet allowedReg; +}; + struct raLivenessRange { IMLSegment* imlSegment; - IMLSegmentPoint start; - IMLSegmentPoint end; + raInterval interval2; + // dirty state tracking bool _noLoad; bool hasStore; @@ -34,28 +310,34 @@ struct raLivenessRange boost::container::small_vector previousRanges; // processing uint32 lastIterationIndex; - // instruction locations + // instruction read/write locations std::vector list_locations; + // ordered list of all raInstructionEdge indices which require a fixed register + std::vector list_fixedRegRequirements; // linked list (subranges with same GPR virtual register) raLivenessSubrangeLink link_sameVirtualRegister; // linked list (all subranges for this segment) raLivenessSubrangeLink link_allSegmentRanges; - // register mapping (constant) + // register info IMLRegID virtualRegister; IMLName name; // register allocator result sint32 physicalRegister; boost::container::small_vector GetAllSubrangesInCluster(); + bool GetAllowedRegistersEx(IMLPhysRegisterSet& allowedRegisters); // if the cluster has 
fixed register requirements in any instruction this returns the combined register mask. Otherwise returns false in which case allowedRegisters is left undefined + IMLPhysRegisterSet GetAllowedRegisters(IMLPhysRegisterSet regPool); // return regPool with fixed register requirements filtered out IMLRegID GetVirtualRegister() const; sint32 GetPhysicalRegister() const; + bool HasPhysicalRegister() const { return physicalRegister >= 0; } IMLName GetName() const; void SetPhysicalRegister(sint32 physicalRegister); void SetPhysicalRegisterForCluster(sint32 physicalRegister); + void UnsetPhysicalRegister() { physicalRegister = -1; } }; -raLivenessRange* PPCRecRA_createSubrange(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, sint32 startIndex, sint32 endIndex); +raLivenessRange* PPCRecRA_createSubrange2(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, IMLRegID virtualRegister, IMLName name, raInstructionEdge startPosition, raInstructionEdge endPosition); void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange); void PPCRecRA_deleteAllRanges(ppcImlGenContext_t* ppcImlGenContext); @@ -63,7 +345,7 @@ void PPCRecRA_explodeRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange, raLivenessRange* absorbedSubrange); -raLivenessRange* PPCRecRA_splitLocalSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange* subrange, sint32 splitIndex, bool trimToHole = false); +raLivenessRange* PPCRecRA_splitLocalSubrange2(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange*& subrange, raInstructionEdge splitPosition, bool trimToHole = false); void PPCRecRA_updateOrAddSubrangeLocation(raLivenessRange* subrange, sint32 index, bool isRead, bool isWrite); void PPCRecRA_debugValidateSubrange(raLivenessRange* subrange); @@ -71,8 +353,5 @@ void PPCRecRA_debugValidateSubrange(raLivenessRange* 
subrange); // cost estimation sint32 PPCRecRARange_getReadWriteCost(IMLSegment* imlSegment); sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange); -sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, sint32 splitIndex); - -// special values to mark the index of ranges that reach across the segment border -#define RA_INTER_RANGE_START (-1) -#define RA_INTER_RANGE_END (0x70000000) +//sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, sint32 splitIndex); +sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition); \ No newline at end of file diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h index 0589d660..10e3dc06 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLSegment.h @@ -3,12 +3,121 @@ #include +// special values to mark the index of ranges that reach across the segment border +#define RA_INTER_RANGE_START (-1) +#define RA_INTER_RANGE_END (0x70000000) + struct IMLSegmentPoint { + friend struct IMLSegmentInterval; + sint32 index; - struct IMLSegment* imlSegment; + struct IMLSegment* imlSegment; // do we really need to track this? SegmentPoints are always accessed via the segment that they are part of IMLSegmentPoint* next; IMLSegmentPoint* prev; + + // the index is the instruction index times two. 
+ // this gives us the ability to cover half an instruction with RA ranges + // covering only the first half of an instruction (0-0) means that the register is read, but not preserved + // covering first and the second half means the register is read and preserved + // covering only the second half means the register is written but not read + + sint32 GetInstructionIndex() const + { + return index; + } + + void SetInstructionIndex(sint32 index) + { + this->index = index; + } + + void ShiftIfAfter(sint32 instructionIndex, sint32 shiftCount) + { + if (!IsPreviousSegment() && !IsNextSegment()) + { + if (GetInstructionIndex() >= instructionIndex) + index += shiftCount; + } + } + + void DecrementByOneInstruction() + { + index--; + } + + // the segment point can point beyond the first and last instruction which indicates that it is an infinite range reaching up to the previous or next segment + bool IsPreviousSegment() const { return index == RA_INTER_RANGE_START; } + bool IsNextSegment() const { return index == RA_INTER_RANGE_END; } + + // overload operand > and < + bool operator>(const IMLSegmentPoint& other) const { return index > other.index; } + bool operator<(const IMLSegmentPoint& other) const { return index < other.index; } + bool operator==(const IMLSegmentPoint& other) const { return index == other.index; } + bool operator!=(const IMLSegmentPoint& other) const { return index != other.index; } + + // overload comparison operands for sint32 + bool operator>(const sint32 other) const { return index > other; } + bool operator<(const sint32 other) const { return index < other; } + bool operator<=(const sint32 other) const { return index <= other; } + bool operator>=(const sint32 other) const { return index >= other; } +}; + +struct IMLSegmentInterval +{ + IMLSegmentPoint start; + IMLSegmentPoint end; + + bool ContainsInstructionIndex(sint32 offset) const { return start <= offset && end > offset; } + + bool IsRangeOverlapping(const IMLSegmentInterval& other) + { + // 
todo - compare the raw index + sint32 r1start = this->start.GetInstructionIndex(); + sint32 r1end = this->end.GetInstructionIndex(); + sint32 r2start = other.start.GetInstructionIndex(); + sint32 r2end = other.end.GetInstructionIndex(); + if (r1start < r2end && r1end > r2start) + return true; + if (this->start.IsPreviousSegment() && r1start == r2start) + return true; + if (this->end.IsNextSegment() && r1end == r2end) + return true; + return false; + } + + bool ExtendsIntoPreviousSegment() const + { + return start.IsPreviousSegment(); + } + + bool ExtendsIntoNextSegment() const + { + return end.IsNextSegment(); + } + + bool IsNextSegmentOnly() const + { + if(!start.IsNextSegment()) + return false; + cemu_assert_debug(end.IsNextSegment()); + return true; + } + + bool IsPreviousSegmentOnly() const + { + if (!end.IsPreviousSegment()) + return false; + cemu_assert_debug(start.IsPreviousSegment()); + return true; + } + + sint32 GetDistance() const + { + // todo - assert if either start or end is outside the segment + // we may also want to switch this to raw indices? 
+ return end.GetInstructionIndex() - start.GetInstructionIndex(); + } }; struct PPCSegmentRegisterAllocatorInfo_t diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp index 846426f5..db48b9c0 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp @@ -18,6 +18,8 @@ #include "BackendX64/BackendX64.h" #include "util/highresolutiontimer/HighResolutionTimer.h" +#define PPCREC_FORCE_SYNCHRONOUS_COMPILATION 0 // if 1, then function recompilation will block and execute on the thread that called PPCRecompiler_visitAddressNoBlock + struct PPCInvalidationRange { MPTR startAddress; @@ -41,11 +43,36 @@ void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)(); PPCRecompilerInstanceData_t* ppcRecompilerInstanceData; +#if PPCREC_FORCE_SYNCHRONOUS_COMPILATION +static std::mutex s_singleRecompilationMutex; +#endif + bool ppcRecompilerEnabled = false; +void PPCRecompiler_recompileAtAddress(uint32 address); + // this function does never block and can fail if the recompiler lock cannot be acquired immediately void PPCRecompiler_visitAddressNoBlock(uint32 enterAddress) { +#if PPCREC_FORCE_SYNCHRONOUS_COMPILATION + if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] != PPCRecompiler_leaveRecompilerCode_unvisited) + return; + PPCRecompilerState.recompilerSpinlock.lock(); + if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] != PPCRecompiler_leaveRecompilerCode_unvisited) + { + PPCRecompilerState.recompilerSpinlock.unlock(); + return; + } + ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] = PPCRecompiler_leaveRecompilerCode_visited; + PPCRecompilerState.recompilerSpinlock.unlock(); + s_singleRecompilationMutex.lock(); + if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] == PPCRecompiler_leaveRecompilerCode_visited) + { + 
PPCRecompiler_recompileAtAddress(enterAddress); + } + s_singleRecompilationMutex.unlock(); + return; +#endif // quick read-only check without lock if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] != PPCRecompiler_leaveRecompilerCode_unvisited) return; @@ -154,6 +181,9 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP } } + // if(range.startAddress < 0x0202fa3C || range.startAddress > 0x0202FA7C) + // return nullptr; // DEBUG + PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t(); ppcRecFunc->ppcAddress = range.startAddress; ppcRecFunc->ppcSize = range.length; @@ -182,6 +212,85 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP return nullptr; } } + // DEBUG BEGIN + // if(ppcRecFunc->ppcAddress != 0x2BDA9F4) // TP + // { + // delete ppcRecFunc; + // return nullptr; + // } + // if(ppcRecFunc->ppcAddress < 0x2BDA9F4) // TP + // { + // delete ppcRecFunc; + // return nullptr; + // } + + // this prevents the crashing + // if((ppcRecFunc->ppcAddress >= 0x02ade400 && ppcRecFunc->ppcAddress < 0x02ade600)) -> no crash + //if((ppcRecFunc->ppcAddress >= 0x02ade500 && ppcRecFunc->ppcAddress < 0x02ade600)) -> no crash + // if((ppcRecFunc->ppcAddress >= 0x02ade580 && ppcRecFunc->ppcAddress < 0x02ade600)) // -> crashed around 0x0x2b874b0 (but rare? Out of 5 runs it only crashed once) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // the problem with Shovel Knight is that the crash seems to be pretty instable, at least when trying to narrow it down. Lets look for another game for now + + // check TP bug... 
+ // if(ppcRecFunc->ppcAddress >= 0x03000000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02800000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02C00000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02A00000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B00000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B80000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B40000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B60000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B70000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B68000) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B64000) -> no bug (I went into wrong direction) + // if(ppcRecFunc->ppcAddress >= 0x02B6C000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B6A000) -> has bug (double checked, it has bug) + // if(ppcRecFunc->ppcAddress >= 0x02B6B000) -> has bug (I went into the wrong direction again? Or does A000 have no bug?? + // if(ppcRecFunc->ppcAddress >= 0x02B69000) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B68800) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B68400) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B68600) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B68500) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B68580) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B685C0) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B685A0) -> has bug + // if(ppcRecFunc->ppcAddress >= 0x02B68590) -> no bug + // if(ppcRecFunc->ppcAddress >= 0x02B68598) -> has bug + + // if(ppcRecFunc->ppcAddress != 0x02B68594) -> seems fine. No bug (against the expectation) + // if(ppcRecFunc->ppcAddress == 0x02B68594) -> Still has the bug + + // if(ppcRecFunc->ppcAddress == 0x02B68594) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // if(ppcRecFunc->ppcAddress >= 0x2B7A8D4 && ppcRecFunc->ppcAddress < 0x02B7AC9C && ppcRecFunc->ppcAddress != 0x2B7A8D4) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // doing both of these means no bug! 
+ // excluding just ppcAddress == 0x2B7A8D4 is enough to trigger the bug again. So it definitely that function + // next: Debug it! + + // In Pikmin 3 030a9998 is broken? + // if(!(ppcRecFunc->ppcAddress >= 0x030a9998 && ppcRecFunc->ppcAddress < 0x030AA208)) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // else + // { + // delete ppcRecFunc; + // return nullptr; + // } + + // DEBUG END + // apply passes if (!PPCRecompiler_ApplyIMLPasses(ppcImlGenContext)) @@ -190,13 +299,58 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP return nullptr; } - //if (ppcRecFunc->ppcAddress == 0x30DF5F8) - //{ - // debug_printf("----------------------------------------\n"); - // IMLDebug_Dump(&ppcImlGenContext); - // __debugbreak(); - //} + // TP + // if (ppcRecFunc->ppcAddress == 0x2B7A8D4) + // { + // debug_printf("----------------------------------------\n"); + // IMLDebug_Dump(&ppcImlGenContext); + // //__debugbreak(); + // } + // // Bad Function in SM3DW + // if (ppcRecFunc->ppcAddress == 0x023D5768) + // { + // debug_printf("----------------------------------------\n"); + // IMLDebug_Dump(&ppcImlGenContext); + // } + // if (ppcRecFunc->ppcAddress >= 0x023D5768 && ppcRecFunc->ppcAddress < 0x023D58DC) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // + // + // // 0x02846c74 + // if (ppcRecFunc->ppcAddress == 0x02846c74) + // { + // debug_printf("----------------------------------------\n"); + // IMLDebug_Dump(&ppcImlGenContext); + // __debugbreak(); + // } + + // Shovel Knight + // if (ppcRecFunc->ppcAddress >= 0x02A1E630 && ppcRecFunc->ppcAddress < 0x02A1E9D8) + // { + // // debug_printf("----------------------------------------\n"); + // // IMLDebug_Dump(&ppcImlGenContext); + // // __debugbreak(); + // delete ppcRecFunc; + // return nullptr; + // } + // + // // + // if (ppcRecFunc->ppcAddress == 0x02ade5c4 || ppcRecFunc->ppcAddress == 0x02ade5c8) + // { + // // 
debug_printf("----------------------------------------\n"); + // IMLDebug_Dump(&ppcImlGenContext); + // __debugbreak(); + // } + + // else + // { + // delete ppcRecFunc; + // return nullptr; + // } //if (ppcRecFunc->ppcAddress == 0x11223344) //{ @@ -210,14 +364,26 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP // return nullptr; //} - //if (ppcRecFunc->ppcAddress == 0x03C26844) - //{ - // __debugbreak(); - // IMLDebug_Dump(&ppcImlGenContext); - // __debugbreak(); - //} + // if (ppcRecFunc->ppcAddress >= 0x2BDA9F4 && ppcRecFunc->ppcAddress < 0x02BDAB38) + // { + // return nullptr; + // //IMLDebug_Dump(&ppcImlGenContext); + // //__debugbreak(); + // } + + // if (ppcRecFunc->ppcAddress == 0x2BDA9F4) + // { + // IMLDebug_Dump(&ppcImlGenContext); + // __debugbreak(); + // } // 31A8778 + // if(ppcRecFunc->ppcAddress >= 0x2759E20 && ppcRecFunc->ppcAddress < 0x0275A0CC) + // { + // delete ppcRecFunc; + // return nullptr; + // } + // Functions for testing (botw): // 3B4049C (large with switch case) // 30BF118 (has a bndz copy loop + some float instructions at the end) @@ -231,6 +397,14 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP return nullptr; } + if (ppcRecFunc->ppcAddress == 0x2B7A8D4) + { + // write code to binary file + FILE* f = fopen("ppcRecFunc_2B7A8D4.bin", "wb"); + fwrite(ppcRecFunc->x86Code, 1, ppcRecFunc->x86Size, f); + fclose(f); + } + // collect list of PPC-->x64 entry points entryPointsOut.clear(); for(IMLSegment* imlSegment : ppcImlGenContext.segmentList2) @@ -255,7 +429,7 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP codeHash += ((uint8*)ppcRecFunc->x86Code)[i]; } - //cemuLog_log(LogType::Force, "[Recompiler] PPC 0x{:08x} -> x64: 0x{:x} Took {:.4}ms | Size {:04x} CodeHash {:08x}", (uint32)ppcRecFunc->ppcAddress, (uint64)(uintptr_t)ppcRecFunc->x86Code, bt.GetElapsedMilliseconds(), ppcRecFunc->x86Size, codeHash); + cemuLog_log(LogType::Force, 
"[Recompiler] PPC 0x{:08x} -> x64: 0x{:x} Took {:.4}ms | Size {:04x} CodeHash {:08x}", (uint32)ppcRecFunc->ppcAddress, (uint64)(uintptr_t)ppcRecFunc->x86Code, bt.GetElapsedMilliseconds(), ppcRecFunc->x86Size, codeHash); return ppcRecFunc; } @@ -323,11 +497,14 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext) //PPCRecompiler_reorderConditionModifyInstructions(&ppcImlGenContext); //PPCRecompiler_removeRedundantCRUpdates(&ppcImlGenContext); -// if(ppcImlGenContext.debug_entryPPCAddress == 0x0200E1E8) -// { -// IMLDebug_Dump(&ppcImlGenContext); -// __debugbreak(); -// } + + // if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8 && ppcImlGenContext.debug_entryPPCAddress < 0x0240C0AC) + // { + // IMLDebug_Dump(&ppcImlGenContext); + // __debugbreak(); + // } + // else if(ppcImlGenContext.debug_entryPPCAddress >= 0x0240B7F8) + // return false; return true; } @@ -438,6 +615,10 @@ std::atomic_bool s_recompilerThreadStopSignal{false}; void PPCRecompiler_thread() { SetThreadName("PPCRecompiler"); +#if PPCREC_FORCE_SYNCHRONOUS_COMPILATION + return; +#endif + while (true) { if(s_recompilerThreadStopSignal) @@ -765,4 +946,4 @@ void PPCRecompiler_Shutdown() // mark as unmapped ppcRecompiler_reservedBlockMask[i] = false; } -} \ No newline at end of file +} diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp index b89b7f7c..b637b594 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGen.cpp @@ -1746,7 +1746,7 @@ uint32 PPCRecompiler_getPreviousInstruction(ppcImlGenContext_t* ppcImlGenContext void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index) { segmentPoint->imlSegment = imlSegment; - segmentPoint->index = index; + segmentPoint->SetInstructionIndex(index); if (imlSegment->segmentPointList) imlSegment->segmentPointList->prev = segmentPoint; 
segmentPoint->prev = nullptr; @@ -1766,7 +1766,7 @@ void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint) /* * Insert multiple no-op instructions -* Warning: Can invalidate any previous instruction structs from the same segment +* Warning: Can invalidate any previous instruction pointers from the same segment */ void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, sint32 shiftBackCount) { @@ -1788,12 +1788,7 @@ void PPCRecompiler_pushBackIMLInstructions(IMLSegment* imlSegment, sint32 index, IMLSegmentPoint* segmentPoint = imlSegment->segmentPointList; while (segmentPoint) { - if (segmentPoint->index != RA_INTER_RANGE_START && segmentPoint->index != RA_INTER_RANGE_END) - { - if (segmentPoint->index >= index) - segmentPoint->index += shiftBackCount; - } - // next + segmentPoint->ShiftIfAfter(index, shiftBackCount); segmentPoint = segmentPoint->next; } } @@ -2864,6 +2859,76 @@ bool PPCIMLGen_FillBasicBlock(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBloc { uint32 addressOfCurrentInstruction = (uint32)((uint8*)ppcImlGenContext.currentInstruction - memory_base); ppcImlGenContext.ppcAddressOfCurrentInstruction = addressOfCurrentInstruction; + + // DEBUG BEGIN + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7A8D4+0x10) -> stops bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7A9C0) -> has bug (optional code path) + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AA50) -> stops bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC34) -> stops bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC78) -> has bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC70) -> has bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC88) -> has bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AC3C) -> has bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AC38) -> no bug + // weirdly, 
excluding 0x02B7AC38 fixes the issue. Excluding both 0x02B7AC3C and 0x2B7AC88 (the follow up instructions) does not fix the bug + + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7ABE4) -> has bug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AAD0) -> fixes bug + + // maybe try to place as many leave instructions as possible while keeping the bug alive + // eventually we should end up with a relatively small IR footprint that is easier to analyze + + // 0x023d5818 + // SM3DW debug + // if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x23D58A8) + // { + // ppcImlGenContext.emitInst().make_macro(PPCREC_IML_MACRO_DEBUGBREAK, ppcImlGenContext.ppcAddressOfCurrentInstruction, 0, 0, IMLREG_INVALID); + // } + +#if 0 // TP + if(ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC78 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AC70 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7A9C0 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AC3C || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AADC || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7ABE4 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7ABC0 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7ABA8 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AB90 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AB04 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7abc4 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7A9B0 || // verified + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7aa10 -> fixes bug (this is after a bl) + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AA3C || // verified + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7AA44 -> fixes bug (this is on the main path, the one 
before, 0x02B7AA3C, does not break) + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AADC || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7ABC4 || // verified + ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7ac88 || // verified + // ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7aad0 || -> fixes it + // ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7aa30 || -> fixes it (mostly. There was a small glitch on eponas tail?) + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02b7aa24 || -> this fixes it + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7A918 || -> this fixes it + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7A9A0 || -> this fixes it + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x02B7AC38 || -> this fixes it + //ppcImlGenContext.ppcAddressOfCurrentInstruction == 0x2B7A8D4 || -> this fixes it + (ppcImlGenContext.ppcAddressOfCurrentInstruction >= 0x2B7AC44 && ppcImlGenContext.ppcAddressOfCurrentInstruction <= 0x2B7AC84) || // verified + (ppcImlGenContext.ppcAddressOfCurrentInstruction >= 0x02B7AADC && ppcImlGenContext.ppcAddressOfCurrentInstruction <= 0x2B7ABC0) || // verified + (ppcImlGenContext.ppcAddressOfCurrentInstruction >= 0x2B7A9B0 && ppcImlGenContext.ppcAddressOfCurrentInstruction <= 0x02B7AA0C) || + (ppcImlGenContext.ppcAddressOfCurrentInstruction >= 0x02B7AAE4 && ppcImlGenContext.ppcAddressOfCurrentInstruction <= 0x02b7ac20) // verified + + // disabling IMLOptimizerX86_SubstituteCJumpForEflagsJump fixes it... 
+ + //(ppcImlGenContext.ppcAddressOfCurrentInstruction >= 0x2B7AA1C && ppcImlGenContext.ppcAddressOfCurrentInstruction <= 0x02B7AA40) -> fixes it + ) + { + ppcImlGenContext.emitInst().make_macro(PPCREC_IML_MACRO_LEAVE, ppcImlGenContext.ppcAddressOfCurrentInstruction, 0, 0, IMLREG_INVALID); + // this doesnt work any longer because the basic blocks are determined before the recompiler is called + basicBlockInfo.GetSegmentForInstructionAppend()->SetLinkBranchTaken(nullptr); + basicBlockInfo.GetSegmentForInstructionAppend()->SetLinkBranchNotTaken(nullptr); + break; // but we should be able to just exit the block early? + } +#endif + if (PPCRecompiler_decodePPCInstruction(&ppcImlGenContext)) { debug_printf("Recompiler encountered unsupported instruction at 0x%08x\n", addressOfCurrentInstruction);