mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-09 19:27:32 -03:00
Latte: Optimizations and tweaks (#706)
This commit is contained in:
parent
323bdfa183
commit
90c56b7731
10 changed files with 822 additions and 482 deletions
File diff suppressed because it is too large
Load diff
|
@ -26,6 +26,7 @@ struct OverlayStats
|
|||
|
||||
double fps{};
|
||||
uint32 draw_calls_per_frame{};
|
||||
uint32 fast_draw_calls_per_frame{};
|
||||
float cpu_usage{}; // cemu cpu usage in %
|
||||
std::vector<float> cpu_per_core; // global cpu usage in % per core
|
||||
uint32 ram_usage{}; // ram usage in MB
|
||||
|
@ -86,7 +87,7 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
|
|||
ImGui::Text("FPS: %.2lf", g_state.fps);
|
||||
|
||||
if (config.overlay.drawcalls)
|
||||
ImGui::Text("Draws/f: %d", g_state.draw_calls_per_frame);
|
||||
ImGui::Text("Draws/f: %d (fast: %d)", g_state.draw_calls_per_frame, g_state.fast_draw_calls_per_frame);
|
||||
|
||||
if (config.overlay.cpu_usage)
|
||||
ImGui::Text("CPU: %.2lf%%", g_state.cpu_usage);
|
||||
|
@ -588,13 +589,14 @@ static void UpdateStats_CpuPerCore()
|
|||
}
|
||||
}
|
||||
|
||||
void LatteOverlay_updateStats(double fps, sint32 drawcalls)
|
||||
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls)
|
||||
{
|
||||
if (GetConfig().overlay.position == ScreenPosition::kDisabled)
|
||||
return;
|
||||
|
||||
g_state.fps = fps;
|
||||
g_state.draw_calls_per_frame = drawcalls;
|
||||
g_state.fast_draw_calls_per_frame = fastDrawcalls;
|
||||
UpdateStats_CemuCpu();
|
||||
UpdateStats_CpuPerCore();
|
||||
|
||||
|
|
|
@ -2,6 +2,6 @@
|
|||
|
||||
void LatteOverlay_init();
|
||||
void LatteOverlay_render(bool pad_view);
|
||||
void LatteOverlay_updateStats(double fps, sint32 drawcalls);
|
||||
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls);
|
||||
|
||||
void LatteOverlay_pushNotification(const std::string& text, sint32 duration);
|
|
@ -38,6 +38,7 @@ void LattePerformanceMonitor_frameEnd()
|
|||
uint64 indexDataCached = 0;
|
||||
uint32 frameCounter = 0;
|
||||
uint32 drawCallCounter = 0;
|
||||
uint32 fastDrawCallCounter = 0;
|
||||
uint32 shaderBindCounter = 0;
|
||||
uint32 recompilerLeaveCount = 0;
|
||||
uint32 threadLeaveCount = 0;
|
||||
|
@ -53,6 +54,7 @@ void LattePerformanceMonitor_frameEnd()
|
|||
indexDataCached += performanceMonitor.cycle[i].indexDataCached;
|
||||
frameCounter += performanceMonitor.cycle[i].frameCounter;
|
||||
drawCallCounter += performanceMonitor.cycle[i].drawCallCounter;
|
||||
fastDrawCallCounter += performanceMonitor.cycle[i].fastDrawCallCounter;
|
||||
shaderBindCounter += performanceMonitor.cycle[i].shaderBindCount;
|
||||
recompilerLeaveCount += performanceMonitor.cycle[i].recompilerLeaveCount;
|
||||
threadLeaveCount += performanceMonitor.cycle[i].threadLeaveCount;
|
||||
|
@ -75,7 +77,6 @@ void LattePerformanceMonitor_frameEnd()
|
|||
indexDataUploadPerFrame /= 1024ULL;
|
||||
|
||||
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
|
||||
uint32 drawCallsPerFrame = drawCallCounter / elapsedFrames;
|
||||
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
|
||||
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
|
||||
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
|
||||
|
@ -85,6 +86,7 @@ void LattePerformanceMonitor_frameEnd()
|
|||
// next counter cycle
|
||||
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
|
||||
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
|
||||
performanceMonitor.cycle[nextCycleIndex].fastDrawCallCounter = 0;
|
||||
performanceMonitor.cycle[nextCycleIndex].frameCounter = 0;
|
||||
performanceMonitor.cycle[nextCycleIndex].shaderBindCount = 0;
|
||||
performanceMonitor.cycle[nextCycleIndex].lastCycleCount = PPCInterpreter_getMainCoreCycleCounter();
|
||||
|
@ -104,12 +106,12 @@ void LattePerformanceMonitor_frameEnd()
|
|||
|
||||
if (isFirstUpdate)
|
||||
{
|
||||
LatteOverlay_updateStats(0.0, 0);
|
||||
LatteOverlay_updateStats(0.0, 0, 0);
|
||||
gui_updateWindowTitles(false, false, 0.0);
|
||||
}
|
||||
else
|
||||
{
|
||||
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames);
|
||||
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames, fastDrawCallCounter / elapsedFrames);
|
||||
gui_updateWindowTitles(false, false, fps);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,6 +84,7 @@ typedef struct
|
|||
uint32 lastUpdate;
|
||||
uint32 frameCounter;
|
||||
uint32 drawCallCounter;
|
||||
uint32 fastDrawCallCounter;
|
||||
uint32 shaderBindCount;
|
||||
uint64 vertexDataUploaded; // amount of vertex data uploaded to GPU (bytes)
|
||||
uint64 vertexDataCached; // amount of vertex data reused from GPU cache (bytes)
|
||||
|
|
|
@ -295,6 +295,34 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN
|
|||
uint32 colorBufferHeight = pitchHeight / colorBufferPitch;
|
||||
uint32 colorBufferWidth = colorBufferPitch;
|
||||
|
||||
// colorbuffer width/height has to be padded to 8/32 alignment but the actual resolution might be smaller
|
||||
// use the scissor box as a clue to figure out the original resolution if possible
|
||||
#if 0
|
||||
uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X();
|
||||
uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y();
|
||||
if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth)
|
||||
colorBufferWidth = scissorBoxWidth;
|
||||
if (((colorBufferHeight + 31) & ~31) == colorBufferHeight)
|
||||
colorBufferHeight = scissorBoxHeight;
|
||||
#endif
|
||||
|
||||
// log resolution changes if the above heuristic takes effect
|
||||
// this is useful to find resolutions which need to be updated in gfx pack texture rules
|
||||
#if 0
|
||||
uint32 colorBufferHeight2 = pitchHeight / colorBufferPitch;
|
||||
static std::unordered_set<uint64> s_foundColorBufferResMappings;
|
||||
if (colorBufferPitch != colorBufferWidth || colorBufferHeight != colorBufferHeight2)
|
||||
{
|
||||
// only log unique, source and dest resolution. Encode into a key with 16 bits per component
|
||||
uint64 resHash = (uint64)colorBufferWidth | ((uint64)colorBufferHeight << 16) | ((uint64)colorBufferPitch << 32) | ((uint64)colorBufferHeight2 << 48);
|
||||
if( !s_foundColorBufferResMappings.contains(resHash) )
|
||||
{
|
||||
s_foundColorBufferResMappings.insert(resHash);
|
||||
cemuLog_log(LogType::Force, "[COLORBUFFER-DBG] Using res {}x{} instead of {}x{}", colorBufferWidth, colorBufferHeight, colorBufferPitch, colorBufferHeight2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
bool colorBufferWasFound = false;
|
||||
sint32 viewFirstMip = 0; // todo
|
||||
|
||||
|
|
|
@ -8,10 +8,11 @@
|
|||
#include "Cafe/HW/Latte/Core/LatteTexture.h"
|
||||
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"
|
||||
|
||||
// #define LOG_READBACK_TIME
|
||||
//#define LOG_READBACK_TIME
|
||||
|
||||
// One pending texture readback request, stored in sTextureScheduledReadbacks until its transfer is started.
struct LatteTextureReadbackQueueEntry
|
||||
{
|
||||
HRTick initiateTime; // HighResolutionTimer tick taken when the entry was queued; only read by the LOG_READBACK_TIME logging
|
||||
uint32 lastUpdateDrawcallIndex; // LatteGPUState.drawCallCounter at queue time; used to count drawcalls elapsed since then
|
||||
LatteTextureView* textureView; // view handed to LatteTextureReadback_StartTransfer when the transfer begins
|
||||
};
|
||||
|
@ -22,12 +23,12 @@ std::queue<LatteTextureReadbackInfo*> sTextureActiveReadbackQueue; // readbacks
|
|||
void LatteTextureReadback_StartTransfer(LatteTextureView* textureView)
|
||||
{
|
||||
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Start] PhysAddr {:08x} Res {}x{} Fmt {} Slice {} Mip {}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->format, textureView->firstSlice, textureView->firstMip);
|
||||
HRTick currentTick = HighResolutionTimer().now().getTick();
|
||||
// create info entry and store in ordered linked list
|
||||
LatteTextureReadbackInfo* readbackInfo = g_renderer->texture_createReadback(textureView);
|
||||
sTextureActiveReadbackQueue.push(readbackInfo);
|
||||
readbackInfo->StartTransfer();
|
||||
//debug_printf("[Tex-Readback] %08x %dx%d TM %d FMT %04x\n", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->tileMode, textureView->baseTexture->format);
|
||||
readbackInfo->transferStartTime = HighResolutionTimer().now().getTick();
|
||||
readbackInfo->transferStartTime = currentTick;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -41,9 +42,15 @@ bool LatteTextureReadback_Update(bool forceStart)
|
|||
for (size_t i = 0; i < sTextureScheduledReadbacks.size(); i++)
|
||||
{
|
||||
LatteTextureReadbackQueueEntry& entry = sTextureScheduledReadbacks[i];
|
||||
uint32 numPassedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
|
||||
if (forceStart || numPassedDrawcalls >= 5)
|
||||
uint32 numElapsedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
|
||||
if (forceStart || numElapsedDrawcalls >= 5)
|
||||
{
|
||||
#ifdef LOG_READBACK_TIME
|
||||
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
|
||||
char initiateElapsedTimeStr[32];
|
||||
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
|
||||
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
|
||||
#endif
|
||||
LatteTextureReadback_StartTransfer(entry.textureView);
|
||||
// remove element
|
||||
vectorRemoveByIndex(sTextureScheduledReadbacks, i);
|
||||
|
@ -91,6 +98,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
|
|||
}
|
||||
// queue
|
||||
LatteTextureReadbackQueueEntry queueEntry;
|
||||
queueEntry.initiateTime = HighResolutionTimer().now().getTick();
|
||||
queueEntry.textureView = textureView;
|
||||
queueEntry.lastUpdateDrawcallIndex = LatteGPUState.drawCallCounter;
|
||||
sTextureScheduledReadbacks.emplace_back(queueEntry);
|
||||
|
@ -112,6 +120,14 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
|
|||
if (!readbackInfo->IsFinished())
|
||||
{
|
||||
readbackInfo->waitStartTime = HighResolutionTimer().now().getTick();
|
||||
#ifdef LOG_READBACK_TIME
|
||||
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
|
||||
{
|
||||
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
|
||||
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
|
||||
}
|
||||
#endif
|
||||
readbackInfo->forceFinish = true;
|
||||
readbackInfo->ForceFinish();
|
||||
// rerun logic since ->ForceFinish() can recursively call this function and thus modify the queue
|
||||
continue;
|
||||
|
@ -125,10 +141,13 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
|
|||
}
|
||||
// performance testing
|
||||
#ifdef LOG_READBACK_TIME
|
||||
HRTick currentTick = HighResolutionTimer().now().getTick();
|
||||
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
|
||||
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
|
||||
cemuLog_log(LogType::Force, "[Texture-Readback] {:08x} Res {:4}/{:4} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, forceFinish?"yes":"no");
|
||||
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
|
||||
{
|
||||
HRTick currentTick = HighResolutionTimer().now().getTick();
|
||||
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
|
||||
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
|
||||
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
|
||||
}
|
||||
#endif
|
||||
uint8* pixelData = readbackInfo->GetData();
|
||||
LatteTextureLoader_writeReadbackTextureToMemory(&readbackInfo->hostTextureCopy, 0, 0, pixelData);
|
||||
|
|
|
@ -21,6 +21,7 @@ public:
|
|||
|
||||
HRTick transferStartTime;
|
||||
HRTick waitStartTime;
|
||||
bool forceFinish{ false }; // set to true if not finished in time for dependent operation
|
||||
// texture info
|
||||
LatteTextureDefinition hostTextureCopy{};
|
||||
|
||||
|
|
|
@ -484,7 +484,7 @@ namespace Latte
|
|||
SQ_TEX_RESOURCE_WORD0_N_GS = 0xE930,
|
||||
SQ_TEX_RESOURCE_WORD_FIRST = SQ_TEX_RESOURCE_WORD0_N_PS,
|
||||
SQ_TEX_RESOURCE_WORD_LAST = (SQ_TEX_RESOURCE_WORD0_N_GS + GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7 - 1),
|
||||
// there are 54 samplers with 3 registers each. 18 per stage. For stage indices see SAMPLER_BASE_INDEX_*
|
||||
// there are 54 samplers with 3 registers each. 18 (actually only 16?) per stage. For stage indices see SAMPLER_BASE_INDEX_*
|
||||
SQ_TEX_SAMPLER_WORD0_0 = 0xF000,
|
||||
SQ_TEX_SAMPLER_WORD1_0 = 0xF001,
|
||||
SQ_TEX_SAMPLER_WORD2_0 = 0xF002,
|
||||
|
|
|
@ -2002,7 +2002,7 @@ void VulkanRenderer::SubmitCommandBuffer(VkSemaphore signalSemaphore, VkSemaphor
|
|||
occlusionQuery_notifyBeginCommandBuffer();
|
||||
|
||||
m_recordedDrawcalls = 0;
|
||||
m_submitThreshold = 500; // this used to be 750 before 1.25.5, but more frequent submission is actually better for latency
|
||||
m_submitThreshold = 300;
|
||||
m_submitOnIdle = false;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue