mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-25 02:33:06 -03:00
Vulkan: Add profiler for Vulkan API CPU cost
Disabled by default. Set VULKAN_API_CPU_BENCHMARK to 1 to enable
This commit is contained in:
parent
3e925b7707
commit
f9f6206929
5 changed files with 99 additions and 13 deletions
|
@ -8,7 +8,7 @@
|
|||
#include "Cafe/HW/Latte/Core/LatteTexture.h"
|
||||
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"
|
||||
|
||||
//#define LOG_READBACK_TIME
|
||||
#define LOG_READBACK_TIME
|
||||
|
||||
struct LatteTextureReadbackQueueEntry
|
||||
{
|
||||
|
@ -47,9 +47,7 @@ bool LatteTextureReadback_Update(bool forceStart)
|
|||
{
|
||||
#ifdef LOG_READBACK_TIME
|
||||
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
|
||||
char initiateElapsedTimeStr[32];
|
||||
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
|
||||
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
|
||||
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {:.4} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, elapsedSecondsSinceInitiate, forceStart?"yes":"no");
|
||||
#endif
|
||||
LatteTextureReadback_StartTransfer(entry.textureView);
|
||||
// remove element
|
||||
|
@ -83,7 +81,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
|
|||
// currently we don't support readback for resized textures
|
||||
if (textureView->baseTexture->overwriteInfo.hasResolutionOverwrite)
|
||||
{
|
||||
cemuLog_log(LogType::Force, "_initate(): Readback is not supported for textures with modified resolution");
|
||||
cemuLog_log(LogType::Force, "Texture readback is not supported for textures with modified resolution. Texture: {:08x} {}x{}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height);
|
||||
return;
|
||||
}
|
||||
// check if texture isn't already queued for transfer
|
||||
|
@ -124,7 +122,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
|
|||
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
|
||||
{
|
||||
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
|
||||
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
|
||||
cemuLog_log(LogType::TextureReadback, "[Texture-Readback] Force-finish: {:08x} Res {:}/{:} TM {:} FMT {:04x} Transfer time so far: {:.4}ms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
|
||||
}
|
||||
#endif
|
||||
readbackInfo->forceFinish = true;
|
||||
|
@ -146,7 +144,7 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
|
|||
HRTick currentTick = HighResolutionTimer().now().getTick();
|
||||
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
|
||||
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
|
||||
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
|
||||
cemuLog_log(LogType::TextureReadback, "[Texture-Readback] {:08x} Res {}/{} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
|
||||
}
|
||||
#endif
|
||||
uint8* pixelData = readbackInfo->GetData();
|
||||
|
|
|
@ -1,13 +1,81 @@
|
|||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
#define VKFUNC_DEFINE
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
#include <numeric> // for std::iota
|
||||
|
||||
#if BOOST_OS_LINUX || BOOST_OS_MACOS
|
||||
#include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
#define VULKAN_API_CPU_BENCHMARK 0 // if 1, Cemu will log the CPU time spent per Vulkan API function
|
||||
|
||||
bool g_vulkan_available = false;
|
||||
|
||||
#if VULKAN_API_CPU_BENCHMARK != 0
|
||||
uint64 s_vulkanBenchmarkLastResultsTime = 0;
|
||||
|
||||
struct VulkanBenchmarkFuncInfo
|
||||
{
|
||||
std::string funcName;
|
||||
uint64 cycles;
|
||||
uint32 numCalls;
|
||||
};
|
||||
|
||||
std::vector<VulkanBenchmarkFuncInfo*> s_vulkanBenchmarkFuncs;
|
||||
|
||||
template<typename TRet, typename... Args>
|
||||
auto VkWrapperFuncGenTest(TRet (*func)(Args...), const char* name)
|
||||
{
|
||||
static VulkanBenchmarkFuncInfo _FuncInfo;
|
||||
static auto _FuncPtrCopy = func;
|
||||
TRet (*newFunc)(Args...);
|
||||
if constexpr(std::is_void_v<TRet>)
|
||||
{
|
||||
newFunc = +[](Args... args) { uint64 t = __rdtsc(); _mm_mfence(); _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; };
|
||||
}
|
||||
else
|
||||
newFunc = +[](Args... args) -> TRet { uint64 t = __rdtsc(); _mm_mfence(); TRet r = _FuncPtrCopy(args...); _mm_mfence(); _FuncInfo.cycles += (__rdtsc() - t); _FuncInfo.numCalls++; return r; };
|
||||
if(func && func != newFunc)
|
||||
_FuncPtrCopy = func;
|
||||
if(_FuncInfo.funcName.empty())
|
||||
{
|
||||
_FuncInfo = {.funcName = name, .cycles = 0, .numCalls = 0};
|
||||
s_vulkanBenchmarkFuncs.emplace_back(&_FuncInfo);
|
||||
}
|
||||
return newFunc;
|
||||
};
|
||||
#endif
|
||||
|
||||
// called when a TV SwapBuffers is called
|
||||
void VulkanBenchmarkPrintResults()
|
||||
{
|
||||
#if VULKAN_API_CPU_BENCHMARK != 0
|
||||
// note: This could be done by hooking vk present functions
|
||||
uint64 currentCycle = __rdtsc();
|
||||
uint64 elapsedCycles = currentCycle - s_vulkanBenchmarkLastResultsTime;
|
||||
s_vulkanBenchmarkLastResultsTime = currentCycle;
|
||||
double elapsedCyclesDbl = (double)elapsedCycles;
|
||||
cemuLog_log(LogType::Force, "--- Vulkan API CPU benchmark ---");
|
||||
cemuLog_log(LogType::Force, "Elapsed cycles this frame: {:} | Current cycle {:} | NumFunc {:}", elapsedCycles, currentCycle, s_vulkanBenchmarkFuncs.size());
|
||||
|
||||
std::vector<sint32> sortedIndices(s_vulkanBenchmarkFuncs.size());
|
||||
std::iota(sortedIndices.begin(), sortedIndices.end(), 0);
|
||||
std::sort(sortedIndices.begin(), sortedIndices.end(),
|
||||
[](int32_t a, int32_t b) {
|
||||
return s_vulkanBenchmarkFuncs[a]->cycles > s_vulkanBenchmarkFuncs[b]->cycles;
|
||||
});
|
||||
for (sint32 idx : sortedIndices)
|
||||
{
|
||||
auto& func = s_vulkanBenchmarkFuncs[idx];
|
||||
if(func->cycles == 0)
|
||||
return;
|
||||
cemuLog_log(LogType::Force, "{}: {} cycles ({:.4}%) {} calls", func->funcName.c_str(), func->cycles, ((double)func->cycles / elapsedCyclesDbl) * 100.0, func->numCalls);
|
||||
func->cycles = 0;
|
||||
func->numCalls = 0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if BOOST_OS_WINDOWS
|
||||
|
||||
bool InitializeGlobalVulkan()
|
||||
|
@ -57,7 +125,12 @@ bool InitializeDeviceVulkan(VkDevice device)
|
|||
|
||||
#define VKFUNC_DEVICE_INIT
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
|
||||
|
||||
#if VULKAN_API_CPU_BENCHMARK != 0
|
||||
#define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func)
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -121,7 +194,12 @@ bool InitializeDeviceVulkan(VkDevice device)
|
|||
|
||||
#define VKFUNC_DEVICE_INIT
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
|
||||
|
||||
#if VULKAN_API_CPU_BENCHMARK != 0
|
||||
#define VKFUNC_DEFINE_CUSTOM(__func) __func = VkWrapperFuncGenTest(__func, #__func)
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -14,7 +14,11 @@ extern bool g_vulkan_available;
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef VKFUNC_DEFINE
|
||||
#ifdef VKFUNC_DEFINE_CUSTOM
|
||||
#define VKFUNC(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
|
||||
#define VKFUNC_INSTANCE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
|
||||
#define VKFUNC_DEVICE(__FUNC__) VKFUNC_DEFINE_CUSTOM(__FUNC__)
|
||||
#elif defined(VKFUNC_DEFINE)
|
||||
#define VKFUNC(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
|
||||
#define VKFUNC_INSTANCE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
|
||||
#define VKFUNC_DEVICE(__FUNC__) NOEXPORT PFN_##__FUNC__ __FUNC__ = nullptr
|
||||
|
@ -238,4 +242,5 @@ VKFUNC_DEVICE(vkDestroyDescriptorSetLayout);
|
|||
|
||||
#undef VKFUNC
|
||||
#undef VKFUNC_INSTANCE
|
||||
#undef VKFUNC_DEVICE
|
||||
#undef VKFUNC_DEVICE
|
||||
#undef VKFUNC_DEFINE_CUSTOM
|
||||
|
|
|
@ -2768,6 +2768,8 @@ void VulkanRenderer::NotifyLatteCommandProcessorIdle()
|
|||
SubmitCommandBuffer();
|
||||
}
|
||||
|
||||
void VulkanBenchmarkPrintResults();
|
||||
|
||||
void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC)
|
||||
{
|
||||
SubmitCommandBuffer();
|
||||
|
@ -2777,6 +2779,9 @@ void VulkanRenderer::SwapBuffers(bool swapTV, bool swapDRC)
|
|||
|
||||
if (swapDRC && IsSwapchainInfoValid(false))
|
||||
SwapBuffer(false);
|
||||
|
||||
if(swapTV)
|
||||
VulkanBenchmarkPrintResults();
|
||||
}
|
||||
|
||||
void VulkanRenderer::ClearColorbuffer(bool padView)
|
||||
|
|
|
@ -1574,7 +1574,7 @@ void VulkanRenderer::draw_updateVertexBuffersDirectAccess()
|
|||
uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1;
|
||||
uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF;
|
||||
|
||||
if (bufferAddress == MPTR_NULL)
|
||||
if (bufferAddress == MPTR_NULL) [[unlikely]]
|
||||
{
|
||||
bufferAddress = 0x10000000;
|
||||
}
|
||||
|
@ -1597,7 +1597,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader
|
|||
MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0];
|
||||
uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1;
|
||||
|
||||
if (physicalAddr == MPTR_NULL)
|
||||
if (physicalAddr == MPTR_NULL) [[unlikely]]
|
||||
{
|
||||
cemu_assert_unimplemented();
|
||||
continue;
|
||||
|
|
Loading…
Add table
Reference in a new issue