Cemu/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
2025-03-04 08:34:35 +01:00

2323 lines
83 KiB
C++

#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h"
#include "Cafe/HW/Latte/Core/LatteShader.h"
#include "Cafe/HW/Latte/Core/LatteIndices.h"
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "CafeSystem.h"
#include "Cemu/Logging/CemuLogging.h"
#include "Cafe/HW/Latte/Core/FetchShader.h"
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "config/CemuConfig.h"
#include "gui/guiWrapper.h"
#define IMGUI_IMPL_METAL_CPP
#include "imgui/imgui_extension.h"
#include "imgui/imgui_impl_metal.h"
#define EVENT_VALUE_WRAP 4096
extern bool hasValidFramebufferAttached;
float supportBufferData[512 * 4];
// Defined in the OpenGL renderer
void LatteDraw_handleSpecialState8_clearAsDepth();
std::vector<MetalRenderer::DeviceInfo> MetalRenderer::GetDevices()
{
NS_STACK_SCOPED auto devices = MTL::CopyAllDevices();
std::vector<MetalRenderer::DeviceInfo> result;
result.reserve(devices->count());
for (uint32 i = 0; i < devices->count(); i++)
{
MTL::Device* device = static_cast<MTL::Device*>(devices->object(i));
result.emplace_back(std::string(device->name()->utf8String()), device->registryID());
}
return result;
}
MetalRenderer::MetalRenderer()
{
// Options
// Position invariance
switch (g_current_game_profile->GetPositionInvariance())
{
case PositionInvariance::Auto:
switch (CafeSystem::GetForegroundTitleId())
{
// Bayonetta
case 0x0005000010157F00: // EUR
case 0x0005000010157E00: // USA
case 0x000500001014DB00: // JPN
// Bayonetta 2
case 0x0005000010172700: // EUR
case 0x0005000010172600: // USA
// Disney Planes
case 0x0005000010136900: // EUR
case 0x0005000010136A00: // EUR (TODO: check)
case 0x0005000010136B00: // EUR (TODO: check)
case 0x000500001011C500: // USA (TODO: check)
// LEGO STAR WARS: The Force Awakens
case 0x00050000101DAA00: // EUR
case 0x00050000101DAB00: // USA
// Mario Kart 8
case 0x000500001010ED00: // EUR
case 0x000500001010EC00: // USA
case 0x000500001010EB00: // JPN
case 0x0005000010183A00: // JPN (TODO: check)
// Minecraft: Story Mode
case 0x000500001020A300: // EUR
case 0x00050000101E0100: // USA
//case 0x000500001020a200: // USA
// Ninja Gaiden 3: Razor's Edge
case 0x0005000010110B00: // EUR
case 0x0005000010139B00: // EUR (TODO: check)
case 0x0005000010110A00: // USA
case 0x0005000010110900: // JPN
// Resident Evil: Revelations
case 0x000500001012B400: // EUR
case 0x000500001012CF00: // USA
// Star Fox Zero
case 0x00050000101B0500: // EUR
case 0x0005000010201C00: // EUR (TODO: check)
case 0x00050000101B0400: // USA
case 0x0005000010201B00: // USA (TODO: check)
// The Legend of Zelda: Breath of the Wild
case 0x00050000101C9500: // EUR
case 0x00050000101C9400: // USA
case 0x00050000101C9300: // JPN
// Wonderful 101
case 0x0005000010135300: // EUR
case 0x000500001012DC00: // USA
case 0x0005000010116300: // JPN
case 0x0005000010185600: // JPN (TODO: check)
m_positionInvariance = true;
break;
default:
m_positionInvariance = false;
break;
}
break;
case PositionInvariance::False:
m_positionInvariance = false;
break;
case PositionInvariance::True:
m_positionInvariance = true;
break;
}
// Pick a device
auto& config = GetConfig();
const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0;
// If a device is set, try to find it
if (hasDeviceSet)
{
NS_STACK_SCOPED auto devices = MTL::CopyAllDevices();
for (uint32 i = 0; i < devices->count(); i++)
{
MTL::Device* device = static_cast<MTL::Device*>(devices->object(i));
if (device->registryID() == config.mtl_graphic_device_uuid)
{
m_device = device;
break;
}
}
}
if (!m_device)
{
if (hasDeviceSet)
{
cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. Using the system default device.", config.mtl_graphic_device_uuid);
config.mtl_graphic_device_uuid = 0;
}
// Use the system default device
m_device = MTL::CreateSystemDefaultDevice();
}
// Vendor
const char* deviceName = m_device->name()->utf8String();
if (memcmp(deviceName, "Apple", 5) == 0)
m_vendor = GfxVendor::Apple;
else if (memcmp(deviceName, "AMD", 3) == 0)
m_vendor = GfxVendor::AMD;
else if (memcmp(deviceName, "Intel", 5) == 0)
m_vendor = GfxVendor::Intel;
else if (memcmp(deviceName, "NVIDIA", 6) == 0)
m_vendor = GfxVendor::Nvidia;
else
m_vendor = GfxVendor::Generic;
// Feature support
m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1);
m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? m_device->supportsFamily(MTL::GPUFamilyApple2) : false;
m_hasUnifiedMemory = m_device->hasUnifiedMemory();
m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3);
m_supportsMeshShaders = (m_supportsMetal3 && (m_vendor != GfxVendor::Intel || GetConfig().force_mesh_shaders.GetValue())); // Intel GPUs have issues with mesh shaders
m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize();
m_pixelFormatSupport = MetalPixelFormatSupport(m_device);
CheckForPixelFormatSupport(m_pixelFormatSupport);
// Command queue
m_commandQueue = m_device->newCommandQueue();
// Synchronization resources
m_event = m_device->newEvent();
// Resources
NS_STACK_SCOPED MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init();
#ifdef CEMU_DEBUG_ASSERT
samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor));
#endif
m_nearestSampler = m_device->newSamplerState(samplerDescriptor);
samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear);
samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear);
#ifdef CEMU_DEBUG_ASSERT
samplerDescriptor->setLabel(GetLabel("Linear sampler state", samplerDescriptor));
#endif
m_linearSampler = m_device->newSamplerState(samplerDescriptor);
// Null resources
NS_STACK_SCOPED MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init();
textureDescriptor->setTextureType(MTL::TextureType1D);
textureDescriptor->setWidth(1);
textureDescriptor->setUsage(MTL::TextureUsageShaderRead);
m_nullTexture1D = m_device->newTexture(textureDescriptor);
#ifdef CEMU_DEBUG_ASSERT
m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D));
#endif
textureDescriptor->setTextureType(MTL::TextureType2D);
textureDescriptor->setHeight(1);
textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget);
m_nullTexture2D = m_device->newTexture(textureDescriptor);
#ifdef CEMU_DEBUG_ASSERT
m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D));
#endif
m_memoryManager = new MetalMemoryManager(this);
m_outputShaderCache = new MetalOutputShaderCache(this);
m_pipelineCache = new MetalPipelineCache(this);
m_depthStencilCache = new MetalDepthStencilCache(this);
m_samplerCache = new MetalSamplerCache(this);
// Lower the commit treshold when buffer cache needs reduced latency
if (m_memoryManager->NeedsReducedLatency())
m_defaultCommitTreshlod = 64;
else
m_defaultCommitTreshlod = 196;
// Occlusion queries
m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared);
#ifdef CEMU_DEBUG_ASSERT
m_occlusionQuery.m_resultBuffer->setLabel(GetLabel("Occlusion query result buffer", m_occlusionQuery.m_resultBuffer));
#endif
m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents();
// Reset vertex and uniform buffers
for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++)
m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET;
for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++)
{
for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++)
m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET;
}
// Utility shader library
// Create the library
NS::Error* error = nullptr;
NS_STACK_SCOPED MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error);
if (error)
{
cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String());
}
// Pipelines
NS_STACK_SCOPED MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen"));
NS_STACK_SCOPED MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor"));
m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init();
m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction);
m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction);
// Void vertex pipelines
if (m_isAppleGPU)
m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer");
// HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false;
m_occlusionQuery.m_lastCommandBuffer = nullptr;
m_captureFrame = false;
}
MetalRenderer::~MetalRenderer()
{
if (m_isAppleGPU)
delete m_copyBufferToBufferPipeline;
//delete m_copyTextureToTexturePipeline;
//delete m_restrideBufferPipeline;
m_copyDepthToColorDesc->release();
for (const auto [pixelFormat, pipeline] : m_copyDepthToColorPipelines)
pipeline->release();
delete m_outputShaderCache;
delete m_pipelineCache;
delete m_depthStencilCache;
delete m_samplerCache;
delete m_memoryManager;
m_nullTexture1D->release();
m_nullTexture2D->release();
m_nearestSampler->release();
m_linearSampler->release();
if (m_readbackBuffer)
m_readbackBuffer->release();
if (m_xfbRingBuffer)
m_xfbRingBuffer->release();
m_occlusionQuery.m_resultBuffer->release();
m_event->release();
m_commandQueue->release();
m_device->release();
}
void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow)
{
auto& layer = GetLayer(mainWindow);
layer = MetalLayerHandle(m_device, size, mainWindow);
layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm);
}
void MetalRenderer::ShutdownLayer(bool mainWindow)
{
GetLayer(mainWindow) = MetalLayerHandle();
}
void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow)
{
GetLayer(mainWindow).Resize(size);
}
void MetalRenderer::Initialize()
{
Renderer::Initialize();
RendererShaderMtl::Initialize();
}
void MetalRenderer::Shutdown()
{
// TODO: should shutdown both layers
ImGui_ImplMetal_Shutdown();
CommitCommandBuffer();
Renderer::Shutdown();
RendererShaderMtl::Shutdown();
}
bool MetalRenderer::IsPadWindowActive()
{
return (GetLayer(false).GetLayer() != nullptr);
}
bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const
{
// Subtract host memory from total VRAM, since it's shared with the CPU
usageInMB = (m_device->currentAllocatedSize() - m_memoryManager->GetHostAllocationSize()) / 1024 / 1024;
totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024;
return true;
}
void MetalRenderer::ClearColorbuffer(bool padView)
{
if (!AcquireDrawable(!padView))
return;
ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f);
}
void MetalRenderer::DrawEmptyFrame(bool mainWindow)
{
if (!BeginFrame(mainWindow))
return;
SwapBuffers(mainWindow, !mainWindow);
}
void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC)
{
if (swapTV)
SwapBuffer(true);
if (swapDRC)
SwapBuffer(false);
// Reset the command buffers (they are released by TemporaryBufferAllocator)
CommitCommandBuffer();
// Debug
m_performanceMonitor.ResetPerFrameData();
// GPU capture
if (m_capturing)
{
EndCapture();
}
else if (m_captureFrame)
{
StartCapture();
m_captureFrame = false;
}
}
void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView) {
const bool hasScreenshotRequest = gui_hasScreenshotRequest();
if (!hasScreenshotRequest && m_screenshot_state == ScreenshotState::None)
return;
if (m_mainLayer.GetDrawable())
{
// we already took a pad view screenshow and want a main window screenshot
if (m_screenshot_state == ScreenshotState::Main && padView)
return;
if (m_screenshot_state == ScreenshotState::Pad && !padView)
return;
// remember which screenshot is left to take
if (m_screenshot_state == ScreenshotState::None)
m_screenshot_state = padView ? ScreenshotState::Main : ScreenshotState::Pad;
else
m_screenshot_state = ScreenshotState::None;
}
else
m_screenshot_state = ScreenshotState::None;
auto texMtl = static_cast<LatteTextureMtl*>(texView->baseTexture);
int width, height;
texMtl->GetEffectiveSize(width, height, 0);
uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width);
uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow);
auto blitCommandEncoder = GetBlitCommandEncoder();
auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
auto buffer = bufferAllocator.AllocateBufferMemory(size, 1);
blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer.mtlBuffer, buffer.bufferOffset, bytesPerRow, 0);
bool formatValid = true;
std::vector<uint8> rgb_data;
rgb_data.reserve(3 * width * height);
auto pixelFormat = texMtl->GetTexture()->pixelFormat();
// TODO: implement more formats
switch (pixelFormat)
{
case MTL::PixelFormatRGBA8Unorm:
for (auto ptr = buffer.memPtr; ptr < buffer.memPtr + size; ptr += 4)
{
rgb_data.emplace_back(*ptr);
rgb_data.emplace_back(*(ptr + 1));
rgb_data.emplace_back(*(ptr + 2));
}
break;
case MTL::PixelFormatRGBA8Unorm_sRGB:
for (auto ptr = buffer.memPtr; ptr < buffer.memPtr + size; ptr += 4)
{
rgb_data.emplace_back(SRGBComponentToRGB(*ptr));
rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 1)));
rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 2)));
}
break;
default:
cemuLog_log(LogType::Force, "Unsupported screenshot texture pixel format {}", pixelFormat);
formatValid = false;
break;
}
if (formatValid)
SaveScreenshot(rgb_data, width, height, !padView);
}
void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter,
sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight,
bool padView, bool clearBackground)
{
if (!AcquireDrawable(!padView))
return;
MTL::Texture* presentTexture = static_cast<LatteTextureViewMtl*>(texView)->GetRGBAView();
// Create render pass
auto& layer = GetLayer(!padView);
NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
colorAttachment->setTexture(layer.GetDrawable()->texture());
colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionLoad);
colorAttachment->setStoreAction(MTL::StoreActionStore);
auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor);
// Get a render pipeline
// Find out which shader we are using
uint8 shaderIndex = 255;
if (shader == RendererOutputShader::s_copy_shader) shaderIndex = 0;
else if (shader == RendererOutputShader::s_bicubic_shader) shaderIndex = 1;
else if (shader == RendererOutputShader::s_hermit_shader) shaderIndex = 2;
else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3;
else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4;
else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5;
uint8 shaderType = shaderIndex % 3;
// Get the render pipeline state
auto renderPipelineState = m_outputShaderCache->GetPipeline(shader, shaderIndex, m_state.m_usesSRGB);
// Draw to Metal layer
renderCommandEncoder->setRenderPipelineState(renderPipelineState);
renderCommandEncoder->setFragmentTexture(presentTexture, 0);
renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0);
// Set uniforms
float outputSize[2] = {(float)imageWidth, (float)imageHeight};
switch (shaderType)
{
case 2:
renderCommandEncoder->setFragmentBytes(outputSize, sizeof(outputSize), 0);
break;
default:
break;
}
renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0});
renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight});
renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3));
EndEncoding();
}
bool MetalRenderer::BeginFrame(bool mainWindow)
{
return AcquireDrawable(mainWindow);
}
void MetalRenderer::Flush(bool waitIdle)
{
if (m_recordedDrawcalls > 0 || waitIdle)
CommitCommandBuffer();
if (waitIdle && m_executingCommandBuffers.size() != 0)
m_executingCommandBuffers.back()->waitUntilCompleted();
}
void MetalRenderer::NotifyLatteCommandProcessorIdle()
{
//if (m_commitOnIdle)
// CommitCommandBuffer();
}
bool MetalRenderer::ImguiBegin(bool mainWindow)
{
if (!Renderer::ImguiBegin(mainWindow))
return false;
if (!AcquireDrawable(mainWindow))
return false;
EnsureImGuiBackend();
// Check if the font texture needs to be built
ImGuiIO& io = ImGui::GetIO();
if (!io.Fonts->IsBuilt())
ImGui_ImplMetal_CreateFontsTexture(m_device);
auto& layer = GetLayer(mainWindow);
// Render pass descriptor
NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
colorAttachment->setTexture(layer.GetDrawable()->texture());
colorAttachment->setLoadAction(MTL::LoadActionLoad);
colorAttachment->setStoreAction(MTL::StoreActionStore);
// New frame
ImGui_ImplMetal_NewFrame(renderPassDescriptor);
ImGui_UpdateWindowInformation(mainWindow);
ImGui::NewFrame();
if (m_encoderType != MetalEncoderType::Render)
GetTemporaryRenderCommandEncoder(renderPassDescriptor);
return true;
}
void MetalRenderer::ImguiEnd()
{
EnsureImGuiBackend();
if (m_encoderType != MetalEncoderType::Render)
{
cemuLog_logOnce(LogType::Force, "no render command encoder, cannot draw ImGui");
return;
}
ImGui::Render();
ImGui_ImplMetal_RenderDrawData(ImGui::GetDrawData(), GetCurrentCommandBuffer(), (MTL::RenderCommandEncoder*)m_commandEncoder);
//ImGui::EndFrame();
EndEncoding();
}
ImTextureID MetalRenderer::GenerateTexture(const std::vector<uint8>& data, const Vector2i& size)
{
try
{
std::vector <uint8> tmp(size.x * size.y * 4);
for (size_t i = 0; i < data.size() / 3; ++i)
{
tmp[(i * 4) + 0] = data[(i * 3) + 0];
tmp[(i * 4) + 1] = data[(i * 3) + 1];
tmp[(i * 4) + 2] = data[(i * 3) + 2];
tmp[(i * 4) + 3] = 0xFF;
}
NS_STACK_SCOPED MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init();
desc->setTextureType(MTL::TextureType2D);
desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm);
desc->setWidth(size.x);
desc->setHeight(size.y);
desc->setStorageMode(m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModeManaged);
desc->setUsage(MTL::TextureUsageShaderRead);
MTL::Texture* texture = m_device->newTexture(desc);
// TODO: do a GPU copy?
texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0);
return (ImTextureID)texture;
}
catch (const std::exception& ex)
{
cemuLog_log(LogType::Force, "can't generate imgui texture: {}", ex.what());
return nullptr;
}
}
void MetalRenderer::DeleteTexture(ImTextureID id)
{
EnsureImGuiBackend();
((MTL::Texture*)id)->release();
}
void MetalRenderer::DeleteFontTextures()
{
EnsureImGuiBackend();
ImGui_ImplMetal_DestroyFontsTexture();
}
void MetalRenderer::AppendOverlayDebugInfo()
{
ImGui::Text("--- GPU info ---");
ImGui::Text("GPU %s", m_device->name()->utf8String());
ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no"));
ImGui::Text("Supports framebuffer fetch %s", (m_supportsFramebufferFetch ? "yes" : "no"));
ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no"));
ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no"));
ImGui::Text("--- Metal info ---");
ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize());
ImGui::Text("--- Metal info (per frame) ---");
ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers);
ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses);
ImGui::Text("Clears %u", m_performanceMonitor.m_clears);
ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws);
ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans);
ImGui::Text("--- Cache debug info ---");
uint32 bufferCacheHeapSize = 0;
uint32 bufferCacheAllocationSize = 0;
uint32 bufferCacheNumAllocations = 0;
LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations);
ImGui::Text("Buffer");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024, ((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations);
uint32 numBuffers;
size_t totalSize, freeSize;
m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Staging");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Index");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
}
void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ)
{
// halfZ is handled in the shader
m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ};
}
void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight)
{
m_state.m_scissor = MTL::ScissorRect{(uint32)scissorX, (uint32)scissorY, (uint32)scissorWidth, (uint32)scissorHeight};
}
LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key)
{
return new CachedFBOMtl(this, key);
}
void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo)
{
if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo)
m_state.m_activeFBO = {nullptr};
}
void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo)
{
m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)};
m_state.m_fboChanged = true;
}
void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size)
{
return m_memoryManager->AcquireTextureUploadBuffer(size);
}
void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem)
{
m_memoryManager->ReleaseTextureUploadBuffer(mem);
}
TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height)
{
return GetMtlPixelFormatInfo(format, isDepth).textureDecoder;
}
void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex)
{
if (hostTexture->isDepth)
{
texture_clearDepthSlice(hostTexture, sliceIndex, mipIndex, true, hostTexture->hasStencil, 0.0f, 0);
}
else
{
texture_clearColorSlice(hostTexture, sliceIndex, mipIndex, 0.0f, 0.0f, 0.0f, 0.0f);
}
}
// TODO: do a cpu copy on Apple Silicon?
void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize)
{
auto textureMtl = (LatteTextureMtl*)hostTexture;
uint32 offsetZ = 0;
if (textureMtl->Is3DTexture())
{
offsetZ = sliceIndex;
sliceIndex = 0;
}
size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->format, textureMtl->isDepth, width);
// No need to set bytesPerImage for 3D textures, since we always load just one slice
//size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->isDepth, height, bytesPerRow);
//if (m_isAppleGPU)
//{
// textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0);
//}
//else
//{
auto blitCommandEncoder = GetBlitCommandEncoder();
// Allocate a temporary buffer
auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1);
memcpy(allocation.memPtr, pixelData, compressedImageSize);
bufferAllocator.FlushReservation(allocation);
// TODO: specify blit options when copying to a depth stencil texture?
// Copy the data from the temporary buffer to the texture
blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ));
//}
}
void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a)
{
if (!FormatIsRenderable(hostTexture->format))
{
cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format);
return;
}
auto mtlTexture = static_cast<LatteTextureMtl*>(hostTexture)->GetTexture();
ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a);
}
void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue)
{
clearStencil = (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil);
if (!clearDepth && !clearStencil)
{
cemuLog_logOnce(LogType::Force, "skipping depth/stencil clear");
return;
}
auto mtlTexture = static_cast<LatteTextureMtl*>(hostTexture)->GetTexture();
NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
if (clearDepth)
{
auto depthAttachment = renderPassDescriptor->depthAttachment();
depthAttachment->setTexture(mtlTexture);
depthAttachment->setClearDepth(depthValue);
depthAttachment->setLoadAction(MTL::LoadActionClear);
depthAttachment->setStoreAction(MTL::StoreActionStore);
depthAttachment->setSlice(sliceIndex);
depthAttachment->setLevel(mipIndex);
}
if (clearStencil)
{
auto stencilAttachment = renderPassDescriptor->stencilAttachment();
stencilAttachment->setTexture(mtlTexture);
stencilAttachment->setClearStencil(stencilValue);
stencilAttachment->setLoadAction(MTL::LoadActionClear);
stencilAttachment->setStoreAction(MTL::StoreActionStore);
stencilAttachment->setSlice(sliceIndex);
stencilAttachment->setLevel(mipIndex);
}
GetTemporaryRenderCommandEncoder(renderPassDescriptor);
EndEncoding();
// Debug
m_performanceMonitor.m_clears++;
}
LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth)
{
return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth);
}
void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit)
{
m_state.m_textures[textureUnit] = static_cast<LatteTextureViewMtl*>(textureView);
}
void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_)
{
// Source size seems to apply to the destination texture as well, therefore we need to adjust it when block size doesn't match
Uvec2 srcBlockTexelSize = GetMtlPixelFormatInfo(src->format, src->isDepth).blockTexelSize;
Uvec2 dstBlockTexelSize = GetMtlPixelFormatInfo(dst->format, dst->isDepth).blockTexelSize;
if (srcBlockTexelSize.x != dstBlockTexelSize.x || srcBlockTexelSize.y != dstBlockTexelSize.y)
{
uint32 multX = (srcBlockTexelSize.x > dstBlockTexelSize.x ? srcBlockTexelSize.x / dstBlockTexelSize.x : dstBlockTexelSize.x / srcBlockTexelSize.x);
effectiveCopyWidth *= multX;
uint32 multY = (srcBlockTexelSize.y > dstBlockTexelSize.y ? srcBlockTexelSize.y / dstBlockTexelSize.y : dstBlockTexelSize.y / srcBlockTexelSize.y);
effectiveCopyHeight *= multY;
}
auto blitCommandEncoder = GetBlitCommandEncoder();
auto mtlSrc = static_cast<LatteTextureMtl*>(src)->GetTexture();
auto mtlDst = static_cast<LatteTextureMtl*>(dst)->GetTexture();
uint32 srcBaseLayer = 0;
uint32 dstBaseLayer = 0;
uint32 srcOffsetZ = 0;
uint32 dstOffsetZ = 0;
uint32 srcLayerCount = 1;
uint32 dstLayerCount = 1;
uint32 srcDepth = 1;
uint32 dstDepth = 1;
if (src->Is3DTexture())
{
srcOffsetZ = srcSlice;
srcDepth = srcDepth_;
}
else
{
srcBaseLayer = srcSlice;
srcLayerCount = srcDepth_;
}
if (dst->Is3DTexture())
{
dstOffsetZ = dstSlice;
dstDepth = srcDepth_;
}
else
{
dstBaseLayer = dstSlice;
dstLayerCount = srcDepth_;
}
// If copying whole textures, we can do a more efficient copy
if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 &&
srcOffsetZ == 0 && dstOffsetZ == 0 &&
effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && srcDepth == src->GetMipDepth(srcMip) &&
effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && dstDepth == dst->GetMipDepth(dstMip) &&
srcLayerCount == dstLayerCount)
{
blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, mtlDst, dstBaseLayer, dstMip, srcLayerCount, 1);
}
else
{
if (srcLayerCount == dstLayerCount)
{
for (uint32 i = 0; i < srcLayerCount; i++)
{
blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, srcDepth), mtlDst, dstBaseLayer + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ));
}
}
else
{
for (uint32 i = 0; i < std::max(srcLayerCount, dstLayerCount); i++)
{
if (srcLayerCount == 1)
srcOffsetZ++;
else
srcSlice++;
if (dstLayerCount == 1)
dstOffsetZ++;
else
dstSlice++;
blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstBaseLayer, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ));
}
}
}
}
LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView)
{
size_t uploadSize = static_cast<LatteTextureMtl*>(textureView->baseTexture)->GetTexture()->allocatedSize();
if ((m_readbackBufferWriteOffset + uploadSize) > TEXTURE_READBACK_SIZE)
{
m_readbackBufferWriteOffset = 0;
}
auto* result = new LatteTextureReadbackInfoMtl(this, textureView, m_readbackBufferWriteOffset);
m_readbackBufferWriteOffset += uploadSize;
return result;
}
void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height)
{
// scale copy size to effective size
sint32 effectiveCopyWidth = width;
sint32 effectiveCopyHeight = height;
LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0);
//sint32 sourceEffectiveWidth, sourceEffectiveHeight;
//sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip);
texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1);
}
void MetalRenderer::bufferCache_init(const sint32 bufferSize)
{
m_memoryManager->InitBufferCache(bufferSize);
}
void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset)
{
m_memoryManager->UploadToBufferCache(buffer, bufferOffset, size);
}
void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size)
{
m_memoryManager->CopyBufferCache(srcOffset, dstOffset, size);
}
void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size)
{
if (m_memoryManager->UseHostMemoryForCache())
dstOffset -= m_memoryManager->GetImportedMemBaseAddress();
CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES);
}
void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size)
{
cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());
cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS);
m_state.m_vertexBufferOffsets[bufferIndex] = offset;
}
void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size)
{
cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());
m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset;
}
RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader)
{
return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source);
}
void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize)
{
m_state.m_streamoutState.buffers[bufferIndex].enabled = true;
m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset;
}
void MetalRenderer::streamout_begin()
{
// Do nothing
}
void MetalRenderer::streamout_rendererFinishDrawcall()
{
// Do nothing
}
void MetalRenderer::draw_beginSequence()
{
m_state.m_skipDrawSequence = false;
bool streamoutEnable = LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0;
// update shader state
LatteSHRC_UpdateActiveShaders();
if (LatteGPUState.activeShaderHasError)
{
cemuLog_logOnce(LogType::Force, "Skipping drawcalls due to shader error\n");
m_state.m_skipDrawSequence = true;
cemu_assert_debug(false);
return;
}
// update render target and texture state
LatteGPUState.requiresTextureBarrier = false;
while (true)
{
LatteGPUState.repeatTextureInitialization = false;
if (!LatteMRT::UpdateCurrentFBO())
{
cemuLog_logOnce(LogType::Force, "Rendertarget invalid\n");
m_state.m_skipDrawSequence = true;
return; // no render target
}
if (!hasValidFramebufferAttached && !streamoutEnable)
{
cemuLog_logOnce(LogType::Force, "Drawcall with no color buffer or depth buffer attached\n");
m_state.m_skipDrawSequence = true;
return; // no render target
}
LatteTexture_updateTextures();
if (!LatteGPUState.repeatTextureInitialization)
break;
}
// apply render target
LatteMRT::ApplyCurrentState();
// viewport and scissor box
LatteRenderTarget_updateViewport();
LatteRenderTarget_updateScissorBox();
if (!LatteGPUState.contextNew.IsRasterizationEnabled() && !streamoutEnable)
m_state.m_skipDrawSequence = true;
}
void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst)
{
if (m_state.m_skipDrawSequence)
{
LatteGPUState.drawCallCounter++;
return;
}
// fast clear color as depth
if (LatteGPUState.contextNew.GetSpecialStateValues()[8] != 0)
{
LatteDraw_handleSpecialState8_clearAsDepth();
LatteGPUState.drawCallCounter++;
return;
}
else if (LatteGPUState.contextNew.GetSpecialStateValues()[5] != 0)
{
draw_handleSpecialState5();
LatteGPUState.drawCallCounter++;
return;
}
auto& encoderState = m_state.m_encoderState;
// Shaders
LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader();
LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader();
LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader();
const auto fetchShader = LatteSHRC_GetActiveFetchShader();
/*
bool neverSkipAccurateBarrier = false;
// "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option)
// We always force accurate barriers for known problematic shaders
if (pixelShader)
{
if (pixelShader->baseHash == 0x6f6f6e7b9aae57af && pixelShader->auxHash == 0x00078787f9249249) // BotW lava
neverSkipAccurateBarrier = true;
if (pixelShader->baseHash == 0x4c0bd596e3aef4a6 && pixelShader->auxHash == 0x003c3c3fc9269249) // BotW foam layer for water on the bottom of waterfalls
neverSkipAccurateBarrier = true;
}
// Check if we need to end the render pass
if (!m_state.m_isFirstDrawInRenderPass && (GetConfig().vk_accurate_barriers || neverSkipAccurateBarrier))
{
// Fragment shader is most likely to require a render pass flush, so check for it first
bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader);
if (!endRenderPass)
endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader);
if (!endRenderPass && geometryShader)
endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader);
if (endRenderPass)
{
EndEncoding();
// TODO: only log in debug?
cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n");
}
}
*/
// Primitive type
const LattePrimitiveMode primitiveMode = LatteGPUState.contextNew.VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE();
auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode);
bool usesGeometryShader = UseGeometryShader(LatteGPUState.contextNew, geometryShader != nullptr);
if (usesGeometryShader && !m_supportsMeshShaders)
return;
bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually);
// Index buffer
Renderer::INDEX_TYPE hostIndexType;
uint32 hostIndexCount;
uint32 indexMin = 0;
uint32 indexMax = 0;
Renderer::IndexAllocation indexAllocation;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
auto indexAllocationMtl = static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(indexAllocation.rendererInternal);
// Buffer cache
if (m_memoryManager->UseHostMemoryForCache())
{
// direct memory access (Wii U memory space imported as a buffer), update buffer bindings
draw_updateVertexBuffersDirectAccess();
if (vertexShader)
draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START);
if (geometryShader)
draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START);
if (pixelShader)
draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START);
}
else
{
// synchronize vertex and uniform cache and update buffer bindings
// We need to call this before getting the render command encoder, since it can cause buffer copies
LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount);
}
// Render pass
auto renderCommandEncoder = GetRenderCommandEncoder();
// Render pipeline state
PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew);
if (!pipelineObj->m_pipeline)
return;
if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState)
{
renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline);
encoderState.m_renderPipelineState = pipelineObj->m_pipeline;
}
// Depth stencil state
// Disable depth write when there is no depth attachment
auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL;
bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE();
if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture)
depthControl.set_Z_WRITE_ENABLE(false);
MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew);
if (depthStencilState != encoderState.m_depthStencilState)
{
renderCommandEncoder->setDepthStencilState(depthStencilState);
encoderState.m_depthStencilState = depthStencilState;
}
// Restore the original depth write state
depthControl.set_Z_WRITE_ENABLE(depthWriteEnable);
// Stencil reference
bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE();
if (stencilEnable)
{
bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE();
uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F();
uint32 stencilRefBack;
if (backStencilEnable)
stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B();
else
stencilRefBack = stencilRefFront;
if (stencilRefFront != encoderState.m_stencilRefFront || stencilRefBack != encoderState.m_stencilRefBack)
{
renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack);
encoderState.m_stencilRefFront = stencilRefFront;
encoderState.m_stencilRefBack = stencilRefBack;
}
}
// Blend color
uint32* blendColorConstantU32 = LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED;
if (blendColorConstantU32[0] != encoderState.m_blendColor[0] || blendColorConstantU32[1] != encoderState.m_blendColor[1] || blendColorConstantU32[2] != encoderState.m_blendColor[2] || blendColorConstantU32[3] != encoderState.m_blendColor[3])
{
float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED;
renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]);
encoderState.m_blendColor[0] = blendColorConstantU32[0];
encoderState.m_blendColor[1] = blendColorConstantU32[1];
encoderState.m_blendColor[2] = blendColorConstantU32[2];
encoderState.m_blendColor[3] = blendColorConstantU32[3];
}
// polygon control
const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL;
const auto frontFace = polygonControlReg.get_FRONT_FACE();
uint32 cullFront = polygonControlReg.get_CULL_FRONT();
uint32 cullBack = polygonControlReg.get_CULL_BACK();
uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED();
if (polyOffsetFrontEnable)
{
uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue();
uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue();
uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue();
if (frontOffsetU32 != encoderState.m_depthBias || frontScaleU32 != encoderState.m_depthSlope || offsetClampU32 != encoderState.m_depthClamp)
{
float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE();
float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET();
float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP();
frontScale /= 16.0f;
renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp);
encoderState.m_depthBias = frontOffsetU32;
encoderState.m_depthSlope = frontScaleU32;
encoderState.m_depthClamp = offsetClampU32;
}
}
else
{
if (0 != encoderState.m_depthBias || 0 != encoderState.m_depthSlope || 0 != encoderState.m_depthClamp)
{
renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f);
encoderState.m_depthBias = 0;
encoderState.m_depthSlope = 0;
encoderState.m_depthClamp = 0;
}
}
// Depth clip mode
cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually
bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false;
if (zClipEnable != encoderState.m_depthClipEnable)
{
renderCommandEncoder->setDepthClipMode(zClipEnable ? MTL::DepthClipModeClip : MTL::DepthClipModeClamp);
encoderState.m_depthClipEnable = zClipEnable;
}
// Visibility result mode
if (m_occlusionQuery.m_active)
{
auto mode = (m_occlusionQuery.m_currentIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting);
renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_currentIndex * sizeof(uint64));
}
// todo - how does culling behave with rects?
// right now we just assume that their winding is always CW
if (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS)
{
if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CW)
cullFront = cullBack;
else
cullBack = cullFront;
}
// Cull mode
// Cull front and back is handled by disabling rasterization
if (!(cullFront && cullBack))
{
MTL::CullMode cullMode;
if (cullFront)
cullMode = MTL::CullModeFront;
else if (cullBack)
cullMode = MTL::CullModeBack;
else
cullMode = MTL::CullModeNone;
if (cullMode != encoderState.m_cullMode)
{
renderCommandEncoder->setCullMode(cullMode);
encoderState.m_cullMode = cullMode;
}
}
// Front face
MTL::Winding frontFaceWinding;
if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW)
frontFaceWinding = MTL::WindingCounterClockwise;
else
frontFaceWinding = MTL::WindingClockwise;
if (frontFaceWinding != encoderState.m_frontFaceWinding)
{
renderCommandEncoder->setFrontFacingWinding(frontFaceWinding);
encoderState.m_frontFaceWinding = frontFaceWinding;
}
// Viewport
if (m_state.m_viewport.originX != encoderState.m_viewport.originX ||
m_state.m_viewport.originY != encoderState.m_viewport.originY ||
m_state.m_viewport.width != encoderState.m_viewport.width ||
m_state.m_viewport.height != encoderState.m_viewport.height ||
m_state.m_viewport.znear != encoderState.m_viewport.znear ||
m_state.m_viewport.zfar != encoderState.m_viewport.zfar)
{
renderCommandEncoder->setViewport(m_state.m_viewport);
encoderState.m_viewport = m_state.m_viewport;
}
// Scissor
if (m_state.m_scissor.x != encoderState.m_scissor.x ||
m_state.m_scissor.y != encoderState.m_scissor.y ||
m_state.m_scissor.width != encoderState.m_scissor.width ||
m_state.m_scissor.height != encoderState.m_scissor.height)
{
encoderState.m_scissor = m_state.m_scissor;
// TODO: clamp scissor to render target dimensions?
//scissor.width = ;
//scissor.height = ;
renderCommandEncoder->setScissorRect(encoderState.m_scissor);
}
// Resources
// Vertex buffers
for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++)
{
size_t offset = m_state.m_vertexBufferOffsets[i];
if (offset != INVALID_OFFSET)
{
// Bind
SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i));
}
}
// Prepare streamout
m_state.m_streamoutState.verticesPerInstance = count;
LatteStreamout_PrepareDrawcall(count, instanceCount);
// Uniform buffers, textures and samplers
BindStageResources(renderCommandEncoder, vertexShader, usesGeometryShader);
if (usesGeometryShader && geometryShader)
BindStageResources(renderCommandEncoder, geometryShader, usesGeometryShader);
BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader);
// Draw
if (usesGeometryShader)
{
if (hostIndexType != INDEX_TYPE::NONE)
SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding);
uint8 hostIndexTypeU8 = (uint8)hostIndexType;
renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding);
encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr};
uint32 verticesPerPrimitive = GetVerticesPerPrimitive(primitiveMode);
uint32 threadgroupCount = count * instanceCount;
if (PrimitiveRequiresConnection(primitiveMode))
threadgroupCount -= verticesPerPrimitive - 1;
else
threadgroupCount /= verticesPerPrimitive;
renderCommandEncoder->drawMeshThreadgroups(MTL::Size(threadgroupCount, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1));
}
else
{
if (hostIndexType != INDEX_TYPE::NONE)
{
auto mtlIndexType = GetMtlIndexType(hostIndexType);
renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance);
}
else
{
renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance);
}
}
m_state.m_isFirstDrawInRenderPass = false;
// Occlusion queries
if (m_occlusionQuery.m_active)
m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE;
// Streamout
LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache());
// Debug
if (fetchVertexManually)
m_performanceMonitor.m_manualVertexFetchDraws++;
if (usesGeometryShader)
m_performanceMonitor.m_meshDraws++;
if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN)
m_performanceMonitor.m_triangleFans++;
LatteGPUState.drawCallCounter++;
}
void MetalRenderer::draw_endSequence()
{
LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader();
// post-drawcall logic
if (pixelShader)
LatteRenderTarget_trackUpdates();
bool hasReadback = LatteTextureReadback_Update();
m_recordedDrawcalls++;
// The number of draw calls needs to twice as big, since we are interrupting the render pass
// TODO: ucomment?
if (m_recordedDrawcalls >= m_commitTreshold * 2/* || hasReadback*/)
{
CommitCommandBuffer();
// TODO: where should this be called?
LatteTextureReadback_UpdateFinishedTransfers(false);
}
}
void MetalRenderer::draw_updateVertexBuffersDirectAccess()
{
LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader();
if (!parsedFetchShader)
return;
for (auto& bufferGroup : parsedFetchShader->bufferGroups)
{
uint32 bufferIndex = bufferGroup.attributeBufferIndex;
uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7;
MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0];
if (bufferAddress == MPTR_NULL) [[unlikely]]
bufferAddress = m_memoryManager->GetImportedMemBaseAddress();
m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress();
}
}
void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset)
{
if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK)
{
for (const auto& buf : shader->list_quickBufferList)
{
sint32 i = buf.index;
MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0];
uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1;
if (physicalAddr == MPTR_NULL) [[unlikely]]
{
cemu_assert_unimplemented();
continue;
}
uniformSize = std::min<uint32>(uniformSize, buf.size);
cemu_assert_debug(physicalAddr < 0x50000000);
uint32 bufferIndex = i;
cemu_assert_debug(bufferIndex < 16);
m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress();
}
}
}
void MetalRenderer::draw_handleSpecialState5()
{
LatteMRT::UpdateCurrentFBO();
LatteRenderTarget_updateViewport();
LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0);
LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment();
auto colorTextureMtl = static_cast<LatteTextureViewMtl*>(colorBuffer);
auto depthTextureMtl = static_cast<LatteTextureViewMtl*>(depthBuffer);
sint32 vpWidth, vpHeight;
LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight);
// Get the pipeline
MTL::PixelFormat colorPixelFormat = colorTextureMtl->GetRGBAView()->pixelFormat();
auto& pipeline = m_copyDepthToColorPipelines[colorPixelFormat];
if (!pipeline)
{
m_copyDepthToColorDesc->colorAttachments()->object(0)->setPixelFormat(colorPixelFormat);
NS::Error* error = nullptr;
pipeline = m_device->newRenderPipelineState(m_copyDepthToColorDesc, &error);
if (error)
{
cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String());
}
}
// Sadly, we need to end encoding to ensure that the depth data is up-to-date
EndEncoding();
// Copy depth to color
auto renderCommandEncoder = GetRenderCommandEncoder();
auto& encoderState = m_state.m_encoderState;
renderCommandEncoder->setRenderPipelineState(pipeline);
// TODO: make a helper function for this
encoderState.m_renderPipelineState = pipeline;
SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, depthTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0));
// TODO: make a helper function for this
renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0));
encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr};
renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3));
}
Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size)
{
auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128);
return {allocation->memPtr, allocation};
}
void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
{
m_memoryManager->GetIndexAllocator().FreeReservation(static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(allocation.rendererInternal));
}
void MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
{
m_memoryManager->GetIndexAllocator().FlushReservation(static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(allocation.rendererInternal));
}
LatteQueryObject* MetalRenderer::occlusionQuery_create() {
return new LatteQueryObjectMtl(this);
}
void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj) {
auto queryObjMtl = static_cast<LatteQueryObjectMtl*>(queryObj);
delete queryObjMtl;
}
void MetalRenderer::occlusionQuery_flush() {
if (m_occlusionQuery.m_lastCommandBuffer)
m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted();
}
void MetalRenderer::occlusionQuery_updateState() {
ProcessFinishedCommandBuffers();
}
void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index)
{
auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index];
if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset)
return;
if (buffer == boundBuffer.m_buffer)
{
// Update just the offset
boundBuffer.m_offset = offset;
switch (shaderType)
{
case METAL_SHADER_TYPE_VERTEX:
renderCommandEncoder->setVertexBufferOffset(offset, index);
break;
case METAL_SHADER_TYPE_OBJECT:
renderCommandEncoder->setObjectBufferOffset(offset, index);
break;
case METAL_SHADER_TYPE_MESH:
renderCommandEncoder->setMeshBufferOffset(offset, index);
break;
case METAL_SHADER_TYPE_FRAGMENT:
renderCommandEncoder->setFragmentBufferOffset(offset, index);
break;
}
return;
}
boundBuffer = {buffer, offset};
switch (shaderType)
{
case METAL_SHADER_TYPE_VERTEX:
renderCommandEncoder->setVertexBuffer(buffer, offset, index);
break;
case METAL_SHADER_TYPE_OBJECT:
renderCommandEncoder->setObjectBuffer(buffer, offset, index);
break;
case METAL_SHADER_TYPE_MESH:
renderCommandEncoder->setMeshBuffer(buffer, offset, index);
break;
case METAL_SHADER_TYPE_FRAGMENT:
renderCommandEncoder->setFragmentBuffer(buffer, offset, index);
break;
}
}
void MetalRenderer::SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index)
{
auto& boundTexture = m_state.m_encoderState.m_textures[shaderType][index];
if (texture == boundTexture)
return;
boundTexture = texture;
switch (shaderType)
{
case METAL_SHADER_TYPE_VERTEX:
renderCommandEncoder->setVertexTexture(texture, index);
break;
case METAL_SHADER_TYPE_OBJECT:
renderCommandEncoder->setObjectTexture(texture, index);
break;
case METAL_SHADER_TYPE_MESH:
renderCommandEncoder->setMeshTexture(texture, index);
break;
case METAL_SHADER_TYPE_FRAGMENT:
renderCommandEncoder->setFragmentTexture(texture, index);
break;
}
}
void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index)
{
auto& boundSamplerState = m_state.m_encoderState.m_samplers[shaderType][index];
if (samplerState == boundSamplerState)
return;
boundSamplerState = samplerState;
switch (shaderType)
{
case METAL_SHADER_TYPE_VERTEX:
renderCommandEncoder->setVertexSamplerState(samplerState, index);
break;
case METAL_SHADER_TYPE_OBJECT:
renderCommandEncoder->setObjectSamplerState(samplerState, index);
break;
case METAL_SHADER_TYPE_MESH:
renderCommandEncoder->setMeshSamplerState(samplerState, index);
break;
case METAL_SHADER_TYPE_FRAGMENT:
renderCommandEncoder->setFragmentSamplerState(samplerState, index);
break;
}
}
MTL::CommandBuffer* MetalRenderer::GetCommandBuffer()
{
bool needsNewCommandBuffer = (!m_currentCommandBuffer.m_commandBuffer || m_currentCommandBuffer.m_commited);
if (needsNewCommandBuffer)
{
// Debug
//m_commandQueue->insertDebugCaptureBoundary();
MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer();
m_currentCommandBuffer = {mtlCommandBuffer};
// Wait for the previous command buffer
if (m_eventValue != -1)
mtlCommandBuffer->encodeWait(m_event, m_eventValue);
m_recordedDrawcalls = 0;
m_commitTreshold = m_defaultCommitTreshlod;
// Debug
m_performanceMonitor.m_commandBuffers++;
return mtlCommandBuffer;
}
else
{
return m_currentCommandBuffer.m_commandBuffer;
}
}
MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor)
{
EndEncoding();
auto commandBuffer = GetCommandBuffer();
auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor);
#ifdef CEMU_DEBUG_ASSERT
renderCommandEncoder->setLabel(GetLabel("Temporary render command encoder", renderCommandEncoder));
#endif
m_commandEncoder = renderCommandEncoder;
m_encoderType = MetalEncoderType::Render;
// Debug
m_performanceMonitor.m_renderPasses++;
return renderCommandEncoder;
}
// Some render passes clear the attachments, forceRecreate is supposed to be used in those cases
MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate)
{
bool fboChanged = m_state.m_fboChanged;
m_state.m_fboChanged = false;
// Check if we need to begin a new render pass
if (m_commandEncoder)
{
if (!forceRecreate)
{
if (m_encoderType == MetalEncoderType::Render)
{
bool needsNewRenderPass = false;
if (fboChanged)
{
needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr);
if (!needsNewRenderPass)
{
for (uint8 i = 0; i < 8; i++)
{
if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture)
{
needsNewRenderPass = true;
break;
}
}
}
if (!needsNewRenderPass)
{
if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil)))
{
needsNewRenderPass = true;
}
}
}
if (!needsNewRenderPass)
{
return (MTL::RenderCommandEncoder*)m_commandEncoder;
}
}
}
EndEncoding();
}
auto commandBuffer = GetCommandBuffer();
auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor());
#ifdef CEMU_DEBUG_ASSERT
renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder));
#endif
m_commandEncoder = renderCommandEncoder;
m_encoderType = MetalEncoderType::Render;
// Update state
m_state.m_lastUsedFBO = m_state.m_activeFBO;
m_state.m_isFirstDrawInRenderPass = true;
ResetEncoderState();
// Debug
m_performanceMonitor.m_renderPasses++;
return renderCommandEncoder;
}
MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder()
{
if (m_commandEncoder)
{
if (m_encoderType == MetalEncoderType::Compute)
{
return (MTL::ComputeCommandEncoder*)m_commandEncoder;
}
EndEncoding();
}
auto commandBuffer = GetCommandBuffer();
auto computeCommandEncoder = commandBuffer->computeCommandEncoder();
m_commandEncoder = computeCommandEncoder;
m_encoderType = MetalEncoderType::Compute;
ResetEncoderState();
return computeCommandEncoder;
}
MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder()
{
if (m_commandEncoder)
{
if (m_encoderType == MetalEncoderType::Blit)
{
return (MTL::BlitCommandEncoder*)m_commandEncoder;
}
EndEncoding();
}
auto commandBuffer = GetCommandBuffer();
auto blitCommandEncoder = commandBuffer->blitCommandEncoder();
m_commandEncoder = blitCommandEncoder;
m_encoderType = MetalEncoderType::Blit;
ResetEncoderState();
return blitCommandEncoder;
}
void MetalRenderer::EndEncoding()
{
if (m_commandEncoder)
{
m_commandEncoder->endEncoding();
m_commandEncoder->release();
m_commandEncoder = nullptr;
m_encoderType = MetalEncoderType::None;
// Commit the command buffer if enough draw calls have been recorded
if (m_recordedDrawcalls >= m_commitTreshold)
CommitCommandBuffer();
}
}
void MetalRenderer::CommitCommandBuffer()
{
if (!m_currentCommandBuffer.m_commandBuffer)
return;
EndEncoding();
ProcessFinishedCommandBuffers();
// Commit the command buffer
if (!m_currentCommandBuffer.m_commited)
{
// Handled differently, since it seems like Metal doesn't always call the completion handler
//commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) {
// m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer);
//});
// Signal event
m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP;
auto mtlCommandBuffer = m_currentCommandBuffer.m_commandBuffer;
mtlCommandBuffer->encodeSignalEvent(m_event, m_eventValue);
mtlCommandBuffer->commit();
m_currentCommandBuffer.m_commited = true;
m_executingCommandBuffers.push_back(mtlCommandBuffer);
// Debug
//m_commandQueue->insertDebugCaptureBoundary();
}
}
void MetalRenderer::ProcessFinishedCommandBuffers()
{
// Check for finished command buffers
for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();)
{
auto commandBuffer = *it;
if (CommandBufferCompleted(commandBuffer))
{
m_memoryManager->CleanupBuffers(commandBuffer);
commandBuffer->release();
it = m_executingCommandBuffers.erase(it);
}
else
{
++it;
}
}
}
bool MetalRenderer::AcquireDrawable(bool mainWindow)
{
auto& layer = GetLayer(mainWindow);
if (!layer.GetLayer())
return false;
const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB;
if (latteBufferUsesSRGB != m_state.m_usesSRGB)
{
layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm);
m_state.m_usesSRGB = latteBufferUsesSRGB;
}
return layer.AcquireDrawable();
}
/*
bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader)
{
sint32 textureCount = shader->resourceMapping.getTextureCount();
for (int i = 0; i < textureCount; ++i)
{
const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i);
auto hostTextureUnit = relative_textureUnit;
auto textureDim = shader->textureUnitDim[relative_textureUnit];
// Texture is accessed as a framebuffer fetch, therefore there is no need to flush it
if (shader->textureRenderTargetIndex[relative_textureUnit] != 255)
continue;
auto texUnitRegIndex = hostTextureUnit * 7;
switch (shader->shaderType)
{
case LatteConst::ShaderType::Vertex:
hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS;
break;
case LatteConst::ShaderType::Pixel:
hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS;
break;
case LatteConst::ShaderType::Geometry:
hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS;
break;
default:
UNREACHABLE;
}
auto textureView = m_state.m_textures[hostTextureUnit];
if (!textureView)
continue;
LatteTexture* baseTexture = textureView->baseTexture;
// If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture
for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++)
{
auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture;
if (colorTarget && colorTarget->baseTexture == baseTexture)
return true;
}
}
return false;
}
*/
void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader)
{
auto mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader);
sint32 textureCount = shader->resourceMapping.getTextureCount();
for (int i = 0; i < textureCount; ++i)
{
const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i);
auto hostTextureUnit = relative_textureUnit;
// Don't bind textures that are accessed with a framebuffer fetch
if (m_supportsFramebufferFetch && shader->textureRenderTargetIndex[relative_textureUnit] != 255)
continue;
auto textureDim = shader->textureUnitDim[relative_textureUnit];
auto texUnitRegIndex = hostTextureUnit * 7;
switch (shader->shaderType)
{
case LatteConst::ShaderType::Vertex:
hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS;
break;
case LatteConst::ShaderType::Pixel:
hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS;
break;
case LatteConst::ShaderType::Geometry:
hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE;
texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS;
break;
default:
UNREACHABLE;
}
// TODO: correct?
uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i;
if (binding >= MAX_MTL_TEXTURES)
{
cemuLog_logOnce(LogType::Force, "invalid texture binding {}", binding);
continue;
}
auto textureView = m_state.m_textures[hostTextureUnit];
if (!textureView)
{
if (textureDim == Latte::E_DIM::DIM_1D)
SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding);
else
SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding);
SetSamplerState(renderCommandEncoder, mtlShaderType, m_nearestSampler, binding);
continue;
}
if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D))
{
SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding);
continue;
}
else if (textureDim == Latte::E_DIM::DIM_2D && (textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA))
{
SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding);
continue;
}
LatteTexture* baseTexture = textureView->baseTexture;
uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit];
MTL::SamplerState* sampler;
if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE)
{
uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType);
_LatteRegisterSetSampler* samplerWords = LatteGPUState.contextNew.SQ_TEX_SAMPLER + samplerIndex;
// Overwriting
// Lod bias
//if (baseTexture->overwriteInfo.hasLodBias)
// samplerWords->WORD1.set_LOD_BIAS(baseTexture->overwriteInfo.lodBias);
//else if (baseTexture->overwriteInfo.hasRelativeLodBias)
// samplerWords->WORD1.set_LOD_BIAS(samplerWords->WORD1.get_LOD_BIAS() + baseTexture->overwriteInfo.relativeLodBias);
// Max anisotropy
if (baseTexture->overwriteInfo.anisotropicLevel >= 0)
samplerWords->WORD0.set_MAX_ANISO_RATIO(baseTexture->overwriteInfo.anisotropicLevel);
sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, shader->shaderType, stageSamplerIndex, samplerWords);
}
else
{
sampler = m_nearestSampler;
}
SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding);
// get texture register word 0
uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4];
auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding];
MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4);
SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding);
}
// Support buffer
auto GET_UNIFORM_DATA_PTR = [&](size_t index) { return supportBufferData + (index / 4); };
sint32 shaderAluConst;
sint32 shaderUniformRegisterOffset;
switch (shader->shaderType)
{
case LatteConst::ShaderType::Vertex:
shaderAluConst = 0x400;
shaderUniformRegisterOffset = mmSQ_VTX_UNIFORM_BLOCK_START;
break;
case LatteConst::ShaderType::Pixel:
shaderAluConst = 0;
shaderUniformRegisterOffset = mmSQ_PS_UNIFORM_BLOCK_START;
break;
case LatteConst::ShaderType::Geometry:
shaderAluConst = 0; // geometry shader has no ALU const
shaderUniformRegisterOffset = mmSQ_GS_UNIFORM_BLOCK_START;
break;
default:
UNREACHABLE;
}
if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0)
{
if (shader->uniform.list_ufTexRescale.empty() == false)
{
for (auto& entry : shader->uniform.list_ufTexRescale)
{
float* xyScale = LatteTexture_getEffectiveTextureScale(shader->shaderType, entry.texUnit);
memcpy(entry.currentValue, xyScale, sizeof(float) * 2);
memcpy(GET_UNIFORM_DATA_PTR(entry.uniformLocation), xyScale, sizeof(float) * 2);
}
}
if (shader->uniform.loc_alphaTestRef >= 0)
{
*GET_UNIFORM_DATA_PTR(shader->uniform.loc_alphaTestRef) = LatteGPUState.contextNew.SX_ALPHA_REF.get_ALPHA_TEST_REF();
}
if (shader->uniform.loc_pointSize >= 0)
{
const auto& pointSizeReg = LatteGPUState.contextNew.PA_SU_POINT_SIZE;
float pointWidth = (float)pointSizeReg.get_WIDTH() / 8.0f;
if (pointWidth == 0.0f)
pointWidth = 1.0f / 8.0f; // minimum size
*GET_UNIFORM_DATA_PTR(shader->uniform.loc_pointSize) = pointWidth;
}
if (shader->uniform.loc_remapped >= 0)
{
LatteBufferCache_LoadRemappedUniforms(shader, GET_UNIFORM_DATA_PTR(shader->uniform.loc_remapped));
}
if (shader->uniform.loc_uniformRegister >= 0)
{
uint32* uniformRegData = (uint32*)(LatteGPUState.contextRegister + mmSQ_ALU_CONSTANT0_0 + shaderAluConst);
memcpy(GET_UNIFORM_DATA_PTR(shader->uniform.loc_uniformRegister), uniformRegData, shader->uniform.count_uniformRegister * 16);
}
if (shader->uniform.loc_windowSpaceToClipSpaceTransform >= 0)
{
sint32 viewportWidth;
sint32 viewportHeight;
LatteRenderTarget_GetCurrentVirtualViewportSize(&viewportWidth, &viewportHeight); // always call after _updateViewport()
float* v = GET_UNIFORM_DATA_PTR(shader->uniform.loc_windowSpaceToClipSpaceTransform);
v[0] = 2.0f / (float)viewportWidth;
v[1] = 2.0f / (float)viewportHeight;
}
if (shader->uniform.loc_fragCoordScale >= 0)
{
LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale));
}
if (shader->uniform.loc_verticesPerInstance >= 0)
{
*(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance;
for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++)
{
if (shader->uniform.loc_streamoutBufferBase[b] >= 0)
{
*(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset;
}
}
}
size_t size = shader->uniform.uniformRangeSize;
auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
auto allocation = bufferAllocator.AllocateBufferMemory(size, 1);
memcpy(allocation.memPtr, supportBufferData, size);
bufferAllocator.FlushReservation(allocation);
SetBuffer(renderCommandEncoder, mtlShaderType, allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint);
}
// Uniform buffers
for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0)
{
uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i];
if (binding >= MAX_MTL_BUFFERS)
{
cemuLog_logOnce(LogType::Force, "invalid buffer binding {}", binding);
continue;
}
size_t offset = m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][i];
if (offset == INVALID_OFFSET)
continue;
SetBuffer(renderCommandEncoder, mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding);
}
}
// Storage buffer
if (shader->resourceMapping.tfStorageBindingPoint >= 0)
{
SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint);
}
}
void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a)
{
NS_STACK_SCOPED MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
colorAttachment->setTexture(mtlTexture);
colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a));
colorAttachment->setLoadAction(MTL::LoadActionClear);
colorAttachment->setStoreAction(MTL::StoreActionStore);
colorAttachment->setSlice(sliceIndex);
colorAttachment->setLevel(mipIndex);
GetTemporaryRenderCommandEncoder(renderPassDescriptor);
EndEncoding();
// Debug
m_performanceMonitor.m_clears++;
}
void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before)
{
// TODO: uncomment and fix performance issues
// Do the copy in a vertex shader on Apple GPUs
/*
if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render)
{
auto renderCommandEncoder = static_cast<MTL::RenderCommandEncoder*>(m_commandEncoder);
MTL::Resource* barrierBuffers[] = {src};
renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex);
renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState());
m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState();
SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, src, srcOffset, GET_HELPER_BUFFER_BINDING(0));
SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dst, dstOffset, GET_HELPER_BUFFER_BINDING(1));
renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size));
barrierBuffers[0] = dst;
renderCommandEncoder->memoryBarrier(barrierBuffers, 1, before | MTL::RenderStageVertex, before);
}
else
{
*/
auto blitCommandEncoder = GetBlitCommandEncoder();
blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size);
//}
}
void MetalRenderer::SwapBuffer(bool mainWindow)
{
if (!AcquireDrawable(mainWindow))
return;
auto commandBuffer = GetCommandBuffer();
GetLayer(mainWindow).PresentDrawable(commandBuffer);
}
void MetalRenderer::EnsureImGuiBackend()
{
if (!ImGui::GetIO().BackendRendererUserData)
{
ImGui_ImplMetal_Init(m_device);
//ImGui_ImplMetal_CreateFontsTexture(m_device);
}
}
void MetalRenderer::StartCapture()
{
auto captureManager = MTL::CaptureManager::sharedCaptureManager();
auto desc = MTL::CaptureDescriptor::alloc()->init();
desc->setCaptureObject(m_device);
// Check if a debugger with support for GPU capture is attached
if (captureManager->supportsDestination(MTL::CaptureDestinationDeveloperTools))
{
desc->setDestination(MTL::CaptureDestinationDeveloperTools);
}
else
{
if (GetConfig().gpu_capture_dir.GetValue().empty())
{
cemuLog_log(LogType::Force, "No GPU capture directory specified, cannot do a GPU capture");
return;
}
// Check if the GPU trace document destination is available
if (!captureManager->supportsDestination(MTL::CaptureDestinationGPUTraceDocument))
{
cemuLog_log(LogType::Force, "GPU trace document destination is not available, cannot do a GPU capture");
return;
}
// Get current date and time as a string
auto now = std::chrono::system_clock::now();
std::time_t now_time = std::chrono::system_clock::to_time_t(now);
std::ostringstream oss;
oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d_%H-%M-%S");
std::string now_str = oss.str();
std::string capturePath = fmt::format("{}/cemu_{}.gputrace", GetConfig().gpu_capture_dir.GetValue(), now_str);
desc->setDestination(MTL::CaptureDestinationGPUTraceDocument);
desc->setOutputURL(ToNSURL(capturePath));
}
NS::Error* error = nullptr;
captureManager->startCapture(desc, &error);
if (error)
{
cemuLog_log(LogType::Force, "Failed to start GPU capture: {}", error->localizedDescription()->utf8String());
}
m_capturing = true;
}
void MetalRenderer::EndCapture()
{
auto captureManager = MTL::CaptureManager::sharedCaptureManager();
captureManager->stopCapture();
m_capturing = false;
}