Latte: Bound uniform buffers based on access patterns within the shader

This commit is contained in:
Exzap 2023-09-23 22:53:57 +02:00
parent 4d6b72b353
commit 3e925b7707
6 changed files with 114 additions and 93 deletions

View file

@ -132,22 +132,18 @@ void LatteBufferCache_syncGPUUniformBuffers(LatteDecompilerShader* shader, const
{
if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK)
{
// use full uniform buffers
for (sint32 t = 0; t < shader->uniformBufferListCount; t++)
for(const auto& buf : shader->list_quickBufferList)
{
sint32 i = shader->uniformBufferList[t];
sint32 i = buf.index;
MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0];
uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1;
if (physicalAddr == MPTR_NULL)
if (physicalAddr == MPTR_NULL) [[unlikely]]
{
// no data
g_renderer->buffer_bindUniformBuffer(shaderType, i, 0, 0);
continue;
}
uniformSize = std::min<uint32>(uniformSize, buf.size);
uint32 bindOffset = LatteBufferCache_retrieveDataInCache(physicalAddr, uniformSize);
g_renderer->buffer_bindUniformBuffer(shaderType, i, bindOffset, uniformSize);
}
}

View file

@ -1,6 +1,7 @@
#pragma once
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/RendererShader.h"
#include <boost/container/static_vector.hpp>
namespace LatteDecompiler
{
@ -158,11 +159,13 @@ struct LatteDecompilerShader
struct LatteFetchShader* compatibleFetchShader{};
// error tracking
bool hasError{false}; // if set, the shader cannot be used
// optimized access / iteration
// list of uniform buffers used
uint8 uniformBufferList[LATTE_NUM_MAX_UNIFORM_BUFFERS];
uint8 uniformBufferListCount{ 0 };
// list of used texture units (faster access than iterating textureUnitMask)
// compact resource lists for optimized access
// One entry per uniform buffer that the shader accesses.
struct QuickBufferEntry
{
	uint8 index; // uniform buffer index
	// Upper bound (in bytes) of the range the shader may access; used to limit the bound range.
	// Must be wider than 16 bit: a 64KB bound (65536 bytes) does not fit into
	// uint16 and would silently wrap around to zero.
	uint32 size;
};
boost::container::static_vector<QuickBufferEntry, LATTE_NUM_MAX_UNIFORM_BUFFERS> list_quickBufferList;
uint8 textureUnitList[LATTE_NUM_MAX_TEX_UNITS];
uint8 textureUnitListCount{ 0 };
// input

View file

@ -230,47 +230,39 @@ void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContex
// check input for uniform access
if( aluInstruction.sourceOperand[f].sel == 0xFFFFFFFF )
continue; // source operand not set/used
// about uniform register and buffer access tracking:
// for absolute indices we can determine a maximum size that is accessed
// relative accesses are tricky because the upper bound of accessed indices is unknown
// worst case we have to load the full file (256 * 16 byte entries) or for buffers an arbitrary upper bound (64KB in our case)
if( GPU7_ALU_SRC_IS_CFILE(aluInstruction.sourceOperand[f].sel) )
{
// uniform register access
// relative register file accesses are tricky because the range of possible indices is unknown
// worst case we have to load the full file (256 * 16 byte entries)
// by tracking the accessed base indices the shader analyzer can determine bounds for the potentially accessed ranges
shaderContext->analyzer.uniformRegisterAccess = true;
if (aluInstruction.sourceOperand[f].rel)
{
shaderContext->analyzer.uniformRegisterDynamicAccess = true;
shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true);
shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true);
}
else
{
_remapUniformAccess(shaderContext, true, 0, GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel));
shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false);
shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false);
}
}
else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction.sourceOperand[f].sel) )
{
// uniform bank 0 (uniform buffer with index cfInstruction->cBank0Index)
uint32 uniformBufferIndex = cfInstruction->cBank0Index;
if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS)
debugBreakpoint();
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
if( aluInstruction.sourceOperand[f].rel )
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
_remapUniformAccess(shaderContext, false, uniformBufferIndex, GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase);
cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
uint32 offset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase;
_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
}
else if( GPU7_ALU_SRC_IS_CBANK1(aluInstruction.sourceOperand[f].sel) )
{
// uniform bank 1 (uniform buffer with index cfInstruction->cBank1Index)
uint32 uniformBufferIndex = cfInstruction->cBank1Index;
if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS)
debugBreakpoint();
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
if( aluInstruction.sourceOperand[f].rel )
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
_remapUniformAccess(shaderContext, false, uniformBufferIndex, GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase);
cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
uint32 offset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase;
_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
}
else if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) )
{
@ -360,8 +352,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex
if( texInstruction.textureFetch.textureIndex >= 0x80 && texInstruction.textureFetch.textureIndex <= 0x8F )
{
uint32 uniformBufferIndex = texInstruction.textureFetch.textureIndex - 0x80;
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(0, true);
}
else if( texInstruction.textureFetch.textureIndex == 0x9F && shader->shaderType == LatteConst::ShaderType::Geometry )
{
@ -576,7 +567,7 @@ namespace LatteDecompiler
// for Vulkan we use consecutive indices
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0)
if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
continue;
sint32 uniformBindingPoint = i;
if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
@ -592,7 +583,7 @@ namespace LatteDecompiler
// for OpenGL we use the relative buffer index
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0)
if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
continue;
sint32 uniformBindingPoint = i;
if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
@ -765,17 +756,24 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD
LatteDecompiler_analyzeSubroutine(shaderContext, subroutineAddr);
}
// decide which uniform mode to use
if(shaderContext->analyzer.uniformBufferAccessMask != 0 && shaderContext->analyzer.uniformRegisterAccess )
debugBreakpoint(); // not allowed
if(shaderContext->analyzer.uniformBufferDynamicAccessMask != 0 )
bool hasAnyDynamicBufferAccess = false;
bool hasAnyBufferAccess = false;
for(auto& it : shaderContext->analyzer.uniformBufferAccessTracker)
{
if( it.HasRelativeAccess() )
hasAnyDynamicBufferAccess = true;
if( it.HasAccess() )
hasAnyBufferAccess = true;
}
if (hasAnyDynamicBufferAccess)
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK;
}
else if(shaderContext->analyzer.uniformRegisterDynamicAccess )
else if(shaderContext->analyzer.uniformRegisterAccessTracker.HasRelativeAccess() )
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE;
}
else if(shaderContext->analyzer.uniformBufferAccessMask != 0 || shaderContext->analyzer.uniformRegisterAccess != 0 )
else if(hasAnyBufferAccess || shaderContext->analyzer.uniformRegisterAccessTracker.HasAccess() )
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED;
}
@ -783,16 +781,18 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_NONE;
}
// generate list of uniform buffers based on uniformBufferAccessMask (for faster access)
shader->uniformBufferListCount = 0;
// generate compact list of uniform buffers (for faster access)
cemu_assert_debug(shader->list_quickBufferList.empty());
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if( !HAS_FLAG(shaderContext->analyzer.uniformBufferAccessMask, (1<<i)) )
if( !shaderContext->analyzer.uniformBufferAccessTracker[i].HasAccess() )
continue;
shader->uniformBufferList[shader->uniformBufferListCount] = i;
shader->uniformBufferListCount++;
LatteDecompilerShader::QuickBufferEntry entry;
entry.index = i;
entry.size = shaderContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE) * 16;
shader->list_quickBufferList.push_back(entry);
}
// get dimension of each used textures
// get dimension of each used texture
_LatteRegisterSetTextureUnit* texRegs = nullptr;
if( shader->shaderType == LatteConst::ShaderType::Vertex )
texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_VS;

View file

@ -37,36 +37,14 @@ namespace LatteDecompiler
}
else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE)
{
// here we try to predict the accessed range so we dont have to upload the whole register file
// we assume that if there is a fixed-index access on an index higher than a relative access, it bounds the prior relative access
sint16 highestAccessIndex = -1;
bool highestAccessIndexIsRel = false;
for(auto& accessItr : decompilerContext->analyzer.uniformRegisterAccessIndices)
{
if (accessItr.index > highestAccessIndex || (accessItr.index == highestAccessIndex && accessItr.isRelative && !highestAccessIndexIsRel))
{
highestAccessIndex = accessItr.index;
highestAccessIndexIsRel = accessItr.isRelative;
}
}
if (highestAccessIndex < 0)
highestAccessIndex = 0;
uint32 cfileSize;
if (highestAccessIndexIsRel)
cfileSize = 256;
else
cfileSize = highestAccessIndex + 1;
// full uniform register file has to be present
uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(256);
// full or partial uniform register file has to be present
if (shaderType == LatteConst::ShaderType::Vertex)
shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize);
else if (shaderType == LatteConst::ShaderType::Pixel)
shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize);
else if (shaderType == LatteConst::ShaderType::Geometry)
shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize);
else
debugBreakpoint();
uniformOffsets.offset_uniformRegister = uniformCurrentOffset;
uniformOffsets.count_uniformRegister = cfileSize;
uniformCurrentOffset += 16 * cfileSize;
@ -168,7 +146,7 @@ namespace LatteDecompiler
{
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0)
if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
continue;
cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0);
@ -178,7 +156,7 @@ namespace LatteDecompiler
shaderSrc->addFmt("uniform {}{}" _CRLF, _getShaderUniformBlockInterfaceName(decompilerContext->shaderType), i);
shaderSrc->add("{" _CRLF);
shaderSrc->addFmt("vec4 {}{}[{}];" _CRLF, _getShaderUniformBlockVariableName(decompilerContext->shaderType), i, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE);
shaderSrc->addFmt("vec4 {}{}[{}];" _CRLF, _getShaderUniformBlockVariableName(decompilerContext->shaderType), i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE));
shaderSrc->add("};" _CRLF _CRLF);
shaderSrc->add(_CRLF);
}

View file

@ -125,19 +125,66 @@ struct LatteDecompilerCFInstruction
LatteDecompilerCFInstruction& operator=(LatteDecompilerCFInstruction&& mE) = default;
};
// Record of a single access to the uniform register file (cfile).
struct LatteDecompilerCFileAccess
{
	uint8 index;     // cfile index that was accessed
	bool isRelative; // true if the access used relative addressing

	LatteDecompilerCFileAccess(uint8 index, bool isRelative)
		: index{index}, isRelative{isRelative}
	{
	}
};
// Groups the CF instructions of one shader subroutine together with its address.
// NOTE(review): presumably filled in while analyzing/parsing subroutines - confirm against the parser.
struct LatteDecompilerSubroutineInfo
{
uint32 cfAddr; // presumably the CF address at which the subroutine starts - TODO confirm unit
std::vector<LatteDecompilerCFInstruction> instructions; // CF instructions associated with this subroutine
};
// Helper struct to track the highest accessed index within a buffer.
// Accesses are recorded separately for static (absolute) and dynamic (relative)
// indexing; the recorded information is then used to derive how many entries of
// the buffer have to be made available to the shader.
struct LatteDecompilerBufferAccessTracker
{
	bool hasStaticIndexAccess{false};
	bool hasDynamicIndexAccess{false};
	sint32 highestAccessDynamicIndex{0};
	sint32 highestAccessStaticIndex{0};

	// Track an access. index is the array index and not a byte offset.
	// isDynamicIndex is true if the access uses relative (runtime-computed) addressing.
	void TrackAccess(sint32 index, bool isDynamicIndex)
	{
		if (isDynamicIndex)
		{
			hasDynamicIndexAccess = true;
			if (index > highestAccessDynamicIndex)
				highestAccessDynamicIndex = index;
		}
		else
		{
			hasStaticIndexAccess = true;
			if (index > highestAccessStaticIndex)
				highestAccessStaticIndex = index;
		}
	}

	// Determine the number of entries (not bytes) that must be available.
	// maximumSize is the upper bound the caller can provide.
	// Returns:
	//   - maximumSize if there is any dynamically indexed access, since no tighter
	//     bound can be derived for it
	//   - 1 if nothing was accessed at all (a size of zero is avoided)
	//   - otherwise highest static index + 1, limited to maximumSize
	sint32 DetermineSize(sint32 maximumSize) const
	{
		// here we try to predict the accessed range so we don't have to upload the whole buffer
		if (hasDynamicIndexAccess)
			return maximumSize; // dynamic index exists and no bound can be determined
		if (!hasStaticIndexAccess)
			return 1; // no access at all? But avoid zero as a size
		const sint32 requiredSize = highestAccessStaticIndex + 1;
		// never report more than the caller-provided maximum, even if a static
		// index at or beyond the maximum was encountered
		return requiredSize < maximumSize ? requiredSize : maximumSize;
	}

	// True if any access (static or dynamic) was tracked.
	bool HasAccess() const
	{
		return hasStaticIndexAccess || hasDynamicIndexAccess;
	}

	// True if at least one dynamically indexed access was tracked.
	bool HasRelativeAccess() const
	{
		return hasDynamicIndexAccess;
	}
};
struct LatteDecompilerShaderContext
{
LatteDecompilerOutput_t* output;
@ -174,12 +221,9 @@ struct LatteDecompilerShaderContext
bool isPointsPrimitive{}; // set if current render primitive is points
bool outputPointSize{}; // set if the current shader should output the point size
std::bitset<256> inputAttributSemanticMask; // one set bit for every used semanticId - todo: there are only 128 bit available semantic locations? The MSB has special meaning?
// uniform
bool uniformRegisterAccess; // set to true if cfile (uniform register) is accessed
bool uniformRegisterDynamicAccess; // set to true if cfile (uniform register) is accessed with a dynamic index
uint32 uniformBufferAccessMask; // 1 bit per buffer, set if the uniform buffer is accessed
uint32 uniformBufferDynamicAccessMask; // 1 bit per buffer, set if the uniform buffer is accessed by dynamic index
std::vector<LatteDecompilerCFileAccess> uniformRegisterAccessIndices;
// uniforms
LatteDecompilerBufferAccessTracker uniformRegisterAccessTracker;
LatteDecompilerBufferAccessTracker uniformBufferAccessTracker[LATTE_NUM_MAX_UNIFORM_BUFFERS];
// ssbo
bool hasSSBORead; // shader has instructions that read from SSBO
bool hasSSBOWrite; // shader has instructions that write to SSBO

View file

@ -1591,10 +1591,9 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader
{
if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK)
{
// use full uniform buffers
for (sint32 t = 0; t < shader->uniformBufferListCount; t++)
for(const auto& buf : shader->list_quickBufferList)
{
sint32 i = shader->uniformBufferList[t];
sint32 i = buf.index;
MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0];
uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1;
@ -1603,6 +1602,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader
cemu_assert_unimplemented();
continue;
}
uniformSize = std::min<uint32>(uniformSize, buf.size);
cemu_assert_debug(physicalAddr < 0x50000000);
@ -1621,7 +1621,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader
dynamicOffsetInfo.shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_FRAGMENT].unformBufferOffset[bufferIndex] = physicalAddr - m_importedMemBaseAddress;
break;
default:
cemu_assert_debug(false);
UNREACHABLE;
}
}
}