void BufferCache::CommitAsyncFlushesHigh() {
AccumulateFlushes();
+
if (committed_ranges.empty()) {
return;
}
@@ -869,7 +890,7 @@ void BufferCache
::CommitAsyncFlushesHigh() {
buffer_id,
});
// Align up to avoid cache conflicts
- constexpr u64 align = 256ULL;
+ constexpr u64 align = 8ULL;
constexpr u64 mask = ~(align - 1ULL);
total_size_bytes += (new_size + align - 1) & mask;
largest_copy = std::max(largest_copy, new_size);
@@ -1041,6 +1062,19 @@ void BufferCache
::BindHostVertexBuffers() {
}
}
+template
+void BufferCache::BindHostDrawIndirectBuffers() { // Touches and synchronizes the buffers behind the indirect draw bindings.
+ const auto bind_buffer = [this](const Binding& binding) { // Same steps for the count and the indirect binding.
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ TouchBuffer(buffer, binding.buffer_id);
+ SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
+ };
+ if (current_draw_indirect->include_count) { // NOTE(review): dereferenced without a null check here - confirm callers guarantee it is set.
+ bind_buffer(count_buffer_binding);
+ }
+ bind_buffer(indirect_buffer_binding); // The indirect buffer is always processed.
+}
+
template
void BufferCache::BindHostGraphicsUniformBuffers(size_t stage) {
u32 dirty = ~0U;
@@ -1272,6 +1306,9 @@ void BufferCache
::DoUpdateGraphicsBuffers(bool is_indexed) {
UpdateStorageBuffers(stage);
UpdateTextureBuffers(stage);
}
+ if (current_draw_indirect) {
+ UpdateDrawIndirect();
+ }
} while (has_deleted_buffers);
}
@@ -1289,7 +1326,7 @@ void BufferCache
::UpdateIndexBuffer() {
const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
const auto& index_array = draw_state.index_buffer;
auto& flags = maxwell3d->dirty.flags;
- if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
+ if (!flags[Dirty::IndexBuffer]) {
return;
}
flags[Dirty::IndexBuffer] = false;
@@ -1361,6 +1398,27 @@ void BufferCache
::UpdateVertexBuffer(u32 index) {
};
}
+template
+void BufferCache::UpdateDrawIndirect() { // Rebuilds count_buffer_binding / indirect_buffer_binding from current_draw_indirect.
+ const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) { // Resolves one GPU range into a Binding.
+ const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) { // No CPU address for this GPU address: store the null binding instead.
+ binding = NULL_BINDING;
+ return;
+ }
+ binding = Binding{
+ .cpu_addr = *cpu_addr,
+ .size = static_cast(size),
+ .buffer_id = FindBuffer(*cpu_addr, static_cast(size)),
+ };
+ };
+ if (current_draw_indirect->include_count) { // The count binding is left untouched when the draw has no count.
+ update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding); // Count range is one u32 wide.
+ }
+ update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size,
+ indirect_buffer_binding);
+}
+
template
void BufferCache::UpdateUniformBuffers(size_t stage) {
ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
@@ -1880,14 +1938,21 @@ typename BufferCache
::Binding BufferCache
::StorageBufferBinding(GPUVAddr s
bool is_written) const {
const GPUVAddr gpu_addr = gpu_memory->Read(ssbo_addr);
const u32 size = gpu_memory->Read(ssbo_addr + 8);
- const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+ const u32 alignment = runtime.GetStorageBufferAlignment();
+
+ const GPUVAddr aligned_gpu_addr = Common::AlignDown(gpu_addr, alignment);
+ const u32 aligned_size =
+ Common::AlignUp(static_cast(gpu_addr - aligned_gpu_addr) + size, alignment);
+
+ const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(aligned_gpu_addr);
if (!cpu_addr || size == 0) {
return NULL_BINDING;
}
- const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE);
+
+ const VAddr cpu_end = Common::AlignUp(*cpu_addr + aligned_size, Core::Memory::YUZU_PAGESIZE);
const Binding binding{
.cpu_addr = *cpu_addr,
- .size = is_written ? size : static_cast(cpu_end - *cpu_addr),
+ .size = is_written ? aligned_size : static_cast(cpu_end - *cpu_addr),
.buffer_id = BufferId{},
};
return binding;
@@ -1941,4 +2006,16 @@ bool BufferCache::HasFastUniformBufferBound(size_t stage, u32 binding_index)
}
}
+template
+std::pair::Buffer*, u32> BufferCache::GetDrawIndirectCount() { // Returns the buffer of count_buffer_binding plus an offset into it.
+ auto& buffer = slot_buffers[count_buffer_binding.buffer_id];
+ return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr)); // Offset is derived from the binding's CPU address.
+}
+
+template
+std::pair::Buffer*, u32> BufferCache::GetDrawIndirectBuffer() { // Returns the buffer of indirect_buffer_binding plus an offset into it.
+ auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id];
+ return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr)); // Offset is derived from the binding's CPU address.
+}
+
} // namespace VideoCommon
diff --git a/src/video_core/cache_types.h b/src/video_core/cache_types.h
new file mode 100644
index 0000000..1a5db3c
--- /dev/null
+++ b/src/video_core/cache_types.h
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+enum class CacheType : u32 { // Bit flags, one bit per cache, plus named combinations.
+ None = 0,
+ TextureCache = 1 << 0,
+ QueryCache = 1 << 1,
+ BufferCache = 1 << 2,
+ ShaderCache = 1 << 3,
+ NoTextureCache = QueryCache | BufferCache | ShaderCache, // Every cache except the texture cache.
+ NoBufferCache = TextureCache | QueryCache | ShaderCache, // Every cache except the buffer cache.
+ NoQueryCache = TextureCache | BufferCache | ShaderCache, // Every cache except the query cache.
+ All = TextureCache | QueryCache | BufferCache | ShaderCache, // Union of the four single-cache bits.
+};
+DECLARE_ENUM_FLAG_OPERATORS(CacheType)
+
+} // namespace VideoCommon
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 322de26..5519298 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -61,7 +61,7 @@ bool DmaPusher::Step() {
} else {
const CommandListHeader command_list_header{
command_list.command_lists[dma_pushbuffer_subindex++]};
- const GPUVAddr dma_get = command_list_header.addr;
+ dma_state.dma_get = command_list_header.addr;
if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
// We've gone through the current list, remove it from the queue
@@ -75,12 +75,22 @@ bool DmaPusher::Step() {
// Push buffer non-empty, read a word
command_headers.resize_destructive(command_list_header.size);
- if (Settings::IsGPULevelHigh()) {
- memory_manager.ReadBlock(dma_get, command_headers.data(),
- command_list_header.size * sizeof(u32));
+ constexpr u32 MacroRegistersStart = 0xE00;
+ if (dma_state.method < MacroRegistersStart) {
+ if (Settings::IsGPULevelHigh()) {
+ memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(),
+ command_list_header.size * sizeof(u32));
+ } else {
+ memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(),
+ command_list_header.size * sizeof(u32));
+ }
} else {
- memory_manager.ReadBlockUnsafe(dma_get, command_headers.data(),
- command_list_header.size * sizeof(u32));
+ const size_t copy_size = command_list_header.size * sizeof(u32);
+ if (subchannels[dma_state.subchannel]) {
+ subchannels[dma_state.subchannel]->current_dirty =
+ memory_manager.IsMemoryDirty(dma_state.dma_get, copy_size);
+ }
+ memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), copy_size);
}
ProcessCommands(command_headers);
}
@@ -94,6 +104,7 @@ void DmaPusher::ProcessCommands(std::span commands) {
if (dma_state.method_count) {
// Data word of methods command
+ dma_state.dma_word_offset = static_cast(index * sizeof(u32));
if (dma_state.non_incrementing) {
const u32 max_write = static_cast(
std::min(index + dma_state.method_count, commands.size()) - index);
@@ -132,6 +143,8 @@ void DmaPusher::ProcessCommands(std::span commands) {
case SubmissionMode::Inline:
dma_state.method = command_header.method;
dma_state.subchannel = command_header.subchannel;
+ dma_state.dma_word_offset = static_cast(
+ -static_cast(dma_state.dma_get)); // negate to set address as 0
CallMethod(command_header.arg_count);
dma_state.non_incrementing = true;
dma_increment_once = false;
@@ -164,8 +177,14 @@ void DmaPusher::CallMethod(u32 argument) const {
dma_state.method_count,
});
} else {
- subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
- dma_state.is_last_call);
+ auto subchannel = subchannels[dma_state.subchannel];
+ if (!subchannel->execution_mask[dma_state.method]) [[likely]] {
+ subchannel->method_sink.emplace_back(dma_state.method, argument);
+ return;
+ }
+ subchannel->ConsumeSink();
+ subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
+ subchannel->CallMethod(dma_state.method, argument, dma_state.is_last_call);
}
}
@@ -174,8 +193,11 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
dma_state.method_count);
} else {
- subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start,
- num_methods, dma_state.method_count);
+ auto subchannel = subchannels[dma_state.subchannel];
+ subchannel->ConsumeSink();
+ subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
+ subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
+ dma_state.method_count);
}
}
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 6f00de9..1cdb690 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -156,6 +156,8 @@ private:
u32 subchannel; ///< Current subchannel
u32 method_count; ///< Current method count
u32 length_pending; ///< Large NI command length pending
+ GPUVAddr dma_get; ///< Currently read segment
+ u64 dma_word_offset; ///< Current word offset from address
bool non_incrementing; ///< Current command's NI flag
bool is_last_call;
};
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index 3a78421..2437121 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -91,6 +91,23 @@ void DrawManager::DrawIndex(PrimitiveTopology topology, u32 index_first, u32 ind
ProcessDraw(true, num_instances);
}
+void DrawManager::DrawArrayIndirect(PrimitiveTopology topology) { // Records the topology and issues an indirect draw.
+ draw_state.topology = topology; // Index buffer state is not modified by this entry point.
+
+ ProcessDrawIndirect();
+}
+
+void DrawManager::DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, // Indirect draw that also refreshes index buffer state.
+ u32 index_count) {
+ const auto& regs{maxwell3d->regs};
+ draw_state.topology = topology;
+ draw_state.index_buffer = regs.index_buffer; // Copy the register state first...
+ draw_state.index_buffer.first = index_first; // ...then override first/count with the caller's values.
+ draw_state.index_buffer.count = index_count;
+
+ ProcessDrawIndirect();
+}
+
void DrawManager::SetInlineIndexBuffer(u32 index) {
draw_state.inline_index_draw_indexes.push_back(static_cast(index & 0x000000ff));
draw_state.inline_index_draw_indexes.push_back(static_cast((index & 0x0000ff00) >> 8));
@@ -198,4 +215,18 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) {
maxwell3d->rasterizer->Draw(draw_indexed, instance_count);
}
}
+
+void DrawManager::ProcessDrawIndirect() { // Logs the indirect parameters, updates topology and forwards the draw to the rasterizer.
+ LOG_TRACE(
+ HW_GPU,
+ "called, topology={}, is_indexed={}, includes_count={}, buffer_size={}, max_draw_count={}",
+ draw_state.topology, indirect_state.is_indexed, indirect_state.include_count,
+ indirect_state.buffer_size, indirect_state.max_draw_counts);
+
+ UpdateTopology();
+
+ if (maxwell3d->ShouldExecute()) { // The rasterizer is only invoked when ShouldExecute() returns true.
+ maxwell3d->rasterizer->DrawIndirect();
+ }
+}
} // namespace Tegra::Engines
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h
index 0e6930a..58d1b2d 100644
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -32,6 +32,16 @@ public:
std::vector inline_index_draw_indexes;
};
+ struct IndirectParams { // State describing an indirect draw; exposed through GetIndirectParams().
+ bool is_indexed;
+ bool include_count; // When false, BufferCache skips the count buffer binding.
+ GPUVAddr count_start_address; // GPU address BufferCache resolves as a sizeof(u32) range for the count.
+ GPUVAddr indirect_start_address; // GPU address BufferCache resolves for the indirect buffer.
+ size_t buffer_size; // Size passed with indirect_start_address when resolving the indirect buffer.
+ size_t max_draw_counts;
+ size_t stride;
+ };
+
explicit DrawManager(Maxwell3D* maxwell_3d);
void ProcessMethodCall(u32 method, u32 argument);
@@ -46,10 +56,22 @@ public:
void DrawIndex(PrimitiveTopology topology, u32 index_first, u32 index_count, u32 base_index,
u32 base_instance, u32 num_instances);
+ void DrawArrayIndirect(PrimitiveTopology topology);
+
+ void DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, u32 index_count);
+
const State& GetDrawState() const {
return draw_state;
}
+ IndirectParams& GetIndirectParams() {
+ return indirect_state;
+ }
+
+ const IndirectParams& GetIndirectParams() const {
+ return indirect_state;
+ }
+
private:
void SetInlineIndexBuffer(u32 index);
@@ -63,7 +85,10 @@ private:
void ProcessDraw(bool draw_indexed, u32 instance_count);
+ void ProcessDrawIndirect();
+
Maxwell3D* maxwell3d{};
State draw_state{};
+ IndirectParams indirect_state{};
};
} // namespace Tegra::Engines
diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h
index 26cde85..3923223 100644
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -3,6 +3,10 @@
#pragma once
+#include
+#include
+#include
+
#include "common/common_types.h"
namespace Tegra::Engines {
@@ -17,6 +21,26 @@ public:
/// Write multiple values to the register identified by method.
virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
u32 methods_pending) = 0;
+
+ void ConsumeSink() { // Flushes queued method writes through ConsumeSinkImpl(); no-op when the queue is empty.
+ if (method_sink.empty()) {
+ return;
+ }
+ ConsumeSinkImpl();
+ }
+
+ std::bitset::max()> execution_mask{}; // Per-method flag; DmaPusher::CallMethod queues methods whose bit is clear.
+ std::vector> method_sink{}; // Queued (method, value) writes awaiting ConsumeSink().
+ bool current_dirty{};
+ GPUVAddr current_dma_segment; // NOTE(review): no initializer, unlike the members above - confirm it is always written before being read.
+
+protected:
+ virtual void ConsumeSinkImpl() { // Default: replays each queued write via CallMethod(), then empties the queue.
+ for (auto [method, value] : method_sink) {
+ CallMethod(method, value, true); // Every replayed write is passed is_last_call = true.
+ }
+ method_sink.clear();
+ }
};
} // namespace Tegra::Engines
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index cea1dd8..7f5a0c2 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -76,7 +76,7 @@ void State::ProcessData(std::span read_buffer) {
regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
x_elements, regs.line_count, regs.dest.BlockHeight(),
regs.dest.BlockDepth(), regs.line_length_in);
- memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+ memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
}
}
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index c6478ae..a126c35 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,6 +6,7 @@
#include "common/microprofile.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
@@ -20,11 +21,14 @@ namespace Tegra::Engines {
using namespace Texture;
-Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
- sw_blitter = std::make_unique(memory_manager_);
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
+ sw_blitter = std::make_unique(memory_manager);
// Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1;
regs.dst.depth = 1;
+
+ execution_mask.reset();
+ execution_mask[FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1] = true;
}
Fermi2D::~Fermi2D() = default;
@@ -49,6 +53,13 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
}
}
+void Fermi2D::ConsumeSinkImpl() { // Applies queued writes directly to the register array instead of going through CallMethod().
+ for (auto [method, value] : method_sink) {
+ regs.reg_array[method] = value; // NOTE(review): method is used as an index without a bounds check - confirm queued methods are in range.
+ }
+ method_sink.clear();
+}
+
void Fermi2D::Blit() {
MICROPROFILE_SCOPE(GPU_BlitEngine);
LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
@@ -94,6 +105,7 @@ void Fermi2D::Blit() {
config.src_x0 = 0;
}
+ memory_manager.FlushCaching();
if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
sw_blitter->Blit(src, regs.dst, config);
}
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 100b21b..705b323 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -305,10 +305,13 @@ public:
private:
VideoCore::RasterizerInterface* rasterizer = nullptr;
std::unique_ptr sw_blitter;
+ MemoryManager& memory_manager;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.
void Blit();
+
+ void ConsumeSinkImpl() override;
};
#define ASSERT_REG_POSITION(field_name, position) \
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index e5c6221..601095f 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -14,7 +14,12 @@
namespace Tegra::Engines {
KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
- : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {}
+ : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {
+ execution_mask.reset();
+ execution_mask[KEPLER_COMPUTE_REG_INDEX(exec_upload)] = true;
+ execution_mask[KEPLER_COMPUTE_REG_INDEX(data_upload)] = true;
+ execution_mask[KEPLER_COMPUTE_REG_INDEX(launch)] = true;
+}
KeplerCompute::~KeplerCompute() = default;
@@ -23,6 +28,13 @@ void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_)
upload_state.BindRasterizer(rasterizer);
}
+void KeplerCompute::ConsumeSinkImpl() { // Applies queued writes directly to the register array instead of going through CallMethod().
+ for (auto [method, value] : method_sink) {
+ regs.reg_array[method] = value; // NOTE(review): unlike CallMethod(), no check of method against Regs::NUM_REGS here - confirm.
+ }
+ method_sink.clear();
+}
+
void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid KeplerCompute register, increase the size of the Regs structure");
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index e154e3f..2092e68 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -204,6 +204,8 @@ public:
private:
void ProcessLaunch();
+ void ConsumeSinkImpl() override;
+
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 08045d1..c026801 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -18,6 +18,17 @@ KeplerMemory::~KeplerMemory() = default;
void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
upload_state.BindRasterizer(rasterizer_);
+
+ execution_mask.reset();
+ execution_mask[KEPLERMEMORY_REG_INDEX(exec)] = true;
+ execution_mask[KEPLERMEMORY_REG_INDEX(data)] = true;
+}
+
+void KeplerMemory::ConsumeSinkImpl() { // Applies queued writes directly to the register array, then empties the queue.
+ for (auto [method, value] : method_sink) {
+ regs.reg_array[method] = value; // NOTE(review): method is used as an index without a bounds check - confirm queued methods are in range.
+ }
+ method_sink.clear();
}
void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 5fe7489..fb1eecb 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -73,6 +73,8 @@ public:
} regs{};
private:
+ void ConsumeSinkImpl() override;
+
Core::System& system;
Upload::State upload_state;
};
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 9b182b6..97f5477 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,8 @@
#include
#include
#include "common/assert.h"
+#include "common/scope_exit.h"
+#include "common/settings.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "video_core/dirty_flags.h"
@@ -28,6 +30,10 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
regs.upload} {
dirty.flags.flip();
InitializeRegisterDefaults();
+ execution_mask.reset();
+ for (size_t i = 0; i < execution_mask.size(); i++) {
+ execution_mask[i] = IsMethodExecutable(static_cast(i));
+ }
}
Maxwell3D::~Maxwell3D() = default;
@@ -121,6 +127,71 @@ void Maxwell3D::InitializeRegisterDefaults() {
shadow_state = regs;
}
+bool Maxwell3D::IsMethodExecutable(u32 method) { // Decides the execution_mask bit for a method (the constructor calls this for every index).
+ if (method >= MacroRegistersStart) { // Everything at or above the macro register range is executable.
+ return true;
+ }
+ switch (method) { // Below the macro range only the registers listed here are executable.
+ case MAXWELL3D_REG_INDEX(draw.end):
+ case MAXWELL3D_REG_INDEX(draw.begin):
+ case MAXWELL3D_REG_INDEX(vertex_buffer.first):
+ case MAXWELL3D_REG_INDEX(vertex_buffer.count):
+ case MAXWELL3D_REG_INDEX(index_buffer.first):
+ case MAXWELL3D_REG_INDEX(index_buffer.count):
+ case MAXWELL3D_REG_INDEX(draw_inline_index):
+ case MAXWELL3D_REG_INDEX(index_buffer32_subsequent):
+ case MAXWELL3D_REG_INDEX(index_buffer16_subsequent):
+ case MAXWELL3D_REG_INDEX(index_buffer8_subsequent):
+ case MAXWELL3D_REG_INDEX(index_buffer32_first):
+ case MAXWELL3D_REG_INDEX(index_buffer16_first):
+ case MAXWELL3D_REG_INDEX(index_buffer8_first):
+ case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
+ case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
+ case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
+ case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent):
+ case MAXWELL3D_REG_INDEX(wait_for_idle):
+ case MAXWELL3D_REG_INDEX(shadow_ram_control):
+ case MAXWELL3D_REG_INDEX(load_mme.instruction_ptr):
+ case MAXWELL3D_REG_INDEX(load_mme.instruction):
+ case MAXWELL3D_REG_INDEX(load_mme.start_address):
+ case MAXWELL3D_REG_INDEX(falcon[4]):
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer): // The next fifteen cases cover the registers following const_buffer.buffer.
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 1:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 2:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 3:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 4:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 5:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 6:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 7:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 8:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 9:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 10:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 11:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 12:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 13:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 14:
+ case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 15:
+ case MAXWELL3D_REG_INDEX(bind_groups[0].raw_config):
+ case MAXWELL3D_REG_INDEX(bind_groups[1].raw_config):
+ case MAXWELL3D_REG_INDEX(bind_groups[2].raw_config):
+ case MAXWELL3D_REG_INDEX(bind_groups[3].raw_config):
+ case MAXWELL3D_REG_INDEX(bind_groups[4].raw_config):
+ case MAXWELL3D_REG_INDEX(topology_override):
+ case MAXWELL3D_REG_INDEX(clear_surface):
+ case MAXWELL3D_REG_INDEX(report_semaphore.query):
+ case MAXWELL3D_REG_INDEX(render_enable.mode):
+ case MAXWELL3D_REG_INDEX(clear_report_value):
+ case MAXWELL3D_REG_INDEX(sync_info):
+ case MAXWELL3D_REG_INDEX(launch_dma):
+ case MAXWELL3D_REG_INDEX(inline_data):
+ case MAXWELL3D_REG_INDEX(fragment_barrier):
+ case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+ return true;
+ default: // Any other register is a plain write that DmaPusher queues in method_sink.
+ return false;
+ }
+}
+
void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
if (executing_macro == 0) {
// A macro call must begin by writing the macro method's register, not its argument.
@@ -130,14 +201,72 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
}
macro_params.insert(macro_params.end(), base_start, base_start + amount);
+ for (size_t i = 0; i < amount; i++) {
+ macro_addresses.push_back(current_dma_segment + i * sizeof(u32));
+ }
+ macro_segments.emplace_back(current_dma_segment, amount);
+ current_macro_dirty |= current_dirty;
+ current_dirty = false;
// Call the macro when there are no more parameters in the command buffer
if (is_last_call) {
+ ConsumeSink();
CallMacroMethod(executing_macro, macro_params);
macro_params.clear();
+ macro_addresses.clear();
+ macro_segments.clear();
+ current_macro_dirty = false;
}
}
+void Maxwell3D::RefreshParametersImpl() { // Re-reads the recorded macro segments from GPU memory into macro_params.
+ size_t current_index = 0; // Position in macro_params of the current segment's first word.
+ for (auto& segment : macro_segments) { // Each segment is an (address, word count) pair recorded by ProcessMacro().
+ if (segment.first == 0) { // Segments recorded at address 0 are not re-read; only their words are skipped over.
+ current_index += segment.second;
+ continue;
+ }
+ memory_manager.ReadBlock(segment.first, &macro_params[current_index],
+ sizeof(u32) * segment.second);
+ current_index += segment.second;
+ }
+}
+
+u32 Maxwell3D::GetMaxCurrentVertices() { // Returns the largest vertex count computed across the enabled vertex arrays.
+ u32 num_vertices = 0;
+ for (size_t index = 0; index < Regs::NumVertexArrays; ++index) {
+ const auto& array = regs.vertex_streams[index];
+ if (array.enable == 0) { // Disabled arrays do not contribute.
+ continue;
+ }
+ const auto& attribute = regs.vertex_attrib_format[index]; // NOTE(review): attribute format is looked up with the stream index - confirm this pairing is intended.
+ if (attribute.constant) { // A constant attribute counts as one vertex.
+ num_vertices = std::max(num_vertices, 1U);
+ continue;
+ }
+ const auto& limit = regs.vertex_stream_limits[index];
+ const GPUVAddr gpu_addr_begin = array.Address();
+ const GPUVAddr gpu_addr_end = limit.Address() + 1; // +1 makes the end exclusive, so the limit address itself is included.
+ const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin);
+ num_vertices = std::max(
+ num_vertices, address_size / std::max(attribute.SizeInBytes(), array.stride.Value())); // NOTE(review): divides by zero if both size and stride are 0 - confirm that cannot happen.
+ }
+ return num_vertices;
+}
+
+size_t Maxwell3D::EstimateIndexBufferSize() { // Returns the smaller of a memory-layout based estimate and the start/end address distance.
+ GPUVAddr start_address = regs.index_buffer.StartAddress();
+ GPUVAddr end_address = regs.index_buffer.EndAddress();
+ constexpr std::array max_sizes = { // Four entries, so valid indices are 0..3.
+ std::numeric_limits::max(), std::numeric_limits::max(),
+ std::numeric_limits::max(), std::numeric_limits::max()};
+ const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
+ return std::min(
+ memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[byte_size]) / // NOTE(review): table is indexed by byte_size itself; if FormatSizeInBytes() can return 4 this reads past the array - confirm.
+ byte_size,
+ static_cast(end_address - start_address)); // NOTE(review): first operand is divided by byte_size, this one is not - confirm the units match.
+}
+
u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
// Keep track of the register value in shadow_state when requested.
const auto control = shadow_state.shadow_ram_control;
@@ -152,6 +281,29 @@ u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
return argument;
}
+void Maxwell3D::ConsumeSinkImpl() { // Applies queued writes via ProcessDirtyRegisters(), honouring the shadow RAM control mode.
+ SCOPE_EXIT({ method_sink.clear(); }); // Presumably runs on every exit path below, so the queue is always emptied - confirm SCOPE_EXIT semantics.
+ const auto control = shadow_state.shadow_ram_control;
+ if (control == Regs::ShadowRamControl::Track ||
+ control == Regs::ShadowRamControl::TrackWithFilter) { // Tracking modes: mirror each value into shadow_state before applying it.
+
+ for (auto [method, value] : method_sink) {
+ shadow_state.reg_array[method] = value;
+ ProcessDirtyRegisters(method, value);
+ }
+ return;
+ }
+ if (control == Regs::ShadowRamControl::Replay) { // Replay: the queued value is ignored and the shadowed value is applied instead.
+ for (auto [method, value] : method_sink) {
+ ProcessDirtyRegisters(method, shadow_state.reg_array[method]);
+ }
+ return;
+ }
+ for (auto [method, value] : method_sink) { // Any other mode: apply the queued values as they are.
+ ProcessDirtyRegisters(method, value);
+ }
+}
+
void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
if (regs.reg_array[method] == argument) {
return;
@@ -263,7 +415,6 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
-
ProcessMethodCall(method, argument, method_argument, is_last_call);
}
@@ -294,9 +445,11 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 15:
ProcessCBMultiData(base_start, amount);
break;
- case MAXWELL3D_REG_INDEX(inline_data):
+ case MAXWELL3D_REG_INDEX(inline_data): {
+ ASSERT(methods_pending == amount);
upload_state.ProcessData(base_start, amount);
return;
+ }
default:
for (u32 i = 0; i < amount; i++) {
CallMethod(method, base_start[i], methods_pending - i <= 1);
@@ -332,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
}
void Maxwell3D::ProcessQueryGet() {
- // TODO(Subv): Support the other query units.
- if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
- LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
- }
-
switch (regs.report_semaphore.query.operation) {
case Regs::ReportSemaphore::Operation::Release:
if (regs.report_semaphore.query.short_query != 0) {
@@ -389,7 +537,11 @@ void Maxwell3D::ProcessQueryCondition() {
case Regs::RenderEnable::Override::NeverRender:
execute_on = false;
break;
- case Regs::RenderEnable::Override::UseRenderEnable:
+ case Regs::RenderEnable::Override::UseRenderEnable: {
+ if (rasterizer->AccelerateConditionalRendering()) {
+ execute_on = true;
+ return;
+ }
switch (regs.render_enable.mode) {
case Regs::RenderEnable::Mode::True: {
execute_on = true;
@@ -427,6 +579,7 @@ void Maxwell3D::ProcessQueryCondition() {
}
break;
}
+ }
}
void Maxwell3D::ProcessCounterReset() {
@@ -463,7 +616,8 @@ std::optional Maxwell3D::GetQueryResult() {
}
void Maxwell3D::ProcessCBBind(size_t stage_index) {
- // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
+ // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader
+ // stage.
const auto& bind_data = regs.bind_groups[stage_index];
auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.shader_slot];
buffer.enabled = bind_data.valid.Value() != 0;
@@ -490,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {
const GPUVAddr address{buffer_address + regs.const_buffer.offset};
const size_t copy_size = amount * sizeof(u32);
- memory_manager.WriteBlock(address, start_base, copy_size);
+ memory_manager.WriteBlockCached(address, start_base, copy_size);
// Increment the current buffer position.
regs.const_buffer.offset += static_cast(copy_size);
@@ -524,4 +678,10 @@ u32 Maxwell3D::GetRegisterValue(u32 method) const {
return regs.reg_array[method];
}
+void Maxwell3D::SetHLEReplacementAttributeType(u32 bank, u32 offset, // Registers a replacement type for a (bank, offset) pair.
+ HLEReplacementAttributeType name) {
+ const u64 key = (static_cast(bank) << 32) | offset; // bank in the upper 32 bits, offset in the lower 32 bits.
+ replace_table.emplace(key, name); // emplace keeps an existing entry for the same key rather than overwriting it.
+}
+
} // namespace Tegra::Engines
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 22b9043..0b2fd29 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -272,6 +272,7 @@ public:
};
union {
+ u32 raw;
BitField<0, 1, Mode> mode;
BitField<4, 8, u32> pad;
};
@@ -1217,10 +1218,12 @@ public:
struct Window {
union {
+ u32 raw_x;
BitField<0, 16, u32> x_min;
BitField<16, 16, u32> x_max;
};
union {
+ u32 raw_y;
BitField<0, 16, u32> y_min;
BitField<16, 16, u32> y_max;
};
@@ -2708,7 +2711,7 @@ public:
u32 post_z_pixel_imask; ///< 0x0F1C
INSERT_PADDING_BYTES_NOINIT(0x20);
ConstantColorRendering const_color_rendering; ///< 0x0F40
- s32 stencil_back_ref; ///< 0x0F54
+ u32 stencil_back_ref; ///< 0x0F54
u32 stencil_back_mask; ///< 0x0F58
u32 stencil_back_func_mask; ///< 0x0F5C
INSERT_PADDING_BYTES_NOINIT(0x14);
@@ -2832,9 +2835,9 @@ public:
Blend blend; ///< 0x133C
u32 stencil_enable; ///< 0x1380
StencilOp stencil_front_op; ///< 0x1384
- s32 stencil_front_ref; ///< 0x1394
- s32 stencil_front_func_mask; ///< 0x1398
- s32 stencil_front_mask; ///< 0x139C
+ u32 stencil_front_ref; ///< 0x1394
+ u32 stencil_front_func_mask; ///< 0x1398
+ u32 stencil_front_mask; ///< 0x139C
INSERT_PADDING_BYTES_NOINIT(0x4);
u32 draw_auto_start_byte_count; ///< 0x13A4
PsSaturate frag_color_clamp; ///< 0x13A8
@@ -3020,6 +3023,24 @@ public:
/// Store temporary hw register values, used by some calls to restore state after a operation
Regs shadow_state;
+ // None Engine
+ enum class EngineHint : u32 {
+ None = 0x0,
+ OnHLEMacro = 0x1,
+ };
+
+ EngineHint engine_state{EngineHint::None};
+
+ enum class HLEReplacementAttributeType : u32 {
+ BaseVertex = 0x0,
+ BaseInstance = 0x1,
+ DrawID = 0x2,
+ };
+
+ void SetHLEReplacementAttributeType(u32 bank, u32 offset, HLEReplacementAttributeType name);
+
+ std::unordered_map replace_table;
+
static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
static_assert(std::is_trivially_copyable_v, "Maxwell3D Regs must be trivially copyable");
@@ -3067,6 +3088,35 @@ public:
std::unique_ptr draw_manager;
friend class DrawManager;
+ GPUVAddr GetMacroAddress(size_t index) const { // Returns the address recorded for macro parameter `index`.
+ return macro_addresses[index]; // Unchecked access: index must be within macro_addresses.
+ }
+
+ void RefreshParameters() { // Runs RefreshParametersImpl() only when current_macro_dirty is set.
+ if (!current_macro_dirty) {
+ return;
+ }
+ RefreshParametersImpl();
+ }
+
+ bool AnyParametersDirty() const { // Exposes the current_macro_dirty flag accumulated by ProcessMacro().
+ return current_macro_dirty;
+ }
+
+ u32 GetMaxCurrentVertices();
+
+ size_t EstimateIndexBufferSize();
+
+ /// Handles a write to the CLEAR_BUFFERS register.
+ void ProcessClearBuffers(u32 layer_count);
+
+ /// Handles a write to the CB_BIND register.
+ void ProcessCBBind(size_t stage_index);
+
+ /// Handles a write to the CB_DATA[i] register.
+ void ProcessCBData(u32 value);
+ void ProcessCBMultiData(const u32* start_base, u32 amount);
+
private:
void InitializeRegisterDefaults();
@@ -3076,6 +3126,8 @@ private:
void ProcessDirtyRegisters(u32 method, u32 argument);
+ void ConsumeSinkImpl() override;
+
void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);
/// Retrieves information about a specific TIC entry from the TIC buffer.
@@ -3116,16 +3168,13 @@ private:
/// Handles writes to syncing register.
void ProcessSyncPoint();
- /// Handles a write to the CB_DATA[i] register.
- void ProcessCBData(u32 value);
- void ProcessCBMultiData(const u32* start_base, u32 amount);
-
- /// Handles a write to the CB_BIND register.
- void ProcessCBBind(size_t stage_index);
-
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional GetQueryResult();
+ void RefreshParametersImpl();
+
+ bool IsMethodExecutable(u32 method);
+
Core::System& system;
MemoryManager& memory_manager;
@@ -3145,6 +3194,10 @@ private:
Upload::State upload_state;
bool execute_on{true};
+
+ std::vector> macro_segments;
+ std::vector macro_addresses;
+ bool current_macro_dirty{};
};
#define ASSERT_REG_POSITION(field_name, position) \
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index f73d7bf..7762c7d 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -21,7 +21,10 @@ namespace Tegra::Engines {
using namespace Texture;
MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
- : system{system_}, memory_manager{memory_manager_} {}
+ : system{system_}, memory_manager{memory_manager_} {
+ execution_mask.reset(); // NOTE(review): presumably clears every entry (bitset-style) - confirm type.
+ execution_mask[offsetof(Regs, launch_dma) / sizeof(u32)] = true; // Flag launch_dma's word index.
+}
MaxwellDMA::~MaxwellDMA() = default;
@@ -29,6 +32,13 @@ void MaxwellDMA::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
rasterizer = rasterizer_;
}
+void MaxwellDMA::ConsumeSinkImpl() { // Writes every queued (method, value) pair into regs.reg_array.
+ for (auto [method, value] : method_sink) {
+ regs.reg_array[method] = value; // NOTE(review): method is not range-checked in this function.
+ }
+ method_sink.clear(); // The sink is empty once its entries have been applied.
+}
+
void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
@@ -59,7 +69,7 @@ void MaxwellDMA::Launch() {
if (launch.multi_line_enable) {
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+ memory_manager.FlushCaching();
if (!is_src_pitch && !is_dst_pitch) {
// If both the source and the destination are in block layout, assert.
CopyBlockLinearToBlockLinear();
@@ -94,6 +104,7 @@ void MaxwellDMA::Launch() {
reinterpret_cast(tmp_buffer.data()),
regs.line_length_in * sizeof(u32));
} else {
+ memory_manager.FlushCaching();
const auto convert_linear_2_blocklinear_addr = [](u64 address) {
return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
((address & 0x180) >> 1) | ((address & 0x20) << 3);
@@ -111,8 +122,8 @@ void MaxwellDMA::Launch() {
memory_manager.ReadBlockUnsafe(
convert_linear_2_blocklinear_addr(regs.offset_in + offset),
tmp_buffer.data(), tmp_buffer.size());
- memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
- tmp_buffer.size());
+ memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
+ tmp_buffer.size());
}
} else if (is_src_pitch && !is_dst_pitch) {
UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@@ -122,7 +133,7 @@ void MaxwellDMA::Launch() {
for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
tmp_buffer.size());
- memory_manager.WriteBlock(
+ memory_manager.WriteBlockCached(
convert_linear_2_blocklinear_addr(regs.offset_out + offset),
tmp_buffer.data(), tmp_buffer.size());
}
@@ -131,8 +142,8 @@ void MaxwellDMA::Launch() {
std::vector tmp_buffer(regs.line_length_in);
memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
regs.line_length_in);
- memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
- regs.line_length_in);
+ memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
+ regs.line_length_in);
}
}
}
@@ -194,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_out);
- memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+ memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -246,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
regs.pitch_in);
- memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+ memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::FastCopyBlockLinearToPitch() {
@@ -277,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
regs.src_params.block_size.height, regs.src_params.block_size.depth,
regs.pitch_out);
- memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+ memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -337,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
dst.block_size.height, dst.block_size.depth, pitch);
- memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+ memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::ReleaseSemaphore() {
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index c88191a..0e594fa 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -231,6 +231,8 @@ private:
void ReleaseSemaphore();
+ void ConsumeSinkImpl() override;
+
Core::System& system;
MemoryManager& memory_manager;
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index e6dc24f..f275b2a 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -47,6 +47,7 @@ set(SHADER_FILES
vulkan_present_scaleforce_fp16.frag
vulkan_present_scaleforce_fp32.frag
vulkan_quad_indexed.comp
+ vulkan_turbo_mode.comp
vulkan_uint8.comp
)
diff --git a/src/video_core/host_shaders/vulkan_turbo_mode.comp b/src/video_core/host_shaders/vulkan_turbo_mode.comp
new file mode 100644
index 0000000..d651001
--- /dev/null
+++ b/src/video_core/host_shaders/vulkan_turbo_mode.comp
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 460 core
+
+layout (local_size_x = 16, local_size_y = 8, local_size_z = 1) in;
+
+layout (binding = 0) buffer ThreadData {
+ uint data[];
+};
+
+uint xorshift32(uint x) { // One xorshift round with shifts 13, 17 and 5; an input of 0 stays 0.
+ x ^= x << 13;
+ x ^= x >> 17;
+ x ^= x << 5;
+ return x;
+}
+
+uint getGlobalIndex() { // Row-major linear invocation index; the row pitch is the dispatch width in x.
+ return gl_GlobalInvocationID.x + gl_GlobalInvocationID.y * gl_WorkGroupSize.x * gl_NumWorkGroups.x;
+}
+
+void main() { // Reads one hashed slot of data[] and adds that value plus one into another hashed slot.
+ uint myIndex = xorshift32(getGlobalIndex());
+ uint otherIndex = xorshift32(myIndex);
+
+ uint otherValue = atomicAdd(data[otherIndex % data.length()], 0) + 1; // Adding 0 is an atomic read.
+ atomicAdd(data[myIndex % data.length()], otherValue);
+}
diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h
new file mode 100644
index 0000000..2c2aaf7
--- /dev/null
+++ b/src/video_core/invalidation_accumulator.h
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+class InvalidationAccumulator { // Collects address ranges, merging back-to-back ones.
+public:
+    InvalidationAccumulator() = default;
+    ~InvalidationAccumulator() = default;
+    /// Adds [address, address + size), rounded out to atomicity_size granules, to the pending runs.
+    void Add(GPUVAddr address, size_t size) {
+        const auto reset_values = [&]() { // Archive the open run, then restart at this range.
+            if (has_collected) {
+                buffer.emplace_back(start_address, accumulated_size);
+            }
+            start_address = address;
+            accumulated_size = size;
+            last_collection = start_address + size;
+        };
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return; // Already inside the open run.
+        }
+        const GPUVAddr aligned_end = (address + size + atomicity_size_mask) & atomicity_mask;
+        address &= atomicity_mask;
+        size = aligned_end - address; // Measured from the aligned start so the tail stays covered.
+        if (!has_collected) [[unlikely]] {
+            reset_values();
+            has_collected = true;
+            return;
+        }
+        if (address != last_collection) [[unlikely]] { // Not back-to-back: start a new run.
+            reset_values();
+            return;
+        }
+        accumulated_size += size;
+        last_collection += size;
+    }
+    /// Drops all archived runs and the open run.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        has_collected = false;
+    }
+    /// True while an open run exists, i.e. after a recorded Add() and before the next Clear().
+    bool AnyAccumulated() const {
+        return has_collected;
+    }
+    /// Calls func(address, size) per run; the open run is archived first, so Clear() before reuse.
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (!has_collected) {
+            return;
+        }
+        buffer.emplace_back(start_address, accumulated_size);
+        for (auto& [address, size] : buffer) {
+            func(address, size);
+        }
+    }
+private:
+    static constexpr size_t atomicity_bits = 5; // Ranges are tracked in 32-byte granules.
+    static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
+    static constexpr size_t atomicity_size_mask = atomicity_size - 1;
+    static constexpr size_t atomicity_mask = ~atomicity_size_mask;
+    GPUVAddr start_address{};    // Start of the open run.
+    GPUVAddr last_collection{};  // One past the end of the open run.
+    size_t accumulated_size{};   // Byte length of the open run.
+    bool has_collected{};        // Whether an open run exists.
+    std::vector<std::pair<GPUVAddr, size_t>> buffer; // Archived (start, size) runs.
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index 505d81c..82ad047 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -12,7 +12,9 @@
#include "common/assert.h"
#include "common/fs/fs.h"
#include "common/fs/path_util.h"
+#include "common/microprofile.h"
#include "common/settings.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/macro/macro_interpreter.h"
@@ -21,6 +23,8 @@
#include "video_core/macro/macro_jit_x64.h"
#endif
+MICROPROFILE_DEFINE(MacroHLE, "GPU", "Execute macro HLE", MP_RGB(128, 192, 192));
+
namespace Tegra {
static void Dump(u64 hash, std::span code) {
@@ -40,8 +44,8 @@ static void Dump(u64 hash, std::span code) {
macro_file.write(reinterpret_cast(code.data()), code.size_bytes());
}
-MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
- : hle_macros{std::make_unique(maxwell3d)} {}
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_)
+ : hle_macros{std::make_unique(maxwell3d_)}, maxwell3d{maxwell3d_} {} // maxwell3d backs Execute()'s RefreshParameters() calls.
MacroEngine::~MacroEngine() = default;
@@ -59,8 +63,10 @@ void MacroEngine::Execute(u32 method, const std::vector& parameters) {
if (compiled_macro != macro_cache.end()) {
const auto& cache_info = compiled_macro->second;
if (cache_info.has_hle_program) {
+ MICROPROFILE_SCOPE(MacroHLE);
cache_info.hle_program->Execute(parameters, method);
} else {
+ maxwell3d.RefreshParameters();
cache_info.lle_program->Execute(parameters, method);
}
} else {
@@ -101,12 +107,15 @@ void MacroEngine::Execute(u32 method, const std::vector& parameters) {
}
}
- if (auto hle_program = hle_macros->GetHLEProgram(cache_info.hash)) {
+ auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+ if (!hle_program || Settings::values.disable_macro_hle) {
+ maxwell3d.RefreshParameters();
+ cache_info.lle_program->Execute(parameters, method);
+ } else {
cache_info.has_hle_program = true;
cache_info.hle_program = std::move(hle_program);
+ MICROPROFILE_SCOPE(MacroHLE);
cache_info.hle_program->Execute(parameters, method);
- } else {
- cache_info.lle_program->Execute(parameters, method);
}
}
}
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index 07d97ba..737ced9 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -137,6 +137,7 @@ private:
std::unordered_map macro_cache;
std::unordered_map> uploaded_macro_code;
std::unique_ptr hle_macros;
+ Engines::Maxwell3D& maxwell3d;
};
std::unique_ptr GetMacroEngine(Engines::Maxwell3D& maxwell3d);
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 8549db2..6272a46 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -1,143 +1,551 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
#include
#include
+#include "common/assert.h"
#include "common/scope_exit.h"
#include "video_core/dirty_flags.h"
#include "video_core/engines/draw_manager.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
+#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra {
+
+using Maxwell3D = Engines::Maxwell3D;
+
namespace {
-using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector& parameters);
-
-// HLE'd functions
-void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) {
- const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
- maxwell3d.draw_manager->DrawIndex(
- static_cast(parameters[0] & 0x3ffffff),
- parameters[4], parameters[1], parameters[3], parameters[5], instance_count);
-}
-
-void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) {
- const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
- maxwell3d.draw_manager->DrawArray(
- static_cast(parameters[0]),
- parameters[3], parameters[1], parameters[4], instance_count);
-}
-
-void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) {
- const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
- const u32 element_base = parameters[4];
- const u32 base_instance = parameters[5];
- maxwell3d.regs.vertex_id_base = element_base;
- maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
- maxwell3d.CallMethod(0x8e3, 0x640, true);
- maxwell3d.CallMethod(0x8e4, element_base, true);
- maxwell3d.CallMethod(0x8e5, base_instance, true);
-
- maxwell3d.draw_manager->DrawIndex(
- static_cast(parameters[0]),
- parameters[3], parameters[1], element_base, base_instance, instance_count);
-
- maxwell3d.regs.vertex_id_base = 0x0;
- maxwell3d.CallMethod(0x8e3, 0x640, true);
- maxwell3d.CallMethod(0x8e4, 0x0, true);
- maxwell3d.CallMethod(0x8e5, 0x0, true);
-}
-
-// Multidraw Indirect
-void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) {
- SCOPE_EXIT({
- // Clean everything.
- maxwell3d.regs.vertex_id_base = 0x0;
- maxwell3d.CallMethod(0x8e3, 0x640, true);
- maxwell3d.CallMethod(0x8e4, 0x0, true);
- maxwell3d.CallMethod(0x8e5, 0x0, true);
- maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
- });
- const u32 start_indirect = parameters[0];
- const u32 end_indirect = parameters[1];
- if (start_indirect >= end_indirect) {
- // Nothing to do.
- return;
- }
- const u32 padding = parameters[3];
- const std::size_t max_draws = parameters[4];
-
- const u32 indirect_words = 5 + padding;
- const std::size_t first_draw = start_indirect;
- const std::size_t effective_draws = end_indirect - start_indirect;
- const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws);
-
- for (std::size_t index = first_draw; index < last_draw; index++) {
- const std::size_t base = index * indirect_words + 5;
- const u32 base_vertex = parameters[base + 3];
- const u32 base_instance = parameters[base + 4];
- maxwell3d.regs.vertex_id_base = base_vertex;
- maxwell3d.CallMethod(0x8e3, 0x640, true);
- maxwell3d.CallMethod(0x8e4, base_vertex, true);
- maxwell3d.CallMethod(0x8e5, base_instance, true);
- maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
- maxwell3d.draw_manager->DrawIndex(
- static_cast(parameters[2]),
- parameters[base + 2], parameters[base], base_vertex, base_instance,
- parameters[base + 1]);
+bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) { // False for Quads, QuadStrip, Polygon, unlisted.
+ switch (topology) {
+ case Maxwell3D::Regs::PrimitiveTopology::Points:
+ case Maxwell3D::Regs::PrimitiveTopology::Lines:
+ case Maxwell3D::Regs::PrimitiveTopology::LineLoop:
+ case Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+ case Maxwell3D::Regs::PrimitiveTopology::Triangles:
+ case Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+ case Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+ case Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+ case Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+ case Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+ case Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+ case Maxwell3D::Regs::PrimitiveTopology::Patches:
+ return true;
+ case Maxwell3D::Regs::PrimitiveTopology::Quads:
+ case Maxwell3D::Regs::PrimitiveTopology::QuadStrip:
+ case Maxwell3D::Regs::PrimitiveTopology::Polygon:
+ default: // Values not listed above are treated like the three unsafe topologies.
+ return false; // The HLE draw macros send these topologies to their Fallback() path.
+ }
+}
-// Multi-layer Clear
-void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) {
- ASSERT(parameters.size() == 1);
-
- const Engines::Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
- const u32 rt_index = clear_params.RT;
- const u32 num_layers = maxwell3d.regs.rt[rt_index].depth;
- ASSERT(clear_params.layer == 0);
-
- maxwell3d.regs.clear_surface.raw = clear_params.raw;
- maxwell3d.draw_manager->Clear(num_layers);
-}
-
-constexpr std::array, 5> hle_funcs{{
- {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
- {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
- {0x0217920100488FF7, &HLE_0217920100488FF7},
- {0x3F5E74B9C9A50164, &HLE_3F5E74B9C9A50164},
- {0xEAD26C3E2109B06B, &HLE_EAD26C3E2109B06B},
-}};
-
-class HLEMacroImpl final : public CachedMacro {
+class HLEMacroImpl : public CachedMacro { // Common base of the HLE macro classes; holds the engine reference.
public:
- explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
- : maxwell3d{maxwell3d_}, func{func_} {}
+ explicit HLEMacroImpl(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
- void Execute(const std::vector& parameters, u32 method) override {
- func(maxwell3d, parameters);
+protected:
+ Maxwell3D& maxwell3d; // Engine that the derived macro classes read and drive.
+};
+
+/*
+ * HLE replacement for the non-indexed indirect draw macro. The `extended` variant additionally
+ * registers a BaseInstance replacement attribute (bank 0, offset 0x640) around the draw.
+ */
+template <bool extended>
+class HLE_DrawArraysIndirect final : public HLEMacroImpl {
+public:
+ explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+ void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+ auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+ if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+ Fallback(parameters); // No dirty parameter or an unsafe topology: draw from the parameter values.
+ return;
+ }
+
+ auto& params = maxwell3d.draw_manager->GetIndirectParams();
+ params.is_indexed = false;
+ params.include_count = false;
+ params.count_start_address = 0;
+ params.indirect_start_address = maxwell3d.GetMacroAddress(1); // NOTE(review): presumably where parameter 1 lives in GPU memory - confirm.
+ params.buffer_size = 4 * sizeof(u32); // Four u32 words.
+ params.max_draw_counts = 1;
+ params.stride = 0;
+
+ if constexpr (extended) {
+ maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+ }
+
+ maxwell3d.draw_manager->DrawArrayIndirect(topology);
+
+ if constexpr (extended) { // Undo the hint and the replacement entry set before the draw.
+ maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+ maxwell3d.replace_table.clear();
+ }
}
private:
- Engines::Maxwell3D& maxwell3d;
- HLEFunction func;
+ void Fallback(const std::vector<u32>& parameters) { // Direct draw built from the parameter values.
+ SCOPE_EXIT({
+ if (extended) {
+ maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+ maxwell3d.replace_table.clear();
+ }
+ });
+ maxwell3d.RefreshParameters();
+ const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); // Masked by register 0xD1B.
+
+ auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+ const u32 vertex_first = parameters[3];
+ const u32 vertex_count = parameters[1];
+
+ if (!IsTopologySafe(topology) && // Widened so that first + count cannot wrap in 32 bits.
+ static_cast<size_t>(maxwell3d.GetMaxCurrentVertices()) <
+ static_cast<size_t>(vertex_first) + static_cast<size_t>(vertex_count)) {
+ ASSERT_MSG(false, "Faulty draw!");
+ return; // Unsafe topology reaching past GetMaxCurrentVertices(): the draw is dropped.
+ }
+
+ const u32 base_instance = parameters[4];
+ if constexpr (extended) {
+ maxwell3d.regs.global_base_instance_index = base_instance;
+ maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+ }
+
+ maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance,
+ instance_count);
+
+ if constexpr (extended) { // Repeats the SCOPE_EXIT reset and also zeroes the base instance register.
+ maxwell3d.regs.global_base_instance_index = 0;
+ maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+ maxwell3d.replace_table.clear();
+ }
+ }
+};
+
+/*
+ * @note: these macros have two versions, a normal and extended version, with the extended version
+ * also assigning the base vertex/instance.
+ */
+template
+class HLE_DrawIndexedIndirect final : public HLEMacroImpl {
+public:
+ explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+ void Execute(const std::vector& parameters, [[maybe_unused]] u32 method) override {
+ auto topology = static_cast(parameters[0]);
+ if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+ Fallback(parameters);
+ return;
+ }
+
+ const u32 estimate = static_cast(maxwell3d.EstimateIndexBufferSize());
+ const u32 element_base = parameters[4];
+ const u32 base_instance = parameters[5];
+ maxwell3d.regs.vertex_id_base = element_base;
+ maxwell3d.regs.global_base_vertex_index = element_base;
+ maxwell3d.regs.global_base_instance_index = base_instance;
+ maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+ if constexpr (extended) {
+ maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+ }
+ auto& params = maxwell3d.draw_manager->GetIndirectParams();
+ params.is_indexed = true;
+ params.include_count = false;
+ params.count_start_address = 0;
+ params.indirect_start_address = maxwell3d.GetMacroAddress(1);
+ params.buffer_size = 5 * sizeof(u32);
+ params.max_draw_counts = 1;
+ params.stride = 0;
+ maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+ maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
+ maxwell3d.regs.vertex_id_base = 0x0;
+ maxwell3d.regs.global_base_vertex_index = 0x0;
+ maxwell3d.regs.global_base_instance_index = 0x0;
+ if constexpr (extended) {
+ maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+ maxwell3d.replace_table.clear();
+ }
+ }
+
+private:
+ void Fallback(const std::vector& parameters) {
+ maxwell3d.RefreshParameters();
+ const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+ const u32 element_base = parameters[4];
+ const u32 base_instance = parameters[5];
+ maxwell3d.regs.vertex_id_base = element_base;
+ maxwell3d.regs.global_base_vertex_index = element_base;
+ maxwell3d.regs.global_base_instance_index = base_instance;
+ maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+ if constexpr (extended) {
+ maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+ maxwell3d.SetHLEReplacementAttributeType(
+ 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+ }
+
+ maxwell3d.draw_manager->DrawIndex(
+ static_cast