From 7ffc1f0d2f08d06e180b88b8066cae326fa40905 Mon Sep 17 00:00:00 2001 From: Evan Husted Date: Sun, 5 Jan 2025 23:04:17 -0600 Subject: [PATCH] RDNA3 Vulkan project --- Ryujinx.sln | 6 + src/Ryujinx.Graphics.Rdna3Vulkan/Auto.cs | 191 + .../AutoFlushCounter.cs | 179 + .../BackgroundResources.cs | 120 + .../BarrierBatch.cs | 458 ++ src/Ryujinx.Graphics.Rdna3Vulkan/BitMap.cs | 157 + .../BitMapStruct.cs | 263 ++ .../BufferAllocationType.cs | 13 + .../BufferHolder.cs | 922 ++++ .../BufferManager.cs | 679 +++ .../BufferMirrorRangeList.cs | 305 ++ .../BufferState.cs | 56 + .../BufferUsageBitmap.cs | 82 + .../CacheByRange.cs | 394 ++ .../CommandBufferPool.cs | 370 ++ .../CommandBufferScoped.cs | 39 + src/Ryujinx.Graphics.Rdna3Vulkan/Constants.cs | 23 + .../DescriptorSetCollection.cs | 222 + .../DescriptorSetManager.cs | 231 + .../DescriptorSetTemplate.cs | 210 + .../DescriptorSetTemplateUpdater.cs | 77 + .../DescriptorSetUpdater.cs | 1190 +++++ .../DisposableBuffer.cs | 26 + .../DisposableBufferView.cs | 25 + .../DisposableFramebuffer.cs | 25 + .../DisposableImage.cs | 25 + .../DisposableImageView.cs | 25 + .../DisposableMemory.cs | 24 + .../DisposablePipeline.cs | 25 + .../DisposableRenderPass.cs | 25 + .../DisposableSampler.cs | 25 + .../Effects/AreaScalingFilter.cs | 101 + .../Effects/FsrScalingFilter.cs | 172 + .../Effects/FxaaPostProcessingEffect.cs | 88 + .../Effects/IPostProcessingEffect.cs | 10 + .../Effects/IScalingFilter.cs | 20 + .../Effects/Shaders/AreaScaling.glsl | 122 + .../Effects/Shaders/AreaScaling.spv | Bin 0 -> 12428 bytes .../Effects/Shaders/FsrScaling.glsl | 3945 +++++++++++++++++ .../Effects/Shaders/FsrScaling.spv | Bin 0 -> 44672 bytes .../Effects/Shaders/FsrSharpening.glsl | 3904 ++++++++++++++++ .../Effects/Shaders/FsrSharpening.spv | Bin 0 -> 20472 bytes .../Effects/Shaders/Fxaa.glsl | 1177 +++++ .../Effects/Shaders/Fxaa.spv | Bin 0 -> 25012 bytes .../Effects/Shaders/SmaaBlend.glsl | 1404 ++++++ .../Effects/Shaders/SmaaBlend.spv | Bin 0 -> 33728 bytes .../Effects/Shaders/SmaaEdge.glsl | 1402 ++++++ .../Effects/Shaders/SmaaEdge.spv | Bin 0 -> 8464 bytes .../Effects/Shaders/SmaaNeighbour.glsl | 1403 ++++++ .../Effects/Shaders/SmaaNeighbour.spv | Bin 0 -> 8328 bytes .../Effects/SmaaConstants.cs | 15 + .../Effects/SmaaPostProcessingEffect.cs | 266 ++ .../Effects/Textures/SmaaAreaTexture.bin | Bin 0 -> 179200 bytes .../Effects/Textures/SmaaSearchTexture.bin | Bin 0 -> 1024 bytes .../EnumConversion.cs | 452 ++ .../FeedbackLoopAspects.cs | 12 + .../FenceHelper.cs | 30 + .../FenceHolder.cs | 159 + .../FormatCapabilities.cs | 233 + .../FormatConverter.cs | 49 + .../FormatTable.cs | 358 ++ .../FramebufferParams.cs | 344 ++ .../HardwareCapabilities.cs | 138 + .../HashTableSlim.cs | 143 + .../HelperShader.cs | 1740 ++++++++ .../HostMemoryAllocator.cs | 189 + src/Ryujinx.Graphics.Rdna3Vulkan/IdList.cs | 121 + .../ImageArray.cs | 207 + .../IndexBufferPattern.cs | 139 + .../IndexBufferState.cs | 171 + .../MemoryAllocation.cs | 59 + .../MemoryAllocator.cs | 118 + .../MemoryAllocatorBlockList.cs | 310 ++ .../MultiFenceHolder.cs | 267 ++ .../NativeArray.cs | 48 + .../PersistentFlushBuffer.cs | 97 + .../PipelineBase.cs | 1810 ++++++++ .../PipelineConverter.cs | 336 ++ .../PipelineDynamicState.cs | 203 + .../PipelineFull.cs | 351 ++ .../PipelineHelperShader.cs | 54 + .../PipelineLayoutCache.cs | 107 + .../PipelineLayoutCacheEntry.cs | 383 ++ .../PipelineLayoutFactory.cs | 115 + .../PipelineState.cs | 732 +++ .../PipelineUid.cs | 125 + .../Queries/BufferedQuery.cs | 216 + .../Queries/CounterQueue.cs | 252 ++ .../Queries/CounterQueueEvent.cs | 170 + .../Queries/Counters.cs | 71 + .../RenderPassCacheKey.cs | 43 + .../RenderPassHolder.cs | 221 + .../ResourceArray.cs | 81 + .../ResourceBindingSegment.cs | 22 + .../ResourceLayoutBuilder.cs | 57 + .../Ryujinx.Graphics.Rdna3Vulkan.csproj | 28 + .../SamplerHolder.cs | 120 + src/Ryujinx.Graphics.Rdna3Vulkan/Shader.cs | 161 + .../ShaderCollection.cs | 767 ++++ src/Ryujinx.Graphics.Rdna3Vulkan/SpecInfo.cs | 100 + .../StagingBuffer.cs | 297 ++ .../SyncManager.cs | 215 + .../TextureArray.cs | 234 + .../TextureBuffer.cs | 130 + .../TextureCopy.cs | 473 ++ .../TextureStorage.cs | 618 +++ .../TextureView.cs | 1155 +++++ src/Ryujinx.Graphics.Rdna3Vulkan/Vendor.cs | 100 + .../VertexBufferState.cs | 139 + .../VertexBufferUpdater.cs | 82 + .../VulkanConfiguration.cs | 12 + .../VulkanDebugMessenger.cs | 133 + .../VulkanException.cs | 43 + .../VulkanInitialization.cs | 618 +++ .../VulkanInstance.cs | 127 + .../VulkanPhysicalDevice.cs | 97 + .../VulkanRenderer.cs | 1059 +++++ src/Ryujinx.Graphics.Rdna3Vulkan/Window.cs | 679 +++ .../WindowBase.cs | 20 + 119 files changed, 38581 insertions(+) create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Auto.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/AutoFlushCounter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BackgroundResources.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BarrierBatch.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BitMap.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BitMapStruct.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferAllocationType.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferHolder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferManager.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferMirrorRangeList.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferState.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/BufferUsageBitmap.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/CacheByRange.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferPool.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferScoped.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Constants.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetCollection.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetManager.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplate.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplateUpdater.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetUpdater.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBuffer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBufferView.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableFramebuffer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImage.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImageView.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableMemory.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposablePipeline.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableRenderPass.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/DisposableSampler.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/AreaScalingFilter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FsrScalingFilter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FxaaPostProcessingEffect.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IPostProcessingEffect.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IScalingFilter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/Fxaa.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/Fxaa.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaBlend.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaBlend.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaEdge.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaEdge.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.glsl create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.spv create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/SmaaConstants.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/SmaaPostProcessingEffect.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaAreaTexture.bin create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaSearchTexture.bin create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/EnumConversion.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FeedbackLoopAspects.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FenceHelper.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FenceHolder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FormatCapabilities.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FormatConverter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FormatTable.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/FramebufferParams.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/HardwareCapabilities.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/HashTableSlim.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/HelperShader.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/HostMemoryAllocator.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/IdList.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/ImageArray.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferPattern.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferState.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocation.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocator.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocatorBlockList.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/MultiFenceHolder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/NativeArray.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PersistentFlushBuffer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineBase.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineConverter.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineDynamicState.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineFull.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineHelperShader.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCache.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCacheEntry.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutFactory.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineState.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/PipelineUid.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Queries/BufferedQuery.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueue.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueueEvent.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Queries/Counters.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassCacheKey.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassHolder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/ResourceArray.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/ResourceBindingSegment.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/ResourceLayoutBuilder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Ryujinx.Graphics.Rdna3Vulkan.csproj create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/SamplerHolder.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Shader.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/ShaderCollection.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/SpecInfo.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/StagingBuffer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/SyncManager.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/TextureArray.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/TextureBuffer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/TextureCopy.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/TextureStorage.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/TextureView.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Vendor.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferState.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferUpdater.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanConfiguration.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanDebugMessenger.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanException.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInitialization.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInstance.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanPhysicalDevice.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/VulkanRenderer.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/Window.cs create mode 100644 src/Ryujinx.Graphics.Rdna3Vulkan/WindowBase.cs diff --git a/Ryujinx.sln b/Ryujinx.sln index 9e197e85f..db312e771 100644 --- a/Ryujinx.sln +++ b/Ryujinx.sln @@ -95,6 +95,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution .github\workflows\release.yml = .github\workflows\release.yml EndProjectSection EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Ryujinx.Graphics.Rdna3Vulkan", "src\Ryujinx.Graphics.Rdna3Vulkan\Ryujinx.Graphics.Rdna3Vulkan.csproj", "{5D8C99F7-AC66-43CF-AE84-68ADA27CCED7}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -259,6 +261,10 @@ Global {81EA598C-DBA1-40B0-8DA4-4796B78F2037}.Debug|Any CPU.Build.0 = Debug|Any CPU {81EA598C-DBA1-40B0-8DA4-4796B78F2037}.Release|Any CPU.ActiveCfg = Release|Any CPU {81EA598C-DBA1-40B0-8DA4-4796B78F2037}.Release|Any CPU.Build.0 = Release|Any CPU + {5D8C99F7-AC66-43CF-AE84-68ADA27CCED7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {5D8C99F7-AC66-43CF-AE84-68ADA27CCED7}.Debug|Any CPU.Build.0 = Debug|Any CPU + {5D8C99F7-AC66-43CF-AE84-68ADA27CCED7}.Release|Any CPU.ActiveCfg = Release|Any CPU + {5D8C99F7-AC66-43CF-AE84-68ADA27CCED7}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Auto.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Auto.cs new file mode 100644 index 000000000..a3ab2cad8 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Auto.cs @@ -0,0 +1,191 @@ +using System; +using System.Diagnostics; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + interface IAuto + { + bool HasCommandBufferDependency(CommandBufferScoped cbs); + + void IncrementReferenceCount(); + void DecrementReferenceCount(int cbIndex); + void DecrementReferenceCount(); + } + + interface IAutoPrivate : IAuto + { + void AddCommandBufferDependencies(CommandBufferScoped cbs); + } + + interface IMirrorable where T : IDisposable + { + Auto GetMirrorable(CommandBufferScoped cbs, ref int offset, int size, out bool mirrored); + void ClearMirrors(CommandBufferScoped cbs, int offset, int size); + } + + class Auto : IAutoPrivate, IDisposable where T : IDisposable + { + private int _referenceCount; + private T _value; + + private readonly BitMap _cbOwnership; + private readonly MultiFenceHolder _waitable; + private readonly IAutoPrivate[] _referencedObjs; + private readonly IMirrorable _mirrorable; + + private bool _disposed; + private bool _destroyed; + + public Auto(T value) + { + _referenceCount = 1; + _value = value; + _cbOwnership = new BitMap(CommandBufferPool.MaxCommandBuffers); + } + + public Auto(T value, IMirrorable mirrorable, MultiFenceHolder waitable, params IAutoPrivate[] referencedObjs) : this(value, waitable, referencedObjs) + { + _mirrorable = mirrorable; + } + + public Auto(T value, MultiFenceHolder waitable, params IAutoPrivate[] referencedObjs) : this(value) + { + _waitable = waitable; + _referencedObjs = referencedObjs; + + for (int i = 0; i < referencedObjs.Length; i++) + { + referencedObjs[i].IncrementReferenceCount(); + } + } + + public T GetMirrorable(CommandBufferScoped cbs, ref int offset, int size, out bool mirrored) + { + var mirror = _mirrorable.GetMirrorable(cbs, ref offset, size, out mirrored); + mirror._waitable?.AddBufferUse(cbs.CommandBufferIndex, offset, size, false); + return mirror.Get(cbs); + } + + public T Get(CommandBufferScoped cbs, int offset, int size, bool write = false) + { + _mirrorable?.ClearMirrors(cbs, offset, size); + _waitable?.AddBufferUse(cbs.CommandBufferIndex, offset, size, write); + return Get(cbs); + } + + public T GetUnsafe() + { + return _value; + } + + public T Get(CommandBufferScoped cbs) + { + if (!_destroyed) + { + AddCommandBufferDependencies(cbs); + } + + return _value; + } + + public bool HasCommandBufferDependency(CommandBufferScoped cbs) + { + return _cbOwnership.IsSet(cbs.CommandBufferIndex); + } + + public bool HasRentedCommandBufferDependency(CommandBufferPool cbp) + { + return _cbOwnership.AnySet(); + } + + public void AddCommandBufferDependencies(CommandBufferScoped cbs) + { + // We don't want to add a reference to this object to the command buffer + // more than once, so if we detect that the command buffer already has ownership + // of this object, then we can just return without doing anything else. + if (_cbOwnership.Set(cbs.CommandBufferIndex)) + { + if (_waitable != null) + { + cbs.AddWaitable(_waitable); + } + + cbs.AddDependant(this); + + // We need to add a dependency on the command buffer to all objects this object + // references aswell. + if (_referencedObjs != null) + { + for (int i = 0; i < _referencedObjs.Length; i++) + { + _referencedObjs[i].AddCommandBufferDependencies(cbs); + } + } + } + } + + public bool TryIncrementReferenceCount() + { + int lastValue; + do + { + lastValue = _referenceCount; + + if (lastValue == 0) + { + return false; + } + } + while (Interlocked.CompareExchange(ref _referenceCount, lastValue + 1, lastValue) != lastValue); + + return true; + } + + public void IncrementReferenceCount() + { + if (Interlocked.Increment(ref _referenceCount) == 1) + { + Interlocked.Decrement(ref _referenceCount); + throw new InvalidOperationException("Attempted to increment the reference count of an object that was already destroyed."); + } + } + + public void DecrementReferenceCount(int cbIndex) + { + _cbOwnership.Clear(cbIndex); + DecrementReferenceCount(); + } + + public void DecrementReferenceCount() + { + if (Interlocked.Decrement(ref _referenceCount) == 0) + { + _value.Dispose(); + _value = default; + _destroyed = true; + + // Value is no longer in use by the GPU, dispose all other + // resources that it references. + if (_referencedObjs != null) + { + for (int i = 0; i < _referencedObjs.Length; i++) + { + _referencedObjs[i].DecrementReferenceCount(); + } + } + } + + Debug.Assert(_referenceCount >= 0); + } + + public void Dispose() + { + if (!_disposed) + { + DecrementReferenceCount(); + _disposed = true; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/AutoFlushCounter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/AutoFlushCounter.cs new file mode 100644 index 000000000..9e9ec4b7b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/AutoFlushCounter.cs @@ -0,0 +1,179 @@ +using Ryujinx.Common.Logging; +using System; +using System.Diagnostics; +using System.Linq; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class AutoFlushCounter + { + // How often to flush on framebuffer change. + private readonly static long _framebufferFlushTimer = Stopwatch.Frequency / 1000; // (1ms) + + // How often to flush on draw when fast flush mode is enabled. + private readonly static long _drawFlushTimer = Stopwatch.Frequency / 666; // (1.5ms) + + // Average wait time that triggers fast flush mode to be entered. + private readonly static long _fastFlushEnterThreshold = Stopwatch.Frequency / 666; // (1.5ms) + + // Average wait time that triggers fast flush mode to be exited. + private readonly static long _fastFlushExitThreshold = Stopwatch.Frequency / 10000; // (0.1ms) + + // Number of frames to average waiting times over. + private const int SyncWaitAverageCount = 20; + + private const int MinDrawCountForFlush = 10; + private const int MinConsecutiveQueryForFlush = 10; + private const int InitialQueryCountForFlush = 32; + + private readonly VulkanRenderer _gd; + + private long _lastFlush; + private ulong _lastDrawCount; + private bool _hasPendingQuery; + private int _consecutiveQueries; + private int _queryCount; + + private readonly int[] _queryCountHistory = new int[3]; + private int _queryCountHistoryIndex; + private int _remainingQueries; + + private readonly long[] _syncWaitHistory = new long[SyncWaitAverageCount]; + private int _syncWaitHistoryIndex; + + private bool _fastFlushMode; + + public AutoFlushCounter(VulkanRenderer gd) + { + _gd = gd; + } + + public void RegisterFlush(ulong drawCount) + { + _lastFlush = Stopwatch.GetTimestamp(); + _lastDrawCount = drawCount; + + _hasPendingQuery = false; + _consecutiveQueries = 0; + } + + public bool RegisterPendingQuery() + { + _hasPendingQuery = true; + _consecutiveQueries++; + _remainingQueries--; + + _queryCountHistory[_queryCountHistoryIndex]++; + + // Interrupt render passes to flush queries, so that early results arrive sooner. + if (++_queryCount == InitialQueryCountForFlush) + { + return true; + } + + return false; + } + + public int GetRemainingQueries() + { + if (_remainingQueries <= 0) + { + _remainingQueries = 16; + } + + if (_queryCount < InitialQueryCountForFlush) + { + return Math.Min(InitialQueryCountForFlush - _queryCount, _remainingQueries); + } + + return _remainingQueries; + } + + public bool ShouldFlushQuery() + { + return _hasPendingQuery; + } + + public bool ShouldFlushDraw(ulong drawCount) + { + if (_fastFlushMode) + { + long draws = (long)(drawCount - _lastDrawCount); + + if (draws < MinDrawCountForFlush) + { + if (draws == 0) + { + _lastFlush = Stopwatch.GetTimestamp(); + } + + return false; + } + + long flushTimeout = _drawFlushTimer; + + long now = Stopwatch.GetTimestamp(); + + return now > _lastFlush + flushTimeout; + } + + return false; + } + + public bool ShouldFlushAttachmentChange(ulong drawCount) + { + _queryCount = 0; + + // Flush when there's an attachment change out of a large block of queries. + if (_consecutiveQueries > MinConsecutiveQueryForFlush) + { + return true; + } + + _consecutiveQueries = 0; + + long draws = (long)(drawCount - _lastDrawCount); + + if (draws < MinDrawCountForFlush) + { + if (draws == 0) + { + _lastFlush = Stopwatch.GetTimestamp(); + } + + return false; + } + + long flushTimeout = _framebufferFlushTimer; + + long now = Stopwatch.GetTimestamp(); + + return now > _lastFlush + flushTimeout; + } + + public void Present() + { + // Query flush prediction. + + _queryCountHistoryIndex = (_queryCountHistoryIndex + 1) % 3; + + _remainingQueries = _queryCountHistory.Max() + 10; + + _queryCountHistory[_queryCountHistoryIndex] = 0; + + // Fast flush mode toggle. + + _syncWaitHistory[_syncWaitHistoryIndex] = _gd.SyncManager.GetAndResetWaitTicks(); + + _syncWaitHistoryIndex = (_syncWaitHistoryIndex + 1) % SyncWaitAverageCount; + + long averageWait = (long)_syncWaitHistory.Average(); + + if (_fastFlushMode ? averageWait < _fastFlushExitThreshold : averageWait > _fastFlushEnterThreshold) + { + _fastFlushMode = !_fastFlushMode; + Logger.Debug?.PrintMsg(LogClass.Gpu, $"Switched fast flush mode: ({_fastFlushMode})"); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BackgroundResources.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BackgroundResources.cs new file mode 100644 index 000000000..6c9d479f1 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BackgroundResources.cs @@ -0,0 +1,120 @@ +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class BackgroundResource : IDisposable + { + private readonly VulkanRenderer _gd; + private Device _device; + + private CommandBufferPool _pool; + private PersistentFlushBuffer _flushBuffer; + + public BackgroundResource(VulkanRenderer gd, Device device) + { + _gd = gd; + _device = device; + } + + public CommandBufferPool GetPool() + { + if (_pool == null) + { + bool useBackground = _gd.BackgroundQueue.Handle != 0 && _gd.Vendor != Vendor.Amd; + Queue queue = useBackground ? _gd.BackgroundQueue : _gd.Queue; + Lock queueLock = useBackground ? _gd.BackgroundQueueLock : _gd.QueueLock; + + lock (queueLock) + { + _pool = new CommandBufferPool( + _gd.Api, + _device, + queue, + queueLock, + _gd.QueueFamilyIndex, + _gd.IsQualcommProprietary, + isLight: true); + } + } + + return _pool; + } + + public PersistentFlushBuffer GetFlushBuffer() + { + _flushBuffer ??= new PersistentFlushBuffer(_gd); + + return _flushBuffer; + } + + public void Dispose() + { + _pool?.Dispose(); + _flushBuffer?.Dispose(); + } + } + + class BackgroundResources : IDisposable + { + private readonly VulkanRenderer _gd; + private Device _device; + + private readonly Dictionary _resources; + + public BackgroundResources(VulkanRenderer gd, Device device) + { + _gd = gd; + _device = device; + + _resources = new Dictionary(); + } + + private void Cleanup() + { + lock (_resources) + { + foreach (KeyValuePair tuple in _resources) + { + if (!tuple.Key.IsAlive) + { + tuple.Value.Dispose(); + _resources.Remove(tuple.Key); + } + } + } + } + + public BackgroundResource Get() + { + Thread thread = Thread.CurrentThread; + + lock (_resources) + { + if (!_resources.TryGetValue(thread, out BackgroundResource resource)) + { + Cleanup(); + + resource = new BackgroundResource(_gd, _device); + + _resources[thread] = resource; + } + + return resource; + } + } + + public void Dispose() + { + lock (_resources) + { + foreach (var resource in _resources.Values) + { + resource.Dispose(); + } + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BarrierBatch.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BarrierBatch.cs new file mode 100644 index 000000000..058022232 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BarrierBatch.cs @@ -0,0 +1,458 @@ +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class BarrierBatch : IDisposable + { + private const int MaxBarriersPerCall = 16; + + private const AccessFlags BaseAccess = AccessFlags.ShaderReadBit | AccessFlags.ShaderWriteBit; + private const AccessFlags BufferAccess = AccessFlags.IndexReadBit | AccessFlags.VertexAttributeReadBit | AccessFlags.UniformReadBit; + private const AccessFlags CommandBufferAccess = AccessFlags.IndirectCommandReadBit; + + private readonly VulkanRenderer _gd; + + private readonly NativeArray _memoryBarrierBatch = new(MaxBarriersPerCall); + private readonly NativeArray _bufferBarrierBatch = new(MaxBarriersPerCall); + private readonly NativeArray _imageBarrierBatch = new(MaxBarriersPerCall); + + private readonly List> _memoryBarriers = new(); + private readonly List> _bufferBarriers = new(); + private readonly List> _imageBarriers = new(); + private int _queuedBarrierCount; + + private enum IncoherentBarrierType + { + None, + Texture, + All, + CommandBuffer + } + + private bool _feedbackLoopActive; + private PipelineStageFlags _incoherentBufferWriteStages; + private PipelineStageFlags _incoherentTextureWriteStages; + private PipelineStageFlags _extraStages; + private IncoherentBarrierType _queuedIncoherentBarrier; + private bool _queuedFeedbackLoopBarrier; + + public BarrierBatch(VulkanRenderer gd) + { + _gd = gd; + } + + public static (AccessFlags Access, PipelineStageFlags Stages) GetSubpassAccessSuperset(VulkanRenderer gd) + { + AccessFlags access = BufferAccess; + PipelineStageFlags stages = PipelineStageFlags.AllGraphicsBit; + + if (gd.TransformFeedbackApi != null) + { + access |= AccessFlags.TransformFeedbackWriteBitExt; + stages |= PipelineStageFlags.TransformFeedbackBitExt; + } + + return (access, stages); + } + + private readonly record struct StageFlags : IEquatable + { + public readonly PipelineStageFlags Source; + public readonly PipelineStageFlags Dest; + + public StageFlags(PipelineStageFlags source, PipelineStageFlags dest) + { + Source = source; + Dest = dest; + } + } + + private readonly struct BarrierWithStageFlags where T : unmanaged + { + public readonly StageFlags Flags; + public readonly T Barrier; + public readonly T2 Resource; + + public BarrierWithStageFlags(StageFlags flags, T barrier) + { + Flags = flags; + Barrier = barrier; + Resource = default; + } + + public BarrierWithStageFlags(PipelineStageFlags srcStageFlags, PipelineStageFlags dstStageFlags, T barrier, T2 resource) + { + Flags = new StageFlags(srcStageFlags, dstStageFlags); + Barrier = barrier; + Resource = resource; + } + } + + private void QueueBarrier(List> list, T barrier, T2 resource, PipelineStageFlags srcStageFlags, PipelineStageFlags dstStageFlags) where T : unmanaged + { + list.Add(new BarrierWithStageFlags(srcStageFlags, dstStageFlags, barrier, resource)); + _queuedBarrierCount++; + } + + public void QueueBarrier(MemoryBarrier barrier, PipelineStageFlags srcStageFlags, PipelineStageFlags dstStageFlags) + { + QueueBarrier(_memoryBarriers, barrier, default, srcStageFlags, dstStageFlags); + } + + public void QueueBarrier(BufferMemoryBarrier barrier, PipelineStageFlags srcStageFlags, PipelineStageFlags dstStageFlags) + { + QueueBarrier(_bufferBarriers, barrier, default, srcStageFlags, dstStageFlags); + } + + public void QueueBarrier(ImageMemoryBarrier barrier, TextureStorage resource, PipelineStageFlags srcStageFlags, PipelineStageFlags dstStageFlags) + { + QueueBarrier(_imageBarriers, barrier, resource, srcStageFlags, dstStageFlags); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe void FlushMemoryBarrier(ShaderCollection program, bool inRenderPass) + { + if (_queuedIncoherentBarrier > IncoherentBarrierType.None) + { + // We should emit a memory barrier if there's a write access in the program (current program, or program since last barrier) + bool hasTextureWrite = _incoherentTextureWriteStages != PipelineStageFlags.None; + bool hasBufferWrite = _incoherentBufferWriteStages != PipelineStageFlags.None; + bool hasBufferBarrier = _queuedIncoherentBarrier > IncoherentBarrierType.Texture; + + if (hasTextureWrite || (hasBufferBarrier && hasBufferWrite)) + { + AccessFlags access = BaseAccess; + + PipelineStageFlags stages = inRenderPass ? PipelineStageFlags.AllGraphicsBit : PipelineStageFlags.AllCommandsBit; + + if (hasBufferBarrier && hasBufferWrite) + { + access |= BufferAccess; + + if (_gd.TransformFeedbackApi != null) + { + access |= AccessFlags.TransformFeedbackWriteBitExt; + stages |= PipelineStageFlags.TransformFeedbackBitExt; + } + } + + if (_queuedIncoherentBarrier == IncoherentBarrierType.CommandBuffer) + { + access |= CommandBufferAccess; + stages |= PipelineStageFlags.DrawIndirectBit; + } + + MemoryBarrier barrier = new MemoryBarrier() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = access, + DstAccessMask = access + }; + + QueueBarrier(barrier, stages, stages); + + _incoherentTextureWriteStages = program?.IncoherentTextureWriteStages ?? PipelineStageFlags.None; + + if (_queuedIncoherentBarrier > IncoherentBarrierType.Texture) + { + if (program != null) + { + _incoherentBufferWriteStages = program.IncoherentBufferWriteStages | _extraStages; + } + else + { + _incoherentBufferWriteStages = PipelineStageFlags.None; + } + } + + _queuedIncoherentBarrier = IncoherentBarrierType.None; + _queuedFeedbackLoopBarrier = false; + } + else if (_feedbackLoopActive && _queuedFeedbackLoopBarrier) + { + // Feedback loop barrier. + + MemoryBarrier barrier = new MemoryBarrier() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = AccessFlags.ShaderWriteBit, + DstAccessMask = AccessFlags.ShaderReadBit + }; + + QueueBarrier(barrier, PipelineStageFlags.FragmentShaderBit, PipelineStageFlags.AllGraphicsBit); + + _queuedFeedbackLoopBarrier = false; + } + + _feedbackLoopActive = false; + } + } + + public unsafe void Flush(CommandBufferScoped cbs, bool inRenderPass, RenderPassHolder rpHolder, Action endRenderPass) + { + Flush(cbs, null, false, inRenderPass, rpHolder, endRenderPass); + } + + public unsafe void Flush(CommandBufferScoped cbs, ShaderCollection program, bool feedbackLoopActive, bool inRenderPass, RenderPassHolder rpHolder, Action endRenderPass) + { + if (program != null) + { + _incoherentBufferWriteStages |= program.IncoherentBufferWriteStages | _extraStages; + _incoherentTextureWriteStages |= program.IncoherentTextureWriteStages; + } + + _feedbackLoopActive |= feedbackLoopActive; + + FlushMemoryBarrier(program, inRenderPass); + + if (!inRenderPass && rpHolder != null) + { + // Render pass is about to begin. Queue any fences that normally interrupt the pass. + rpHolder.InsertForcedFences(cbs); + } + + while (_queuedBarrierCount > 0) + { + int memoryCount = 0; + int bufferCount = 0; + int imageCount = 0; + + bool hasBarrier = false; + StageFlags flags = default; + + static void AddBarriers( + Span target, + ref int queuedBarrierCount, + ref bool hasBarrier, + ref StageFlags flags, + ref int count, + List> list) where T : unmanaged + { + int firstMatch = -1; + int end = list.Count; + + for (int i = 0; i < list.Count; i++) + { + BarrierWithStageFlags barrier = list[i]; + + if (!hasBarrier) + { + flags = barrier.Flags; + hasBarrier = true; + + target[count++] = barrier.Barrier; + queuedBarrierCount--; + firstMatch = i; + + if (count >= target.Length) + { + end = i + 1; + break; + } + } + else + { + if (flags.Equals(barrier.Flags)) + { + target[count++] = barrier.Barrier; + queuedBarrierCount--; + + if (firstMatch == -1) + { + firstMatch = i; + } + + if (count >= target.Length) + { + end = i + 1; + break; + } + } + else + { + // Delete consumed barriers from the first match to the current non-match. + if (firstMatch != -1) + { + int deleteCount = i - firstMatch; + list.RemoveRange(firstMatch, deleteCount); + i -= deleteCount; + + firstMatch = -1; + end = list.Count; + } + } + } + } + + if (firstMatch == 0 && end == list.Count) + { + list.Clear(); + } + else if (firstMatch != -1) + { + int deleteCount = end - firstMatch; + + list.RemoveRange(firstMatch, deleteCount); + } + } + + if (inRenderPass && _imageBarriers.Count > 0) + { + // Image barriers queued in the batch are meant to be globally scoped, + // but inside a render pass they're scoped to just the range of the render pass. + + // On MoltenVK, we just break the rules and always use image barrier. + // On desktop GPUs, all barriers are globally scoped, so we just replace it with a generic memory barrier. + // Generally, we want to avoid this from happening in the future, so flag the texture to immediately + // emit a barrier whenever the current render pass is bound again. + + bool anyIsNonAttachment = false; + + foreach (BarrierWithStageFlags barrier in _imageBarriers) + { + // If the binding is an attachment, don't add it as a forced fence. + bool isAttachment = rpHolder.ContainsAttachment(barrier.Resource); + + if (!isAttachment) + { + rpHolder.AddForcedFence(barrier.Resource, barrier.Flags.Dest); + anyIsNonAttachment = true; + } + } + + if (_gd.IsTBDR) + { + if (!_gd.IsMoltenVk) + { + if (!anyIsNonAttachment) + { + // This case is a feedback loop. To prevent this from causing an absolute performance disaster, + // remove the barriers entirely. + // If this is not here, there will be a lot of single draw render passes. + // TODO: explicit handling for feedback loops, likely outside this class. + + _queuedBarrierCount -= _imageBarriers.Count; + _imageBarriers.Clear(); + } + else + { + // TBDR GPUs are sensitive to barriers, so we need to end the pass to ensure the data is available. + // Metal already has hazard tracking so MVK doesn't need this. + endRenderPass(); + inRenderPass = false; + } + } + } + else + { + // Generic pipeline memory barriers will work for desktop GPUs. + // They do require a few more access flags on the subpass dependency, though. + foreach (var barrier in _imageBarriers) + { + _memoryBarriers.Add(new BarrierWithStageFlags( + barrier.Flags, + new MemoryBarrier() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = barrier.Barrier.SrcAccessMask, + DstAccessMask = barrier.Barrier.DstAccessMask + })); + } + + _imageBarriers.Clear(); + } + } + + if (inRenderPass && _memoryBarriers.Count > 0) + { + PipelineStageFlags allFlags = PipelineStageFlags.None; + + foreach (var barrier in _memoryBarriers) + { + allFlags |= barrier.Flags.Dest; + } + + if (allFlags.HasFlag(PipelineStageFlags.DrawIndirectBit) || !_gd.SupportsRenderPassBarrier(allFlags)) + { + endRenderPass(); + inRenderPass = false; + } + } + + AddBarriers(_memoryBarrierBatch.AsSpan(), ref _queuedBarrierCount, ref hasBarrier, ref flags, ref memoryCount, _memoryBarriers); + AddBarriers(_bufferBarrierBatch.AsSpan(), ref _queuedBarrierCount, ref hasBarrier, ref flags, ref bufferCount, _bufferBarriers); + AddBarriers(_imageBarrierBatch.AsSpan(), ref _queuedBarrierCount, ref hasBarrier, ref flags, ref imageCount, _imageBarriers); + + if (hasBarrier) + { + PipelineStageFlags srcStageFlags = flags.Source; + + if (inRenderPass) + { + // Inside a render pass, barrier stages can only be from rasterization. + srcStageFlags &= ~PipelineStageFlags.ComputeShaderBit; + } + + _gd.Api.CmdPipelineBarrier( + cbs.CommandBuffer, + srcStageFlags, + flags.Dest, + 0, + (uint)memoryCount, + _memoryBarrierBatch.Pointer, + (uint)bufferCount, + _bufferBarrierBatch.Pointer, + (uint)imageCount, + _imageBarrierBatch.Pointer); + } + } + } + + private void QueueIncoherentBarrier(IncoherentBarrierType type) + { + if (type > _queuedIncoherentBarrier) + { + _queuedIncoherentBarrier = type; + } + + _queuedFeedbackLoopBarrier = true; + } + + public void QueueTextureBarrier() + { + QueueIncoherentBarrier(IncoherentBarrierType.Texture); + } + + public void QueueMemoryBarrier() + { + QueueIncoherentBarrier(IncoherentBarrierType.All); + } + + public void QueueCommandBufferBarrier() + { + QueueIncoherentBarrier(IncoherentBarrierType.CommandBuffer); + } + + public void EnableTfbBarriers(bool enable) + { + if (enable) + { + _extraStages |= PipelineStageFlags.TransformFeedbackBitExt; + } + else + { + _extraStages &= ~PipelineStageFlags.TransformFeedbackBitExt; + } + } + + public void Dispose() + { + _memoryBarrierBatch.Dispose(); + _bufferBarrierBatch.Dispose(); + _imageBarrierBatch.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BitMap.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BitMap.cs new file mode 100644 index 000000000..384155cae --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BitMap.cs @@ -0,0 +1,157 @@ +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct BitMap + { + public const int IntSize = 64; + + private const int IntShift = 6; + private const int IntMask = IntSize - 1; + + private readonly long[] _masks; + + public BitMap(int count) + { + _masks = new long[(count + IntMask) / IntSize]; + } + + public bool AnySet() + { + for (int i = 0; i < _masks.Length; i++) + { + if (_masks[i] != 0) + { + return true; + } + } + + return false; + } + + public bool IsSet(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + return (_masks[wordIndex] & wordMask) != 0; + } + + public bool IsSet(int start, int end) + { + if (start == end) + { + return IsSet(start); + } + + int startIndex = start >> IntShift; + int startBit = start & IntMask; + long startMask = -1L << startBit; + + int endIndex = end >> IntShift; + int endBit = end & IntMask; + long endMask = (long)(ulong.MaxValue >> (IntMask - endBit)); + + if (startIndex == endIndex) + { + return (_masks[startIndex] & startMask & endMask) != 0; + } + + if ((_masks[startIndex] & startMask) != 0) + { + return true; + } + + for (int i = startIndex + 1; i < endIndex; i++) + { + if (_masks[i] != 0) + { + return true; + } + } + + if ((_masks[endIndex] & endMask) != 0) + { + return true; + } + + return false; + } + + public bool Set(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + if ((_masks[wordIndex] & wordMask) != 0) + { + return false; + } + + _masks[wordIndex] |= wordMask; + + return true; + } + + public void SetRange(int start, int end) + { + if (start == end) + { + Set(start); + return; + } + + int startIndex = start >> IntShift; + int startBit = start & IntMask; + long startMask = -1L << startBit; + + int endIndex = end >> IntShift; + int endBit = end & IntMask; + long endMask = (long)(ulong.MaxValue >> (IntMask - endBit)); + + if (startIndex == endIndex) + { + _masks[startIndex] |= startMask & endMask; + } + else + { + _masks[startIndex] |= startMask; + + for (int i = startIndex + 1; i < endIndex; i++) + { + _masks[i] |= -1; + } + + _masks[endIndex] |= endMask; + } + } + + public void Clear(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + _masks[wordIndex] &= ~wordMask; + } + + public void Clear() + { + for (int i = 0; i < _masks.Length; i++) + { + _masks[i] = 0; + } + } + + public void ClearInt(int start, int end) + { + for (int i = start; i <= end; i++) + { + _masks[i] = 0; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BitMapStruct.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BitMapStruct.cs new file mode 100644 index 000000000..453b581f9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BitMapStruct.cs @@ -0,0 +1,263 @@ +using Ryujinx.Common.Memory; +using System; +using System.Numerics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + interface IBitMapListener + { + void BitMapSignal(int index, int count); + } + + struct BitMapStruct where T : IArray + { + public const int IntSize = 64; + + private const int IntShift = 6; + private const int IntMask = IntSize - 1; + + private T _masks; + + public BitMapStruct() + { + _masks = default; + } + + public bool BecomesUnsetFrom(in BitMapStruct from, ref BitMapStruct into) + { + bool result = false; + + int masks = _masks.Length; + for (int i = 0; i < masks; i++) + { + long fromMask = from._masks[i]; + long unsetMask = (~fromMask) & (fromMask ^ _masks[i]); + into._masks[i] = unsetMask; + + result |= unsetMask != 0; + } + + return result; + } + + public void SetAndSignalUnset(in BitMapStruct from, ref T2 listener) where T2 : struct, IBitMapListener + { + BitMapStruct result = new(); + + if (BecomesUnsetFrom(from, ref result)) + { + // Iterate the set bits in the result, and signal them. + + int offset = 0; + int masks = _masks.Length; + ref T resultMasks = ref result._masks; + for (int i = 0; i < masks; i++) + { + long value = resultMasks[i]; + while (value != 0) + { + int bit = BitOperations.TrailingZeroCount((ulong)value); + + listener.BitMapSignal(offset + bit, 1); + + value &= ~(1L << bit); + } + + offset += IntSize; + } + } + + _masks = from._masks; + } + + public void SignalSet(Action action) + { + // Iterate the set bits in the result, and signal them. + + int offset = 0; + int masks = _masks.Length; + for (int i = 0; i < masks; i++) + { + long value = _masks[i]; + while (value != 0) + { + int bit = BitOperations.TrailingZeroCount((ulong)value); + + action(offset + bit, 1); + + value &= ~(1L << bit); + } + + offset += IntSize; + } + } + + public bool AnySet() + { + for (int i = 0; i < _masks.Length; i++) + { + if (_masks[i] != 0) + { + return true; + } + } + + return false; + } + + public bool IsSet(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + return (_masks[wordIndex] & wordMask) != 0; + } + + public bool IsSet(int start, int end) + { + if (start == end) + { + return IsSet(start); + } + + int startIndex = start >> IntShift; + int startBit = start & IntMask; + long startMask = -1L << startBit; + + int endIndex = end >> IntShift; + int endBit = end & IntMask; + long endMask = (long)(ulong.MaxValue >> (IntMask - endBit)); + + if (startIndex == endIndex) + { + return (_masks[startIndex] & startMask & endMask) != 0; + } + + if ((_masks[startIndex] & startMask) != 0) + { + return true; + } + + for (int i = startIndex + 1; i < endIndex; i++) + { + if (_masks[i] != 0) + { + return true; + } + } + + if ((_masks[endIndex] & endMask) != 0) + { + return true; + } + + return false; + } + + public bool Set(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + if ((_masks[wordIndex] & wordMask) != 0) + { + return false; + } + + _masks[wordIndex] |= wordMask; + + return true; + } + + public void Set(int bit, bool value) + { + if (value) + { + Set(bit); + } + else + { + Clear(bit); + } + } + + public void SetRange(int start, int end) + { + if (start == end) + { + Set(start); + return; + } + + int startIndex = start >> IntShift; + int startBit = start & IntMask; + long startMask = -1L << startBit; + + int endIndex = end >> IntShift; + int endBit = end & IntMask; + long endMask = (long)(ulong.MaxValue >> (IntMask - endBit)); + + if (startIndex == endIndex) + { + _masks[startIndex] |= startMask & endMask; + } + else + { + _masks[startIndex] |= startMask; + + for (int i = startIndex + 1; i < endIndex; i++) + { + _masks[i] |= -1L; + } + + _masks[endIndex] |= endMask; + } + } + + public BitMapStruct Union(BitMapStruct other) + { + var result = new BitMapStruct(); + + ref var masks = ref _masks; + ref var otherMasks = ref other._masks; + ref var newMasks = ref result._masks; + + for (int i = 0; i < masks.Length; i++) + { + newMasks[i] = masks[i] | otherMasks[i]; + } + + return result; + } + + public void Clear(int bit) + { + int wordIndex = bit >> IntShift; + int wordBit = bit & IntMask; + + long wordMask = 1L << wordBit; + + _masks[wordIndex] &= ~wordMask; + } + + public void Clear() + { + for (int i = 0; i < _masks.Length; i++) + { + _masks[i] = 0; + } + } + + public void ClearInt(int start, int end) + { + for (int i = start; i <= end; i++) + { + _masks[i] = 0; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferAllocationType.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferAllocationType.cs new file mode 100644 index 000000000..29b64a585 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferAllocationType.cs @@ -0,0 +1,13 @@ +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal enum BufferAllocationType + { + Auto = 0, + + HostMappedNoCache, + HostMapped, + DeviceLocal, + DeviceLocalMapped, + Sparse, + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferHolder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferHolder.cs new file mode 100644 index 000000000..adb37fc42 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferHolder.cs @@ -0,0 +1,922 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; +using VkBuffer = Silk.NET.Vulkan.Buffer; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class BufferHolder : IDisposable, IMirrorable, IMirrorable + { + private const int MaxUpdateBufferSize = 0x10000; + + private const int SetCountThreshold = 100; + private const int WriteCountThreshold = 50; + private const int FlushCountThreshold = 5; + + public const int DeviceLocalSizeThreshold = 256 * 1024; // 256kb + + public const AccessFlags DefaultAccessFlags = + AccessFlags.IndirectCommandReadBit | + AccessFlags.ShaderReadBit | + AccessFlags.ShaderWriteBit | + AccessFlags.TransferReadBit | + AccessFlags.TransferWriteBit | + AccessFlags.UniformReadBit; + + private readonly VulkanRenderer _gd; + private readonly Device _device; + private readonly MemoryAllocation _allocation; + private readonly Auto _buffer; + private readonly Auto _allocationAuto; + private readonly bool _allocationImported; + private readonly ulong _bufferHandle; + + private CacheByRange _cachedConvertedBuffers; + + public int Size { get; } + + private readonly nint _map; + + private readonly MultiFenceHolder _waitable; + + private bool _lastAccessIsWrite; + + private readonly BufferAllocationType _baseType; + private readonly BufferAllocationType _activeType; + + private readonly ReaderWriterLockSlim _flushLock; + private FenceHolder _flushFence; + private int _flushWaiting; + + private byte[] _pendingData; + private BufferMirrorRangeList _pendingDataRanges; + private Dictionary _mirrors; + private bool _useMirrors; + + public BufferHolder(VulkanRenderer gd, Device device, VkBuffer buffer, MemoryAllocation allocation, int size, BufferAllocationType type, BufferAllocationType currentType) + { + _gd = gd; + _device = device; + _allocation = allocation; + _allocationAuto = new Auto(allocation); + _waitable = new MultiFenceHolder(size); + _buffer = new Auto(new DisposableBuffer(gd.Api, device, buffer), this, _waitable, _allocationAuto); + _bufferHandle = buffer.Handle; + Size = size; + _map = allocation.HostPointer; + + _baseType = type; + _activeType = currentType; + + _flushLock = new ReaderWriterLockSlim(); + _useMirrors = gd.IsTBDR; + } + + public BufferHolder(VulkanRenderer gd, Device device, VkBuffer buffer, Auto allocation, int size, BufferAllocationType type, BufferAllocationType currentType, int offset) + { + _gd = gd; + _device = device; + _allocation = allocation.GetUnsafe(); + _allocationAuto = allocation; + _allocationImported = true; + _waitable = new MultiFenceHolder(size); + _buffer = new Auto(new DisposableBuffer(gd.Api, device, buffer), this, _waitable, _allocationAuto); + _bufferHandle = buffer.Handle; + Size = size; + _map = _allocation.HostPointer + offset; + + _baseType = type; + _activeType = currentType; + + _flushLock = new ReaderWriterLockSlim(); + } + + public BufferHolder(VulkanRenderer gd, Device device, VkBuffer buffer, int size, Auto[] storageAllocations) + { + _gd = gd; + _device = device; + _waitable = new MultiFenceHolder(size); + _buffer = new Auto(new DisposableBuffer(gd.Api, device, buffer), _waitable, storageAllocations); + _bufferHandle = buffer.Handle; + Size = size; + + _baseType = BufferAllocationType.Sparse; + _activeType = BufferAllocationType.Sparse; + + _flushLock = new ReaderWriterLockSlim(); + } + + public unsafe Auto CreateView(VkFormat format, int offset, int size, Action invalidateView) + { + var bufferViewCreateInfo = new BufferViewCreateInfo + { + SType = StructureType.BufferViewCreateInfo, + Buffer = new VkBuffer(_bufferHandle), + Format = format, + Offset = (uint)offset, + Range = (uint)size, + }; + + _gd.Api.CreateBufferView(_device, in bufferViewCreateInfo, null, out var bufferView).ThrowOnError(); + + return new Auto(new DisposableBufferView(_gd.Api, _device, bufferView), this, _waitable, _buffer); + } + + public unsafe void InsertBarrier(CommandBuffer commandBuffer, bool isWrite) + { + // If the last access is write, we always need a barrier to be sure we will read or modify + // the correct data. + // If the last access is read, and current one is a write, we need to wait until the + // read finishes to avoid overwriting data still in use. + // Otherwise, if the last access is a read and the current one too, we don't need barriers. + bool needsBarrier = isWrite || _lastAccessIsWrite; + + _lastAccessIsWrite = isWrite; + + if (needsBarrier) + { + MemoryBarrier memoryBarrier = new() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = DefaultAccessFlags, + DstAccessMask = DefaultAccessFlags, + }; + + _gd.Api.CmdPipelineBarrier( + commandBuffer, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.AllCommandsBit, + DependencyFlags.DeviceGroupBit, + 1, + in memoryBarrier, + 0, + null, + 0, + null); + } + } + + private static ulong ToMirrorKey(int offset, int size) + { + return ((ulong)offset << 32) | (uint)size; + } + + private static (int offset, int size) FromMirrorKey(ulong key) + { + return ((int)(key >> 32), (int)key); + } + + private unsafe bool TryGetMirror(CommandBufferScoped cbs, ref int offset, int size, out Auto buffer) + { + size = Math.Min(size, Size - offset); + + // Does this binding need to be mirrored? + + if (!_pendingDataRanges.OverlapsWith(offset, size)) + { + buffer = null; + return false; + } + + var key = ToMirrorKey(offset, size); + + if (_mirrors.TryGetValue(key, out StagingBufferReserved reserved)) + { + buffer = reserved.Buffer.GetBuffer(); + offset = reserved.Offset; + + return true; + } + + // Is this mirror allowed to exist? Can't be used for write in any in-flight write. + if (_waitable.IsBufferRangeInUse(offset, size, true)) + { + // Some of the data is not mirrorable, so upload the whole range. + ClearMirrors(cbs, offset, size); + + buffer = null; + return false; + } + + // Build data for the new mirror. + + var baseData = new Span((void*)(_map + offset), size); + var modData = _pendingData.AsSpan(offset, size); + + StagingBufferReserved? newMirror = _gd.BufferManager.StagingBuffer.TryReserveData(cbs, size); + + if (newMirror != null) + { + var mirror = newMirror.Value; + _pendingDataRanges.FillData(baseData, modData, offset, new Span((void*)(mirror.Buffer._map + mirror.Offset), size)); + + if (_mirrors.Count == 0) + { + _gd.PipelineInternal.RegisterActiveMirror(this); + } + + _mirrors.Add(key, mirror); + + buffer = mirror.Buffer.GetBuffer(); + offset = mirror.Offset; + + return true; + } + else + { + // Data could not be placed on the mirror, likely out of space. Force the data to flush. + ClearMirrors(cbs, offset, size); + + buffer = null; + return false; + } + } + + public Auto GetBuffer() + { + return _buffer; + } + + public Auto GetBuffer(CommandBuffer commandBuffer, bool isWrite = false, bool isSSBO = false) + { + if (isWrite) + { + SignalWrite(0, Size); + } + + return _buffer; + } + + public Auto GetBuffer(CommandBuffer commandBuffer, int offset, int size, bool isWrite = false) + { + if (isWrite) + { + SignalWrite(offset, size); + } + + return _buffer; + } + + public Auto GetMirrorable(CommandBufferScoped cbs, ref int offset, int size, out bool mirrored) + { + if (_pendingData != null && TryGetMirror(cbs, ref offset, size, out Auto result)) + { + mirrored = true; + return result; + } + + mirrored = false; + return _buffer; + } + + Auto IMirrorable.GetMirrorable(CommandBufferScoped cbs, ref int offset, int size, out bool mirrored) + { + // Cannot mirror buffer views right now. + + throw new NotImplementedException(); + } + + public void ClearMirrors() + { + // Clear mirrors without forcing a flush. This happens when the command buffer is switched, + // as all reserved areas on the staging buffer are released. + + if (_pendingData != null) + { + _mirrors.Clear(); + }; + } + + public void ClearMirrors(CommandBufferScoped cbs, int offset, int size) + { + // Clear mirrors in the given range, and submit overlapping pending data. + + if (_pendingData != null) + { + bool hadMirrors = _mirrors.Count > 0 && RemoveOverlappingMirrors(offset, size); + + if (_pendingDataRanges.Count() != 0) + { + UploadPendingData(cbs, offset, size); + } + + if (hadMirrors) + { + _gd.PipelineInternal.Rebind(_buffer, offset, size); + } + }; + } + + public void UseMirrors() + { + _useMirrors = true; + } + + private void UploadPendingData(CommandBufferScoped cbs, int offset, int size) + { + var ranges = _pendingDataRanges.FindOverlaps(offset, size); + + if (ranges != null) + { + _pendingDataRanges.Remove(offset, size); + + foreach (var range in ranges) + { + int rangeOffset = Math.Max(offset, range.Offset); + int rangeSize = Math.Min(offset + size, range.End) - rangeOffset; + + if (_gd.PipelineInternal.CurrentCommandBuffer.CommandBuffer.Handle == cbs.CommandBuffer.Handle) + { + SetData(rangeOffset, _pendingData.AsSpan(rangeOffset, rangeSize), cbs, _gd.PipelineInternal.EndRenderPassDelegate, false); + } + else + { + SetData(rangeOffset, _pendingData.AsSpan(rangeOffset, rangeSize), cbs, null, false); + } + } + } + } + + public Auto GetAllocation() + { + return _allocationAuto; + } + + public (DeviceMemory, ulong) GetDeviceMemoryAndOffset() + { + return (_allocation.Memory, _allocation.Offset); + } + + public void SignalWrite(int offset, int size) + { + if (offset == 0 && size == Size) + { + _cachedConvertedBuffers.Clear(); + } + else + { + _cachedConvertedBuffers.ClearRange(offset, size); + } + } + + public BufferHandle GetHandle() + { + var handle = _bufferHandle; + return Unsafe.As(ref handle); + } + + public nint Map(int offset, int mappingSize) + { + return _map; + } + + private void ClearFlushFence() + { + // Assumes _flushLock is held as writer. + + if (_flushFence != null) + { + if (_flushWaiting == 0) + { + _flushFence.Put(); + } + + _flushFence = null; + } + } + + private void WaitForFlushFence() + { + if (_flushFence == null) + { + return; + } + + // If storage has changed, make sure the fence has been reached so that the data is in place. + _flushLock.ExitReadLock(); + _flushLock.EnterWriteLock(); + + if (_flushFence != null) + { + var fence = _flushFence; + Interlocked.Increment(ref _flushWaiting); + + // Don't wait in the lock. + + _flushLock.ExitWriteLock(); + + fence.Wait(); + + _flushLock.EnterWriteLock(); + + if (Interlocked.Decrement(ref _flushWaiting) == 0) + { + fence.Put(); + } + + _flushFence = null; + } + + // Assumes the _flushLock is held as reader, returns in same state. + _flushLock.ExitWriteLock(); + _flushLock.EnterReadLock(); + } + + public PinnedSpan GetData(int offset, int size) + { + _flushLock.EnterReadLock(); + + WaitForFlushFence(); + + Span result; + + if (_map != nint.Zero) + { + result = GetDataStorage(offset, size); + + // Need to be careful here, the buffer can't be unmapped while the data is being used. + _buffer.IncrementReferenceCount(); + + _flushLock.ExitReadLock(); + + return PinnedSpan.UnsafeFromSpan(result, _buffer.DecrementReferenceCount); + } + + BackgroundResource resource = _gd.BackgroundResources.Get(); + + if (_gd.CommandBufferPool.OwnedByCurrentThread) + { + _gd.FlushAllCommands(); + + result = resource.GetFlushBuffer().GetBufferData(_gd.CommandBufferPool, this, offset, size); + } + else + { + result = resource.GetFlushBuffer().GetBufferData(resource.GetPool(), this, offset, size); + } + + _flushLock.ExitReadLock(); + + // Flush buffer is pinned until the next GetBufferData on the thread, which is fine for current uses. + return PinnedSpan.UnsafeFromSpan(result); + } + + public unsafe Span GetDataStorage(int offset, int size) + { + int mappingSize = Math.Min(size, Size - offset); + + if (_map != nint.Zero) + { + return new Span((void*)(_map + offset), mappingSize); + } + + throw new InvalidOperationException("The buffer is not host mapped."); + } + + public bool RemoveOverlappingMirrors(int offset, int size) + { + List toRemove = null; + foreach (var key in _mirrors.Keys) + { + (int keyOffset, int keySize) = FromMirrorKey(key); + if (!(offset + size <= keyOffset || offset >= keyOffset + keySize)) + { + toRemove ??= new List(); + + toRemove.Add(key); + } + } + + if (toRemove != null) + { + foreach (var key in toRemove) + { + _mirrors.Remove(key); + } + + return true; + } + + return false; + } + + public unsafe void SetData(int offset, ReadOnlySpan data, CommandBufferScoped? cbs = null, Action endRenderPass = null, bool allowCbsWait = true) + { + int dataSize = Math.Min(data.Length, Size - offset); + if (dataSize == 0) + { + return; + } + + bool allowMirror = _useMirrors && allowCbsWait && cbs != null && _activeType <= BufferAllocationType.HostMapped; + + if (_map != nint.Zero) + { + // If persistently mapped, set the data directly if the buffer is not currently in use. + bool isRented = _buffer.HasRentedCommandBufferDependency(_gd.CommandBufferPool); + + // If the buffer is rented, take a little more time and check if the use overlaps this handle. + bool needsFlush = isRented && _waitable.IsBufferRangeInUse(offset, dataSize, false); + + if (!needsFlush) + { + WaitForFences(offset, dataSize); + + data[..dataSize].CopyTo(new Span((void*)(_map + offset), dataSize)); + + if (_pendingData != null) + { + bool removed = _pendingDataRanges.Remove(offset, dataSize); + if (RemoveOverlappingMirrors(offset, dataSize) || removed) + { + // If any mirrors were removed, rebind the buffer range. + _gd.PipelineInternal.Rebind(_buffer, offset, dataSize); + } + } + + SignalWrite(offset, dataSize); + + return; + } + } + + // If the buffer does not have an in-flight write (including an inline update), then upload data to a pendingCopy. + if (allowMirror && !_waitable.IsBufferRangeInUse(offset, dataSize, true)) + { + if (_pendingData == null) + { + _pendingData = new byte[Size]; + _mirrors = new Dictionary(); + } + + data[..dataSize].CopyTo(_pendingData.AsSpan(offset, dataSize)); + _pendingDataRanges.Add(offset, dataSize); + + // Remove any overlapping mirrors. + RemoveOverlappingMirrors(offset, dataSize); + + // Tell the graphics device to rebind any constant buffer that overlaps the newly modified range, as it should access a mirror. + _gd.PipelineInternal.Rebind(_buffer, offset, dataSize); + + return; + } + + if (_pendingData != null) + { + _pendingDataRanges.Remove(offset, dataSize); + } + + if (cbs != null && + _gd.PipelineInternal.RenderPassActive && + !(_buffer.HasCommandBufferDependency(cbs.Value) && + _waitable.IsBufferRangeInUse(cbs.Value.CommandBufferIndex, offset, dataSize))) + { + // If the buffer hasn't been used on the command buffer yet, try to preload the data. + // This avoids ending and beginning render passes on each buffer data upload. + + cbs = _gd.PipelineInternal.GetPreloadCommandBuffer(); + endRenderPass = null; + } + + if (cbs == null || + !VulkanConfiguration.UseFastBufferUpdates || + data.Length > MaxUpdateBufferSize || + !TryPushData(cbs.Value, endRenderPass, offset, data)) + { + if (allowCbsWait) + { + _gd.BufferManager.StagingBuffer.PushData(_gd.CommandBufferPool, cbs, endRenderPass, this, offset, data); + } + else + { + bool rentCbs = cbs == null; + if (rentCbs) + { + cbs = _gd.CommandBufferPool.Rent(); + } + + if (!_gd.BufferManager.StagingBuffer.TryPushData(cbs.Value, endRenderPass, this, offset, data)) + { + // Need to do a slow upload. + BufferHolder srcHolder = _gd.BufferManager.Create(_gd, dataSize, baseType: BufferAllocationType.HostMapped); + srcHolder.SetDataUnchecked(0, data); + + var srcBuffer = srcHolder.GetBuffer(); + var dstBuffer = this.GetBuffer(cbs.Value.CommandBuffer, true); + + Copy(_gd, cbs.Value, srcBuffer, dstBuffer, 0, offset, dataSize); + + srcHolder.Dispose(); + } + + if (rentCbs) + { + cbs.Value.Dispose(); + } + } + } + } + + public unsafe void SetDataUnchecked(int offset, ReadOnlySpan data) + { + int dataSize = Math.Min(data.Length, Size - offset); + if (dataSize == 0) + { + return; + } + + if (_map != nint.Zero) + { + data[..dataSize].CopyTo(new Span((void*)(_map + offset), dataSize)); + } + else + { + _gd.BufferManager.StagingBuffer.PushData(_gd.CommandBufferPool, null, null, this, offset, data); + } + } + + public unsafe void SetDataUnchecked(int offset, ReadOnlySpan data) where T : unmanaged + { + SetDataUnchecked(offset, MemoryMarshal.AsBytes(data)); + } + + public void SetDataInline(CommandBufferScoped cbs, Action endRenderPass, int dstOffset, ReadOnlySpan data) + { + if (!TryPushData(cbs, endRenderPass, dstOffset, data)) + { + throw new ArgumentException($"Invalid offset 0x{dstOffset:X} or data size 0x{data.Length:X}."); + } + } + + private unsafe bool TryPushData(CommandBufferScoped cbs, Action endRenderPass, int dstOffset, ReadOnlySpan data) + { + if ((dstOffset & 3) != 0 || (data.Length & 3) != 0) + { + return false; + } + + endRenderPass?.Invoke(); + + var dstBuffer = GetBuffer(cbs.CommandBuffer, dstOffset, data.Length, true).Get(cbs, dstOffset, data.Length, true).Value; + + InsertBufferBarrier( + _gd, + cbs.CommandBuffer, + dstBuffer, + DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + dstOffset, + data.Length); + + fixed (byte* pData = data) + { + for (ulong offset = 0; offset < (ulong)data.Length;) + { + ulong size = Math.Min(MaxUpdateBufferSize, (ulong)data.Length - offset); + _gd.Api.CmdUpdateBuffer(cbs.CommandBuffer, dstBuffer, (ulong)dstOffset + offset, size, pData + offset); + offset += size; + } + } + + InsertBufferBarrier( + _gd, + cbs.CommandBuffer, + dstBuffer, + AccessFlags.TransferWriteBit, + DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + dstOffset, + data.Length); + + return true; + } + + public static unsafe void Copy( + VulkanRenderer gd, + CommandBufferScoped cbs, + Auto src, + Auto dst, + int srcOffset, + int dstOffset, + int size, + bool registerSrcUsage = true) + { + var srcBuffer = registerSrcUsage ? src.Get(cbs, srcOffset, size).Value : src.GetUnsafe().Value; + var dstBuffer = dst.Get(cbs, dstOffset, size, true).Value; + + InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + dstOffset, + size); + + var region = new BufferCopy((ulong)srcOffset, (ulong)dstOffset, (ulong)size); + + gd.Api.CmdCopyBuffer(cbs.CommandBuffer, srcBuffer, dstBuffer, 1, ®ion); + + InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + AccessFlags.TransferWriteBit, + DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + dstOffset, + size); + } + + public static unsafe void InsertBufferBarrier( + VulkanRenderer gd, + CommandBuffer commandBuffer, + VkBuffer buffer, + AccessFlags srcAccessMask, + AccessFlags dstAccessMask, + PipelineStageFlags srcStageMask, + PipelineStageFlags dstStageMask, + int offset, + int size) + { + BufferMemoryBarrier memoryBarrier = new() + { + SType = StructureType.BufferMemoryBarrier, + SrcAccessMask = srcAccessMask, + DstAccessMask = dstAccessMask, + SrcQueueFamilyIndex = Vk.QueueFamilyIgnored, + DstQueueFamilyIndex = Vk.QueueFamilyIgnored, + Buffer = buffer, + Offset = (ulong)offset, + Size = (ulong)size, + }; + + gd.Api.CmdPipelineBarrier( + commandBuffer, + srcStageMask, + dstStageMask, + 0, + 0, + null, + 1, + in memoryBarrier, + 0, + null); + } + + public void WaitForFences() + { + _waitable.WaitForFences(_gd.Api, _device); + } + + public void WaitForFences(int offset, int size) + { + _waitable.WaitForFences(_gd.Api, _device, offset, size); + } + + private bool BoundToRange(int offset, ref int size) + { + if (offset >= Size) + { + return false; + } + + size = Math.Min(Size - offset, size); + + return true; + } + + public Auto GetBufferI8ToI16(CommandBufferScoped cbs, int offset, int size) + { + if (!BoundToRange(offset, ref size)) + { + return null; + } + + var key = new I8ToI16CacheKey(_gd); + + if (!_cachedConvertedBuffers.TryGetValue(offset, size, key, out var holder)) + { + holder = _gd.BufferManager.Create(_gd, (size * 2 + 3) & ~3, baseType: BufferAllocationType.DeviceLocal); + + _gd.PipelineInternal.EndRenderPass(); + _gd.HelperShader.ConvertI8ToI16(_gd, cbs, this, holder, offset, size); + + key.SetBuffer(holder.GetBuffer()); + + _cachedConvertedBuffers.Add(offset, size, key, holder); + } + + return holder.GetBuffer(); + } + + public Auto GetAlignedVertexBuffer(CommandBufferScoped cbs, int offset, int size, int stride, int alignment) + { + if (!BoundToRange(offset, ref size)) + { + return null; + } + + var key = new AlignedVertexBufferCacheKey(_gd, stride, alignment); + + if (!_cachedConvertedBuffers.TryGetValue(offset, size, key, out var holder)) + { + int alignedStride = (stride + (alignment - 1)) & -alignment; + + holder = _gd.BufferManager.Create(_gd, (size / stride) * alignedStride, baseType: BufferAllocationType.DeviceLocal); + + _gd.PipelineInternal.EndRenderPass(); + _gd.HelperShader.ChangeStride(_gd, cbs, this, holder, offset, size, stride, alignedStride); + + key.SetBuffer(holder.GetBuffer()); + + _cachedConvertedBuffers.Add(offset, size, key, holder); + } + + return holder.GetBuffer(); + } + + public Auto GetBufferTopologyConversion(CommandBufferScoped cbs, int offset, int size, IndexBufferPattern pattern, int indexSize) + { + if (!BoundToRange(offset, ref size)) + { + return null; + } + + var key = new TopologyConversionCacheKey(_gd, pattern, indexSize); + + if (!_cachedConvertedBuffers.TryGetValue(offset, size, key, out var holder)) + { + // The destination index size is always I32. + + int indexCount = size / indexSize; + + int convertedCount = pattern.GetConvertedCount(indexCount); + + holder = _gd.BufferManager.Create(_gd, convertedCount * 4, baseType: BufferAllocationType.DeviceLocal); + + _gd.PipelineInternal.EndRenderPass(); + _gd.HelperShader.ConvertIndexBuffer(_gd, cbs, this, holder, pattern, indexSize, offset, indexCount); + + key.SetBuffer(holder.GetBuffer()); + + _cachedConvertedBuffers.Add(offset, size, key, holder); + } + + return holder.GetBuffer(); + } + + public bool TryGetCachedConvertedBuffer(int offset, int size, ICacheKey key, out BufferHolder holder) + { + return _cachedConvertedBuffers.TryGetValue(offset, size, key, out holder); + } + + public void AddCachedConvertedBuffer(int offset, int size, ICacheKey key, BufferHolder holder) + { + _cachedConvertedBuffers.Add(offset, size, key, holder); + } + + public void AddCachedConvertedBufferDependency(int offset, int size, ICacheKey key, Dependency dependency) + { + _cachedConvertedBuffers.AddDependency(offset, size, key, dependency); + } + + public void RemoveCachedConvertedBuffer(int offset, int size, ICacheKey key) + { + _cachedConvertedBuffers.Remove(offset, size, key); + } + + public void Dispose() + { + _gd.PipelineInternal?.FlushCommandsIfWeightExceeding(_buffer, (ulong)Size); + + _buffer.Dispose(); + _cachedConvertedBuffers.Dispose(); + if (_allocationImported) + { + _allocationAuto.DecrementReferenceCount(); + } + else + { + _allocationAuto?.Dispose(); + } + + _flushLock.EnterWriteLock(); + + ClearFlushFence(); + + _flushLock.ExitWriteLock(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferManager.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferManager.cs new file mode 100644 index 000000000..0b9f62b6e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferManager.cs @@ -0,0 +1,679 @@ +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using VkBuffer = Silk.NET.Vulkan.Buffer; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct ScopedTemporaryBuffer : IDisposable + { + private readonly BufferManager _bufferManager; + private readonly bool _isReserved; + + public readonly BufferRange Range; + public readonly BufferHolder Holder; + + public BufferHandle Handle => Range.Handle; + public int Offset => Range.Offset; + + public ScopedTemporaryBuffer(BufferManager bufferManager, BufferHolder holder, BufferHandle handle, int offset, int size, bool isReserved) + { + _bufferManager = bufferManager; + + Range = new BufferRange(handle, offset, size); + Holder = holder; + + _isReserved = isReserved; + } + + public void Dispose() + { + if (!_isReserved) + { + _bufferManager.Delete(Range.Handle); + } + } + } + + class BufferManager : IDisposable + { + public const MemoryPropertyFlags DefaultBufferMemoryFlags = + MemoryPropertyFlags.HostVisibleBit | + MemoryPropertyFlags.HostCoherentBit | + MemoryPropertyFlags.HostCachedBit; + + // Some drivers don't expose a "HostCached" memory type, + // so we need those alternative flags for the allocation to succeed there. + private const MemoryPropertyFlags DefaultBufferMemoryNoCacheFlags = + MemoryPropertyFlags.HostVisibleBit | + MemoryPropertyFlags.HostCoherentBit; + + private const MemoryPropertyFlags DeviceLocalBufferMemoryFlags = + MemoryPropertyFlags.DeviceLocalBit; + + private const MemoryPropertyFlags DeviceLocalMappedBufferMemoryFlags = + MemoryPropertyFlags.DeviceLocalBit | + MemoryPropertyFlags.HostVisibleBit | + MemoryPropertyFlags.HostCoherentBit; + + private const BufferUsageFlags DefaultBufferUsageFlags = + BufferUsageFlags.TransferSrcBit | + BufferUsageFlags.TransferDstBit | + BufferUsageFlags.UniformTexelBufferBit | + BufferUsageFlags.StorageTexelBufferBit | + BufferUsageFlags.UniformBufferBit | + BufferUsageFlags.StorageBufferBit | + BufferUsageFlags.IndexBufferBit | + BufferUsageFlags.VertexBufferBit | + BufferUsageFlags.TransformFeedbackBufferBitExt; + + private const BufferUsageFlags HostImportedBufferUsageFlags = + BufferUsageFlags.TransferSrcBit | + BufferUsageFlags.TransferDstBit; + + private readonly Device _device; + + private readonly IdList _buffers; + + public int BufferCount { get; private set; } + + public StagingBuffer StagingBuffer { get; } + + public MemoryRequirements HostImportedBufferMemoryRequirements { get; } + + public BufferManager(VulkanRenderer gd, Device device) + { + _device = device; + _buffers = new IdList(); + StagingBuffer = new StagingBuffer(gd, this); + + HostImportedBufferMemoryRequirements = GetHostImportedUsageRequirements(gd); + } + + public unsafe BufferHandle CreateHostImported(VulkanRenderer gd, nint pointer, int size) + { + var usage = HostImportedBufferUsageFlags; + + if (gd.Capabilities.SupportsIndirectParameters) + { + usage |= BufferUsageFlags.IndirectBufferBit; + } + + var externalMemoryBuffer = new ExternalMemoryBufferCreateInfo + { + SType = StructureType.ExternalMemoryBufferCreateInfo, + HandleTypes = ExternalMemoryHandleTypeFlags.HostAllocationBitExt, + }; + + var bufferCreateInfo = new BufferCreateInfo + { + SType = StructureType.BufferCreateInfo, + Size = (ulong)size, + Usage = usage, + SharingMode = SharingMode.Exclusive, + PNext = &externalMemoryBuffer, + }; + + gd.Api.CreateBuffer(_device, in bufferCreateInfo, null, out var buffer).ThrowOnError(); + + (Auto allocation, ulong offset) = gd.HostMemoryAllocator.GetExistingAllocation(pointer, (ulong)size); + + gd.Api.BindBufferMemory(_device, buffer, allocation.GetUnsafe().Memory, allocation.GetUnsafe().Offset + offset); + + var holder = new BufferHolder(gd, _device, buffer, allocation, size, BufferAllocationType.HostMapped, BufferAllocationType.HostMapped, (int)offset); + + BufferCount++; + + ulong handle64 = (uint)_buffers.Add(holder); + + return Unsafe.As(ref handle64); + } + + public unsafe BufferHandle CreateSparse(VulkanRenderer gd, ReadOnlySpan storageBuffers) + { + var usage = DefaultBufferUsageFlags; + + if (gd.Capabilities.SupportsIndirectParameters) + { + usage |= BufferUsageFlags.IndirectBufferBit; + } + + ulong size = 0; + + foreach (BufferRange range in storageBuffers) + { + size += (ulong)range.Size; + } + + var bufferCreateInfo = new BufferCreateInfo() + { + SType = StructureType.BufferCreateInfo, + Size = size, + Usage = usage, + SharingMode = SharingMode.Exclusive, + Flags = BufferCreateFlags.SparseBindingBit | BufferCreateFlags.SparseAliasedBit + }; + + gd.Api.CreateBuffer(_device, in bufferCreateInfo, null, out var buffer).ThrowOnError(); + + var memoryBinds = new SparseMemoryBind[storageBuffers.Length]; + var storageAllocations = new Auto[storageBuffers.Length]; + int storageAllocationsCount = 0; + + ulong dstOffset = 0; + + for (int index = 0; index < storageBuffers.Length; index++) + { + BufferRange range = storageBuffers[index]; + + if (TryGetBuffer(range.Handle, out var existingHolder)) + { + (var memory, var offset) = existingHolder.GetDeviceMemoryAndOffset(); + + memoryBinds[index] = new SparseMemoryBind() + { + ResourceOffset = dstOffset, + Size = (ulong)range.Size, + Memory = memory, + MemoryOffset = offset + (ulong)range.Offset, + Flags = SparseMemoryBindFlags.None + }; + + storageAllocations[storageAllocationsCount++] = existingHolder.GetAllocation(); + } + else + { + memoryBinds[index] = new SparseMemoryBind() + { + ResourceOffset = dstOffset, + Size = (ulong)range.Size, + Memory = default, + MemoryOffset = 0UL, + Flags = SparseMemoryBindFlags.None + }; + } + + dstOffset += (ulong)range.Size; + } + + if (storageAllocations.Length != storageAllocationsCount) + { + Array.Resize(ref storageAllocations, storageAllocationsCount); + } + + fixed (SparseMemoryBind* pMemoryBinds = memoryBinds) + { + SparseBufferMemoryBindInfo bufferBind = new SparseBufferMemoryBindInfo() + { + Buffer = buffer, + BindCount = (uint)memoryBinds.Length, + PBinds = pMemoryBinds + }; + + BindSparseInfo bindSparseInfo = new BindSparseInfo() + { + SType = StructureType.BindSparseInfo, + BufferBindCount = 1, + PBufferBinds = &bufferBind + }; + + gd.Api.QueueBindSparse(gd.Queue, 1, in bindSparseInfo, default).ThrowOnError(); + } + + var holder = new BufferHolder(gd, _device, buffer, (int)size, storageAllocations); + + BufferCount++; + + ulong handle64 = (uint)_buffers.Add(holder); + + return Unsafe.As(ref handle64); + } + + public BufferHandle CreateWithHandle( + VulkanRenderer gd, + int size, + bool sparseCompatible = false, + BufferAllocationType baseType = BufferAllocationType.HostMapped, + bool forceMirrors = false) + { + return CreateWithHandle(gd, size, out _, sparseCompatible, baseType, forceMirrors); + } + + public BufferHandle CreateWithHandle( + VulkanRenderer gd, + int size, + out BufferHolder holder, + bool sparseCompatible = false, + BufferAllocationType baseType = BufferAllocationType.HostMapped, + bool forceMirrors = false) + { + holder = Create(gd, size, forConditionalRendering: false, sparseCompatible, baseType); + if (holder == null) + { + return BufferHandle.Null; + } + + if (forceMirrors) + { + holder.UseMirrors(); + } + + BufferCount++; + + ulong handle64 = (uint)_buffers.Add(holder); + + return Unsafe.As(ref handle64); + } + + public ScopedTemporaryBuffer ReserveOrCreate(VulkanRenderer gd, CommandBufferScoped cbs, int size) + { + StagingBufferReserved? result = StagingBuffer.TryReserveData(cbs, size); + + if (result.HasValue) + { + return new ScopedTemporaryBuffer(this, result.Value.Buffer, StagingBuffer.Handle, result.Value.Offset, result.Value.Size, true); + } + else + { + // Create a temporary buffer. + BufferHandle handle = CreateWithHandle(gd, size, out BufferHolder holder); + + return new ScopedTemporaryBuffer(this, holder, handle, 0, size, false); + } + } + + public unsafe MemoryRequirements GetHostImportedUsageRequirements(VulkanRenderer gd) + { + var usage = HostImportedBufferUsageFlags; + + if (gd.Capabilities.SupportsIndirectParameters) + { + usage |= BufferUsageFlags.IndirectBufferBit; + } + + var bufferCreateInfo = new BufferCreateInfo + { + SType = StructureType.BufferCreateInfo, + Size = (ulong)Environment.SystemPageSize, + Usage = usage, + SharingMode = SharingMode.Exclusive, + }; + + gd.Api.CreateBuffer(_device, in bufferCreateInfo, null, out var buffer).ThrowOnError(); + + gd.Api.GetBufferMemoryRequirements(_device, buffer, out var requirements); + + gd.Api.DestroyBuffer(_device, buffer, null); + + return requirements; + } + + public unsafe (VkBuffer buffer, MemoryAllocation allocation, BufferAllocationType resultType) CreateBacking( + VulkanRenderer gd, + int size, + BufferAllocationType type, + bool forConditionalRendering = false, + bool sparseCompatible = false, + BufferAllocationType fallbackType = BufferAllocationType.Auto) + { + var usage = DefaultBufferUsageFlags; + + if (forConditionalRendering && gd.Capabilities.SupportsConditionalRendering) + { + usage |= BufferUsageFlags.ConditionalRenderingBitExt; + } + else if (gd.Capabilities.SupportsIndirectParameters) + { + usage |= BufferUsageFlags.IndirectBufferBit; + } + + var bufferCreateInfo = new BufferCreateInfo + { + SType = StructureType.BufferCreateInfo, + Size = (ulong)size, + Usage = usage, + SharingMode = SharingMode.Exclusive, + }; + + gd.Api.CreateBuffer(_device, in bufferCreateInfo, null, out var buffer).ThrowOnError(); + gd.Api.GetBufferMemoryRequirements(_device, buffer, out var requirements); + + if (sparseCompatible) + { + requirements.Alignment = Math.Max(requirements.Alignment, Constants.SparseBufferAlignment); + } + + MemoryAllocation allocation; + + do + { + var allocateFlags = type switch + { + BufferAllocationType.HostMappedNoCache => DefaultBufferMemoryNoCacheFlags, + BufferAllocationType.HostMapped => DefaultBufferMemoryFlags, + BufferAllocationType.DeviceLocal => DeviceLocalBufferMemoryFlags, + BufferAllocationType.DeviceLocalMapped => DeviceLocalMappedBufferMemoryFlags, + _ => DefaultBufferMemoryFlags, + }; + + // If an allocation with this memory type fails, fall back to the previous one. + try + { + allocation = gd.MemoryAllocator.AllocateDeviceMemory(requirements, allocateFlags, true); + } + catch (VulkanException) + { + allocation = default; + } + } + while (allocation.Memory.Handle == 0 && (--type != fallbackType)); + + if (allocation.Memory.Handle == 0UL) + { + gd.Api.DestroyBuffer(_device, buffer, null); + return default; + } + + gd.Api.BindBufferMemory(_device, buffer, allocation.Memory, allocation.Offset); + + return (buffer, allocation, type); + } + + public BufferHolder Create( + VulkanRenderer gd, + int size, + bool forConditionalRendering = false, + bool sparseCompatible = false, + BufferAllocationType baseType = BufferAllocationType.HostMapped) + { + BufferAllocationType type = baseType; + + if (baseType == BufferAllocationType.Auto) + { + type = BufferAllocationType.HostMapped; + } + + (VkBuffer buffer, MemoryAllocation allocation, BufferAllocationType resultType) = + CreateBacking(gd, size, type, forConditionalRendering, sparseCompatible); + + if (buffer.Handle != 0) + { + var holder = new BufferHolder(gd, _device, buffer, allocation, size, baseType, resultType); + + return holder; + } + + Logger.Error?.Print(LogClass.Gpu, $"Failed to create buffer with size 0x{size:X} and type \"{baseType}\"."); + + return null; + } + + public Auto CreateView(BufferHandle handle, VkFormat format, int offset, int size, Action invalidateView) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.CreateView(format, offset, size, invalidateView); + } + + return null; + } + + public Auto GetBuffer(CommandBuffer commandBuffer, BufferHandle handle, bool isWrite, bool isSSBO = false) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetBuffer(commandBuffer, isWrite, isSSBO); + } + + return null; + } + + public Auto GetBuffer(CommandBuffer commandBuffer, BufferHandle handle, int offset, int size, bool isWrite) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetBuffer(commandBuffer, offset, size, isWrite); + } + + return null; + } + + public Auto GetBufferI8ToI16(CommandBufferScoped cbs, BufferHandle handle, int offset, int size) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetBufferI8ToI16(cbs, offset, size); + } + + return null; + } + + public Auto GetAlignedVertexBuffer(CommandBufferScoped cbs, BufferHandle handle, int offset, int size, int stride, int alignment) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetAlignedVertexBuffer(cbs, offset, size, stride, alignment); + } + + return null; + } + + public Auto GetBufferTopologyConversion(CommandBufferScoped cbs, BufferHandle handle, int offset, int size, IndexBufferPattern pattern, int indexSize) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetBufferTopologyConversion(cbs, offset, size, pattern, indexSize); + } + + return null; + } + + public (Auto, Auto) GetBufferTopologyConversionIndirect( + VulkanRenderer gd, + CommandBufferScoped cbs, + BufferRange indexBuffer, + BufferRange indirectBuffer, + BufferRange drawCountBuffer, + IndexBufferPattern pattern, + int indexSize, + bool hasDrawCount, + int maxDrawCount, + int indirectDataStride) + { + BufferHolder drawCountBufferHolder = null; + + if (!TryGetBuffer(indexBuffer.Handle, out var indexBufferHolder) || + !TryGetBuffer(indirectBuffer.Handle, out var indirectBufferHolder) || + (hasDrawCount && !TryGetBuffer(drawCountBuffer.Handle, out drawCountBufferHolder))) + { + return (null, null); + } + + var indexBufferKey = new TopologyConversionIndirectCacheKey( + gd, + pattern, + indexSize, + indirectBufferHolder, + indirectBuffer.Offset, + indirectBuffer.Size); + + bool hasConvertedIndexBuffer = indexBufferHolder.TryGetCachedConvertedBuffer( + indexBuffer.Offset, + indexBuffer.Size, + indexBufferKey, + out var convertedIndexBuffer); + + var indirectBufferKey = new IndirectDataCacheKey(pattern); + bool hasConvertedIndirectBuffer = indirectBufferHolder.TryGetCachedConvertedBuffer( + indirectBuffer.Offset, + indirectBuffer.Size, + indirectBufferKey, + out var convertedIndirectBuffer); + + var drawCountBufferKey = new DrawCountCacheKey(); + bool hasCachedDrawCount = true; + + if (hasDrawCount) + { + hasCachedDrawCount = drawCountBufferHolder.TryGetCachedConvertedBuffer( + drawCountBuffer.Offset, + drawCountBuffer.Size, + drawCountBufferKey, + out _); + } + + if (!hasConvertedIndexBuffer || !hasConvertedIndirectBuffer || !hasCachedDrawCount) + { + // The destination index size is always I32. + + int indexCount = indexBuffer.Size / indexSize; + + int convertedCount = pattern.GetConvertedCount(indexCount); + + if (!hasConvertedIndexBuffer) + { + convertedIndexBuffer = Create(gd, convertedCount * 4); + indexBufferKey.SetBuffer(convertedIndexBuffer.GetBuffer()); + indexBufferHolder.AddCachedConvertedBuffer(indexBuffer.Offset, indexBuffer.Size, indexBufferKey, convertedIndexBuffer); + } + + if (!hasConvertedIndirectBuffer) + { + convertedIndirectBuffer = Create(gd, indirectBuffer.Size); + indirectBufferHolder.AddCachedConvertedBuffer(indirectBuffer.Offset, indirectBuffer.Size, indirectBufferKey, convertedIndirectBuffer); + } + + gd.PipelineInternal.EndRenderPass(); + gd.HelperShader.ConvertIndexBufferIndirect( + gd, + cbs, + indirectBufferHolder, + convertedIndirectBuffer, + drawCountBuffer, + indexBufferHolder, + convertedIndexBuffer, + pattern, + indexSize, + indexBuffer.Offset, + indexBuffer.Size, + indirectBuffer.Offset, + hasDrawCount, + maxDrawCount, + indirectDataStride); + + // Any modification of the indirect buffer should invalidate the index buffers that are associated with it, + // since we used the indirect data to find the range of the index buffer that is used. + + var indexBufferDependency = new Dependency( + indexBufferHolder, + indexBuffer.Offset, + indexBuffer.Size, + indexBufferKey); + + indirectBufferHolder.AddCachedConvertedBufferDependency( + indirectBuffer.Offset, + indirectBuffer.Size, + indirectBufferKey, + indexBufferDependency); + + if (hasDrawCount) + { + if (!hasCachedDrawCount) + { + drawCountBufferHolder.AddCachedConvertedBuffer(drawCountBuffer.Offset, drawCountBuffer.Size, drawCountBufferKey, null); + } + + // If we have a draw count, any modification of the draw count should invalidate all indirect buffers + // where we used it to find the range of indirect data that is actually used. + + var indirectBufferDependency = new Dependency( + indirectBufferHolder, + indirectBuffer.Offset, + indirectBuffer.Size, + indirectBufferKey); + + drawCountBufferHolder.AddCachedConvertedBufferDependency( + drawCountBuffer.Offset, + drawCountBuffer.Size, + drawCountBufferKey, + indirectBufferDependency); + } + } + + return (convertedIndexBuffer.GetBuffer(), convertedIndirectBuffer.GetBuffer()); + } + + public Auto GetBuffer(CommandBuffer commandBuffer, BufferHandle handle, bool isWrite, out int size) + { + if (TryGetBuffer(handle, out var holder)) + { + size = holder.Size; + return holder.GetBuffer(commandBuffer, isWrite); + } + + size = 0; + return null; + } + + public PinnedSpan GetData(BufferHandle handle, int offset, int size) + { + if (TryGetBuffer(handle, out var holder)) + { + return holder.GetData(offset, size); + } + + return new PinnedSpan(); + } + + public void SetData(BufferHandle handle, int offset, ReadOnlySpan data) where T : unmanaged + { + SetData(handle, offset, MemoryMarshal.Cast(data), null, null); + } + + public void SetData(BufferHandle handle, int offset, ReadOnlySpan data, CommandBufferScoped? cbs, Action endRenderPass) + { + if (TryGetBuffer(handle, out var holder)) + { + holder.SetData(offset, data, cbs, endRenderPass); + } + } + + public void Delete(BufferHandle handle) + { + if (TryGetBuffer(handle, out var holder)) + { + holder.Dispose(); + _buffers.Remove((int)Unsafe.As(ref handle)); + } + } + + private bool TryGetBuffer(BufferHandle handle, out BufferHolder holder) + { + return _buffers.TryGetValue((int)Unsafe.As(ref handle), out holder); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + StagingBuffer.Dispose(); + + foreach (BufferHolder buffer in _buffers) + { + buffer.Dispose(); + } + + _buffers.Clear(); + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferMirrorRangeList.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferMirrorRangeList.cs new file mode 100644 index 000000000..5c92e7816 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferMirrorRangeList.cs @@ -0,0 +1,305 @@ +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + /// + /// A structure tracking pending upload ranges for buffers. + /// Where a range is present, pending data exists that can either be used to build mirrors + /// or upload directly to the buffer. + /// + struct BufferMirrorRangeList + { + internal readonly struct Range + { + public int Offset { get; } + public int Size { get; } + + public int End => Offset + Size; + + public Range(int offset, int size) + { + Offset = offset; + Size = size; + } + + public bool OverlapsWith(int offset, int size) + { + return Offset < offset + size && offset < Offset + Size; + } + } + + private List _ranges; + + public readonly IEnumerable All() + { + return _ranges; + } + + public readonly bool Remove(int offset, int size) + { + var list = _ranges; + bool removedAny = false; + if (list != null) + { + int overlapIndex = BinarySearch(list, offset, size); + + if (overlapIndex >= 0) + { + // Overlaps with a range. Search back to find the first one it doesn't overlap with. + + while (overlapIndex > 0 && list[overlapIndex - 1].OverlapsWith(offset, size)) + { + overlapIndex--; + } + + int endOffset = offset + size; + int startIndex = overlapIndex; + + var currentOverlap = list[overlapIndex]; + + // Orphan the start of the overlap. + if (currentOverlap.Offset < offset) + { + list[overlapIndex] = new Range(currentOverlap.Offset, offset - currentOverlap.Offset); + currentOverlap = new Range(offset, currentOverlap.End - offset); + list.Insert(++overlapIndex, currentOverlap); + startIndex++; + + removedAny = true; + } + + // Remove any middle overlaps. + while (currentOverlap.Offset < endOffset) + { + if (currentOverlap.End > endOffset) + { + // Update the end overlap instead of removing it, if it spans beyond the removed range. + list[overlapIndex] = new Range(endOffset, currentOverlap.End - endOffset); + + removedAny = true; + break; + } + + if (++overlapIndex >= list.Count) + { + break; + } + + currentOverlap = list[overlapIndex]; + } + + int count = overlapIndex - startIndex; + + list.RemoveRange(startIndex, count); + + removedAny |= count > 0; + } + } + + return removedAny; + } + + public void Add(int offset, int size) + { + var list = _ranges; + if (list != null) + { + int overlapIndex = BinarySearch(list, offset, size); + if (overlapIndex >= 0) + { + while (overlapIndex > 0 && list[overlapIndex - 1].OverlapsWith(offset, size)) + { + overlapIndex--; + } + + int endOffset = offset + size; + int startIndex = overlapIndex; + + while (overlapIndex < list.Count && list[overlapIndex].OverlapsWith(offset, size)) + { + var currentOverlap = list[overlapIndex]; + var currentOverlapEndOffset = currentOverlap.Offset + currentOverlap.Size; + + if (offset > currentOverlap.Offset) + { + offset = currentOverlap.Offset; + } + + if (endOffset < currentOverlapEndOffset) + { + endOffset = currentOverlapEndOffset; + } + + overlapIndex++; + size = endOffset - offset; + } + + int count = overlapIndex - startIndex; + + list.RemoveRange(startIndex, count); + + overlapIndex = startIndex; + } + else + { + overlapIndex = ~overlapIndex; + } + + list.Insert(overlapIndex, new Range(offset, size)); + } + else + { + _ranges = new List + { + new Range(offset, size) + }; + } + } + + public readonly bool OverlapsWith(int offset, int size) + { + var list = _ranges; + if (list == null) + { + return false; + } + + return BinarySearch(list, offset, size) >= 0; + } + + public readonly List FindOverlaps(int offset, int size) + { + var list = _ranges; + if (list == null) + { + return null; + } + + List result = null; + + int index = BinarySearch(list, offset, size); + + if (index >= 0) + { + while (index > 0 && list[index - 1].OverlapsWith(offset, size)) + { + index--; + } + + do + { + (result ??= []).Add(list[index++]); + } + while (index < list.Count && list[index].OverlapsWith(offset, size)); + } + + return result; + } + + private static int BinarySearch(List list, int offset, int size) + { + int left = 0; + int right = list.Count - 1; + + while (left <= right) + { + int range = right - left; + + int middle = left + (range >> 1); + + var item = list[middle]; + + if (item.OverlapsWith(offset, size)) + { + return middle; + } + + if (offset < item.Offset) + { + right = middle - 1; + } + else + { + left = middle + 1; + } + } + + return ~left; + } + + public readonly void FillData(Span baseData, Span modData, int offset, Span result) + { + int size = baseData.Length; + int endOffset = offset + size; + + var list = _ranges; + if (list == null) + { + baseData.CopyTo(result); + } + + int srcOffset = offset; + int dstOffset = 0; + bool activeRange = false; + + for (int i = 0; i < list.Count; i++) + { + var range = list[i]; + + int rangeEnd = range.Offset + range.Size; + + if (activeRange) + { + if (range.Offset >= endOffset) + { + break; + } + } + else + { + if (rangeEnd <= offset) + { + continue; + } + + activeRange = true; + } + + int baseSize = range.Offset - srcOffset; + + if (baseSize > 0) + { + baseData.Slice(dstOffset, baseSize).CopyTo(result.Slice(dstOffset, baseSize)); + srcOffset += baseSize; + dstOffset += baseSize; + } + + int modSize = Math.Min(rangeEnd - srcOffset, endOffset - srcOffset); + if (modSize != 0) + { + modData.Slice(dstOffset, modSize).CopyTo(result.Slice(dstOffset, modSize)); + srcOffset += modSize; + dstOffset += modSize; + } + } + + int baseSizeEnd = endOffset - srcOffset; + + if (baseSizeEnd > 0) + { + baseData.Slice(dstOffset, baseSizeEnd).CopyTo(result.Slice(dstOffset, baseSizeEnd)); + } + } + + public readonly int Count() + { + return _ranges?.Count ?? 0; + } + + public void Clear() + { + _ranges = null; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferState.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferState.cs new file mode 100644 index 000000000..91dec6dc0 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferState.cs @@ -0,0 +1,56 @@ +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + struct BufferState : IDisposable + { + public static BufferState Null => new(null, 0, 0); + + private readonly int _offset; + private readonly int _size; + + private Auto _buffer; + + public BufferState(Auto buffer, int offset, int size) + { + _buffer = buffer; + _offset = offset; + _size = size; + buffer?.IncrementReferenceCount(); + } + + public readonly void BindTransformFeedbackBuffer(VulkanRenderer gd, CommandBufferScoped cbs, uint binding) + { + if (_buffer != null) + { + var buffer = _buffer.Get(cbs, _offset, _size, true).Value; + + ulong offset = (ulong)_offset; + ulong size = (ulong)_size; + + gd.TransformFeedbackApi.CmdBindTransformFeedbackBuffers(cbs.CommandBuffer, binding, 1, in buffer, in offset, in size); + } + } + + public void Swap(Auto from, Auto to) + { + if (_buffer == from) + { + _buffer.DecrementReferenceCount(); + to.IncrementReferenceCount(); + + _buffer = to; + } + } + + public readonly bool Overlaps(Auto buffer, int offset, int size) + { + return buffer == _buffer && offset < _offset + _size && offset + size > _offset; + } + + public readonly void Dispose() + { + _buffer?.DecrementReferenceCount(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/BufferUsageBitmap.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferUsageBitmap.cs new file mode 100644 index 000000000..68275b71d --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/BufferUsageBitmap.cs @@ -0,0 +1,82 @@ +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class BufferUsageBitmap + { + private readonly BitMap _bitmap; + private readonly int _size; + private readonly int _granularity; + private readonly int _bits; + private readonly int _writeBitOffset; + + private readonly int _intsPerCb; + private readonly int _bitsPerCb; + + public BufferUsageBitmap(int size, int granularity) + { + _size = size; + _granularity = granularity; + + // There are two sets of bits - one for read tracking, and the other for write. + int bits = (size + (granularity - 1)) / granularity; + _writeBitOffset = bits; + _bits = bits << 1; + + _intsPerCb = (_bits + (BitMap.IntSize - 1)) / BitMap.IntSize; + _bitsPerCb = _intsPerCb * BitMap.IntSize; + + _bitmap = new BitMap(_bitsPerCb * CommandBufferPool.MaxCommandBuffers); + } + + public void Add(int cbIndex, int offset, int size, bool write) + { + if (size == 0) + { + return; + } + + // Some usages can be out of bounds (vertex buffer on amd), so bound if necessary. + if (offset + size > _size) + { + size = _size - offset; + } + + int cbBase = cbIndex * _bitsPerCb + (write ? _writeBitOffset : 0); + int start = cbBase + offset / _granularity; + int end = cbBase + (offset + size - 1) / _granularity; + + _bitmap.SetRange(start, end); + } + + public bool OverlapsWith(int cbIndex, int offset, int size, bool write = false) + { + if (size == 0) + { + return false; + } + + int cbBase = cbIndex * _bitsPerCb + (write ? _writeBitOffset : 0); + int start = cbBase + offset / _granularity; + int end = cbBase + (offset + size - 1) / _granularity; + + return _bitmap.IsSet(start, end); + } + + public bool OverlapsWith(int offset, int size, bool write) + { + for (int i = 0; i < CommandBufferPool.MaxCommandBuffers; i++) + { + if (OverlapsWith(i, offset, size, write)) + { + return true; + } + } + + return false; + } + + public void Clear(int cbIndex) + { + _bitmap.ClearInt(cbIndex * _intsPerCb, (cbIndex + 1) * _intsPerCb - 1); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/CacheByRange.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/CacheByRange.cs new file mode 100644 index 000000000..2c19b9487 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/CacheByRange.cs @@ -0,0 +1,394 @@ +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + interface ICacheKey : IDisposable + { + bool KeyEqual(ICacheKey other); + } + + struct I8ToI16CacheKey : ICacheKey + { + // Used to notify the pipeline that bindings have invalidated on dispose. + private readonly VulkanRenderer _gd; + private Auto _buffer; + + public I8ToI16CacheKey(VulkanRenderer gd) + { + _gd = gd; + _buffer = null; + } + + public readonly bool KeyEqual(ICacheKey other) + { + return other is I8ToI16CacheKey; + } + + public void SetBuffer(Auto buffer) + { + _buffer = buffer; + } + + public readonly void Dispose() + { + _gd.PipelineInternal.DirtyIndexBuffer(_buffer); + } + } + + struct AlignedVertexBufferCacheKey : ICacheKey + { + private readonly int _stride; + private readonly int _alignment; + + // Used to notify the pipeline that bindings have invalidated on dispose. + private readonly VulkanRenderer _gd; + private Auto _buffer; + + public AlignedVertexBufferCacheKey(VulkanRenderer gd, int stride, int alignment) + { + _gd = gd; + _stride = stride; + _alignment = alignment; + _buffer = null; + } + + public readonly bool KeyEqual(ICacheKey other) + { + return other is AlignedVertexBufferCacheKey entry && + entry._stride == _stride && + entry._alignment == _alignment; + } + + public void SetBuffer(Auto buffer) + { + _buffer = buffer; + } + + public readonly void Dispose() + { + _gd.PipelineInternal.DirtyVertexBuffer(_buffer); + } + } + + struct TopologyConversionCacheKey : ICacheKey + { + private readonly IndexBufferPattern _pattern; + private readonly int _indexSize; + + // Used to notify the pipeline that bindings have invalidated on dispose. + private readonly VulkanRenderer _gd; + private Auto _buffer; + + public TopologyConversionCacheKey(VulkanRenderer gd, IndexBufferPattern pattern, int indexSize) + { + _gd = gd; + _pattern = pattern; + _indexSize = indexSize; + _buffer = null; + } + + public readonly bool KeyEqual(ICacheKey other) + { + return other is TopologyConversionCacheKey entry && + entry._pattern == _pattern && + entry._indexSize == _indexSize; + } + + public void SetBuffer(Auto buffer) + { + _buffer = buffer; + } + + public readonly void Dispose() + { + _gd.PipelineInternal.DirtyIndexBuffer(_buffer); + } + } + + readonly struct TopologyConversionIndirectCacheKey : ICacheKey + { + private readonly TopologyConversionCacheKey _baseKey; + private readonly BufferHolder _indirectDataBuffer; + private readonly int _indirectDataOffset; + private readonly int _indirectDataSize; + + public TopologyConversionIndirectCacheKey( + VulkanRenderer gd, + IndexBufferPattern pattern, + int indexSize, + BufferHolder indirectDataBuffer, + int indirectDataOffset, + int indirectDataSize) + { + _baseKey = new TopologyConversionCacheKey(gd, pattern, indexSize); + _indirectDataBuffer = indirectDataBuffer; + _indirectDataOffset = indirectDataOffset; + _indirectDataSize = indirectDataSize; + } + + public bool KeyEqual(ICacheKey other) + { + return other is TopologyConversionIndirectCacheKey entry && + entry._baseKey.KeyEqual(_baseKey) && + entry._indirectDataBuffer == _indirectDataBuffer && + entry._indirectDataOffset == _indirectDataOffset && + entry._indirectDataSize == _indirectDataSize; + } + + public void SetBuffer(Auto buffer) + { + _baseKey.SetBuffer(buffer); + } + + public void Dispose() + { + _baseKey.Dispose(); + } + } + + readonly struct IndirectDataCacheKey : ICacheKey + { + private readonly IndexBufferPattern _pattern; + + public IndirectDataCacheKey(IndexBufferPattern pattern) + { + _pattern = pattern; + } + + public bool KeyEqual(ICacheKey other) + { + return other is IndirectDataCacheKey entry && entry._pattern == _pattern; + } + + public void Dispose() + { + } + } + + struct DrawCountCacheKey : ICacheKey + { + public readonly bool KeyEqual(ICacheKey other) + { + return other is DrawCountCacheKey; + } + + public readonly void Dispose() + { + } + } + + readonly struct Dependency + { + private readonly BufferHolder _buffer; + private readonly int _offset; + private readonly int _size; + private readonly ICacheKey _key; + + public Dependency(BufferHolder buffer, int offset, int size, ICacheKey key) + { + _buffer = buffer; + _offset = offset; + _size = size; + _key = key; + } + + public void RemoveFromOwner() + { + _buffer.RemoveCachedConvertedBuffer(_offset, _size, _key); + } + } + + struct CacheByRange where T : IDisposable + { + private struct Entry + { + public ICacheKey Key; + public T Value; + public List DependencyList; + + public Entry(ICacheKey key, T value) + { + Key = key; + Value = value; + DependencyList = null; + } + + public readonly void InvalidateDependencies() + { + if (DependencyList != null) + { + foreach (Dependency dependency in DependencyList) + { + dependency.RemoveFromOwner(); + } + + DependencyList.Clear(); + } + } + } + + private Dictionary> _ranges; + + public void Add(int offset, int size, ICacheKey key, T value) + { + List entries = GetEntries(offset, size); + + entries.Add(new Entry(key, value)); + } + + public void AddDependency(int offset, int size, ICacheKey key, Dependency dependency) + { + List entries = GetEntries(offset, size); + + for (int i = 0; i < entries.Count; i++) + { + Entry entry = entries[i]; + + if (entry.Key.KeyEqual(key)) + { + if (entry.DependencyList == null) + { + entry.DependencyList = new List(); + entries[i] = entry; + } + + entry.DependencyList.Add(dependency); + + break; + } + } + } + + public void Remove(int offset, int size, ICacheKey key) + { + List entries = GetEntries(offset, size); + + for (int i = 0; i < entries.Count; i++) + { + Entry entry = entries[i]; + + if (entry.Key.KeyEqual(key)) + { + entries.RemoveAt(i--); + + DestroyEntry(entry); + } + } + + if (entries.Count == 0) + { + _ranges.Remove(PackRange(offset, size)); + } + } + + public bool TryGetValue(int offset, int size, ICacheKey key, out T value) + { + List entries = GetEntries(offset, size); + + foreach (Entry entry in entries) + { + if (entry.Key.KeyEqual(key)) + { + value = entry.Value; + + return true; + } + } + + value = default; + return false; + } + + public void Clear() + { + if (_ranges != null) + { + foreach (List entries in _ranges.Values) + { + foreach (Entry entry in entries) + { + DestroyEntry(entry); + } + } + + _ranges.Clear(); + _ranges = null; + } + } + + public readonly void ClearRange(int offset, int size) + { + if (_ranges != null && _ranges.Count > 0) + { + int end = offset + size; + + List toRemove = null; + + foreach (KeyValuePair> range in _ranges) + { + (int rOffset, int rSize) = UnpackRange(range.Key); + + int rEnd = rOffset + rSize; + + if (rEnd > offset && rOffset < end) + { + List entries = range.Value; + + foreach (Entry entry in entries) + { + DestroyEntry(entry); + } + + (toRemove ??= new List()).Add(range.Key); + } + } + + if (toRemove != null) + { + foreach (ulong range in toRemove) + { + _ranges.Remove(range); + } + } + } + } + + private List GetEntries(int offset, int size) + { + _ranges ??= new Dictionary>(); + + ulong key = PackRange(offset, size); + + if (!_ranges.TryGetValue(key, out List value)) + { + value = new List(); + _ranges.Add(key, value); + } + + return value; + } + + private static void DestroyEntry(Entry entry) + { + entry.Key.Dispose(); + entry.Value?.Dispose(); + entry.InvalidateDependencies(); + } + + private static ulong PackRange(int offset, int size) + { + return (uint)offset | ((ulong)size << 32); + } + + private static (int offset, int size) UnpackRange(ulong range) + { + return ((int)range, (int)(range >> 32)); + } + + public void Dispose() + { + Clear(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferPool.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferPool.cs new file mode 100644 index 000000000..14fee1477 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferPool.cs @@ -0,0 +1,370 @@ +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Threading; +using Semaphore = Silk.NET.Vulkan.Semaphore; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class CommandBufferPool : IDisposable + { + public const int MaxCommandBuffers = 16; + + private readonly int _totalCommandBuffers; + private readonly int _totalCommandBuffersMask; + + private readonly Vk _api; + private readonly Device _device; + private readonly Queue _queue; + private readonly Lock _queueLock; + private readonly bool _concurrentFenceWaitUnsupported; + private readonly CommandPool _pool; + private readonly Thread _owner; + + public bool OwnedByCurrentThread => _owner == Thread.CurrentThread; + + private struct ReservedCommandBuffer + { + public bool InUse; + public bool InConsumption; + public int SubmissionCount; + public CommandBuffer CommandBuffer; + public FenceHolder Fence; + + public List Dependants; + public List Waitables; + + public void Initialize(Vk api, Device device, CommandPool pool) + { + var allocateInfo = new CommandBufferAllocateInfo + { + SType = StructureType.CommandBufferAllocateInfo, + CommandBufferCount = 1, + CommandPool = pool, + Level = CommandBufferLevel.Primary, + }; + + api.AllocateCommandBuffers(device, in allocateInfo, out CommandBuffer); + + Dependants = new List(); + Waitables = new List(); + } + } + + private readonly ReservedCommandBuffer[] _commandBuffers; + + private readonly int[] _queuedIndexes; + private int _queuedIndexesPtr; + private int _queuedCount; + private int _inUseCount; + + public unsafe CommandBufferPool( + Vk api, + Device device, + Queue queue, + Lock queueLock, + uint queueFamilyIndex, + bool concurrentFenceWaitUnsupported, + bool isLight = false) + { + _api = api; + _device = device; + _queue = queue; + _queueLock = queueLock; + _concurrentFenceWaitUnsupported = concurrentFenceWaitUnsupported; + _owner = Thread.CurrentThread; + + var commandPoolCreateInfo = new CommandPoolCreateInfo + { + SType = StructureType.CommandPoolCreateInfo, + QueueFamilyIndex = queueFamilyIndex, + Flags = CommandPoolCreateFlags.TransientBit | + CommandPoolCreateFlags.ResetCommandBufferBit, + }; + + api.CreateCommandPool(device, in commandPoolCreateInfo, null, out _pool).ThrowOnError(); + + // We need at least 2 command buffers to get texture data in some cases. + _totalCommandBuffers = isLight ? 2 : MaxCommandBuffers; + _totalCommandBuffersMask = _totalCommandBuffers - 1; + + _commandBuffers = new ReservedCommandBuffer[_totalCommandBuffers]; + + _queuedIndexes = new int[_totalCommandBuffers]; + _queuedIndexesPtr = 0; + _queuedCount = 0; + + for (int i = 0; i < _totalCommandBuffers; i++) + { + _commandBuffers[i].Initialize(api, device, _pool); + WaitAndDecrementRef(i); + } + } + + public void AddDependant(int cbIndex, IAuto dependant) + { + dependant.IncrementReferenceCount(); + _commandBuffers[cbIndex].Dependants.Add(dependant); + } + + public void AddWaitable(MultiFenceHolder waitable) + { + lock (_commandBuffers) + { + for (int i = 0; i < _totalCommandBuffers; i++) + { + ref var entry = ref _commandBuffers[i]; + + if (entry.InConsumption) + { + AddWaitable(i, waitable); + } + } + } + } + + public void AddInUseWaitable(MultiFenceHolder waitable) + { + lock (_commandBuffers) + { + for (int i = 0; i < _totalCommandBuffers; i++) + { + ref var entry = ref _commandBuffers[i]; + + if (entry.InUse) + { + AddWaitable(i, waitable); + } + } + } + } + + public void AddWaitable(int cbIndex, MultiFenceHolder waitable) + { + ref var entry = ref _commandBuffers[cbIndex]; + if (waitable.AddFence(cbIndex, entry.Fence)) + { + entry.Waitables.Add(waitable); + } + } + + public bool HasWaitableOnRentedCommandBuffer(MultiFenceHolder waitable, int offset, int size) + { + lock (_commandBuffers) + { + for (int i = 0; i < _totalCommandBuffers; i++) + { + ref var entry = ref _commandBuffers[i]; + + if (entry.InUse && + waitable.HasFence(i) && + waitable.IsBufferRangeInUse(i, offset, size)) + { + return true; + } + } + } + + return false; + } + + public bool IsFenceOnRentedCommandBuffer(FenceHolder fence) + { + lock (_commandBuffers) + { + for (int i = 0; i < _totalCommandBuffers; i++) + { + ref var entry = ref _commandBuffers[i]; + + if (entry.InUse && entry.Fence == fence) + { + return true; + } + } + } + + return false; + } + + public FenceHolder GetFence(int cbIndex) + { + return _commandBuffers[cbIndex].Fence; + } + + public int GetSubmissionCount(int cbIndex) + { + return _commandBuffers[cbIndex].SubmissionCount; + } + + private int FreeConsumed(bool wait) + { + int freeEntry = 0; + + while (_queuedCount > 0) + { + int index = _queuedIndexes[_queuedIndexesPtr]; + + ref var entry = ref _commandBuffers[index]; + + if (wait || !entry.InConsumption || entry.Fence.IsSignaled()) + { + WaitAndDecrementRef(index); + + wait = false; + freeEntry = index; + + _queuedCount--; + _queuedIndexesPtr = (_queuedIndexesPtr + 1) % _totalCommandBuffers; + } + else + { + break; + } + } + + return freeEntry; + } + + public CommandBufferScoped ReturnAndRent(CommandBufferScoped cbs) + { + Return(cbs); + return Rent(); + } + + public CommandBufferScoped Rent() + { + lock (_commandBuffers) + { + int cursor = FreeConsumed(_inUseCount + _queuedCount == _totalCommandBuffers); + + for (int i = 0; i < _totalCommandBuffers; i++) + { + ref var entry = ref _commandBuffers[cursor]; + + if (!entry.InUse && !entry.InConsumption) + { + entry.InUse = true; + + _inUseCount++; + + var commandBufferBeginInfo = new CommandBufferBeginInfo + { + SType = StructureType.CommandBufferBeginInfo, + }; + + _api.BeginCommandBuffer(entry.CommandBuffer, in commandBufferBeginInfo).ThrowOnError(); + + return new CommandBufferScoped(this, entry.CommandBuffer, cursor); + } + + cursor = (cursor + 1) & _totalCommandBuffersMask; + } + } + + throw new InvalidOperationException($"Out of command buffers (In use: {_inUseCount}, queued: {_queuedCount}, total: {_totalCommandBuffers})"); + } + + public void Return(CommandBufferScoped cbs) + { + Return(cbs, null, null, null); + } + + public unsafe void Return( + CommandBufferScoped cbs, + ReadOnlySpan waitSemaphores, + ReadOnlySpan waitDstStageMask, + ReadOnlySpan signalSemaphores) + { + lock (_commandBuffers) + { + int cbIndex = cbs.CommandBufferIndex; + + ref var entry = ref _commandBuffers[cbIndex]; + + Debug.Assert(entry.InUse); + Debug.Assert(entry.CommandBuffer.Handle == cbs.CommandBuffer.Handle); + entry.InUse = false; + entry.InConsumption = true; + entry.SubmissionCount++; + _inUseCount--; + + var commandBuffer = entry.CommandBuffer; + + _api.EndCommandBuffer(commandBuffer).ThrowOnError(); + + fixed (Semaphore* pWaitSemaphores = waitSemaphores, pSignalSemaphores = signalSemaphores) + { + fixed (PipelineStageFlags* pWaitDstStageMask = waitDstStageMask) + { + SubmitInfo sInfo = new() + { + SType = StructureType.SubmitInfo, + WaitSemaphoreCount = !waitSemaphores.IsEmpty ? (uint)waitSemaphores.Length : 0, + PWaitSemaphores = pWaitSemaphores, + PWaitDstStageMask = pWaitDstStageMask, + CommandBufferCount = 1, + PCommandBuffers = &commandBuffer, + SignalSemaphoreCount = !signalSemaphores.IsEmpty ? (uint)signalSemaphores.Length : 0, + PSignalSemaphores = pSignalSemaphores, + }; + + lock (_queueLock) + { + _api.QueueSubmit(_queue, 1, in sInfo, entry.Fence.GetUnsafe()).ThrowOnError(); + } + } + } + + int ptr = (_queuedIndexesPtr + _queuedCount) % _totalCommandBuffers; + _queuedIndexes[ptr] = cbIndex; + _queuedCount++; + } + } + + private void WaitAndDecrementRef(int cbIndex, bool refreshFence = true) + { + ref var entry = ref _commandBuffers[cbIndex]; + + if (entry.InConsumption) + { + entry.Fence.Wait(); + entry.InConsumption = false; + } + + foreach (var dependant in entry.Dependants) + { + dependant.DecrementReferenceCount(cbIndex); + } + + foreach (var waitable in entry.Waitables) + { + waitable.RemoveFence(cbIndex); + waitable.RemoveBufferUses(cbIndex); + } + + entry.Dependants.Clear(); + entry.Waitables.Clear(); + entry.Fence?.Dispose(); + + if (refreshFence) + { + entry.Fence = new FenceHolder(_api, _device, _concurrentFenceWaitUnsupported); + } + else + { + entry.Fence = null; + } + } + + public unsafe void Dispose() + { + for (int i = 0; i < _totalCommandBuffers; i++) + { + WaitAndDecrementRef(i, refreshFence: false); + } + + _api.DestroyCommandPool(_device, _pool, null); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferScoped.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferScoped.cs new file mode 100644 index 000000000..5dd046814 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/CommandBufferScoped.cs @@ -0,0 +1,39 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct CommandBufferScoped : IDisposable + { + private readonly CommandBufferPool _pool; + public CommandBuffer CommandBuffer { get; } + public int CommandBufferIndex { get; } + + public CommandBufferScoped(CommandBufferPool pool, CommandBuffer commandBuffer, int commandBufferIndex) + { + _pool = pool; + CommandBuffer = commandBuffer; + CommandBufferIndex = commandBufferIndex; + } + + public void AddDependant(IAuto dependant) + { + _pool.AddDependant(CommandBufferIndex, dependant); + } + + public void AddWaitable(MultiFenceHolder waitable) + { + _pool.AddWaitable(CommandBufferIndex, waitable); + } + + public FenceHolder GetFence() + { + return _pool.GetFence(CommandBufferIndex); + } + + public void Dispose() + { + _pool?.Return(this); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Constants.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Constants.cs new file mode 100644 index 000000000..2f43d249d --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Constants.cs @@ -0,0 +1,23 @@ +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class Constants + { + public const int MaxVertexAttributes = 32; + public const int MaxVertexBuffers = 32; + public const int MaxTransformFeedbackBuffers = 4; + public const int MaxRenderTargets = 8; + public const int MaxViewports = 16; + public const int MaxShaderStages = 5; + public const int MaxUniformBuffersPerStage = 18; + public const int MaxStorageBuffersPerStage = 16; + public const int MaxTexturesPerStage = 64; + public const int MaxImagesPerStage = 16; + public const int MaxUniformBufferBindings = MaxUniformBuffersPerStage * MaxShaderStages; + public const int MaxStorageBufferBindings = MaxStorageBuffersPerStage * MaxShaderStages; + public const int MaxTextureBindings = MaxTexturesPerStage * MaxShaderStages; + public const int MaxImageBindings = MaxImagesPerStage * MaxShaderStages; + public const int MaxPushDescriptorBinding = 64; + + public const ulong SparseBufferAlignment = 0x10000; + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetCollection.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetCollection.cs new file mode 100644 index 000000000..0ce32dbca --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetCollection.cs @@ -0,0 +1,222 @@ +using Silk.NET.Vulkan; +using System; +using VkBuffer = Silk.NET.Vulkan.Buffer; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + struct DescriptorSetCollection : IDisposable + { + private DescriptorSetManager.DescriptorPoolHolder _holder; + private readonly DescriptorSet[] _descriptorSets; + public readonly int SetsCount => _descriptorSets.Length; + + public DescriptorSetCollection(DescriptorSetManager.DescriptorPoolHolder holder, DescriptorSet[] descriptorSets) + { + _holder = holder; + _descriptorSets = descriptorSets; + } + + public void InitializeBuffers(int setIndex, int baseBinding, int count, DescriptorType type, VkBuffer dummyBuffer) + { + Span infos = stackalloc DescriptorBufferInfo[count]; + + infos.Fill(new DescriptorBufferInfo + { + Buffer = dummyBuffer, + Range = Vk.WholeSize, + }); + + UpdateBuffers(setIndex, baseBinding, infos, type); + } + + public unsafe void UpdateBuffer(int setIndex, int bindingIndex, DescriptorBufferInfo bufferInfo, DescriptorType type) + { + if (bufferInfo.Buffer.Handle != 0UL) + { + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)bindingIndex, + DescriptorType = type, + DescriptorCount = 1, + PBufferInfo = &bufferInfo, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + } + + public unsafe void UpdateBuffers(int setIndex, int baseBinding, ReadOnlySpan bufferInfo, DescriptorType type) + { + if (bufferInfo.Length == 0) + { + return; + } + + fixed (DescriptorBufferInfo* pBufferInfo = bufferInfo) + { + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)baseBinding, + DescriptorType = type, + DescriptorCount = (uint)bufferInfo.Length, + PBufferInfo = pBufferInfo, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + } + + public unsafe void UpdateImage(int setIndex, int bindingIndex, DescriptorImageInfo imageInfo, DescriptorType type) + { + if (imageInfo.ImageView.Handle != 0UL) + { + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)bindingIndex, + DescriptorType = type, + DescriptorCount = 1, + PImageInfo = &imageInfo, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + } + + public unsafe void UpdateImages(int setIndex, int baseBinding, ReadOnlySpan imageInfo, DescriptorType type) + { + if (imageInfo.Length == 0) + { + return; + } + + fixed (DescriptorImageInfo* pImageInfo = imageInfo) + { + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)baseBinding, + DescriptorType = type, + DescriptorCount = (uint)imageInfo.Length, + PImageInfo = pImageInfo, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + } + + public unsafe void UpdateImagesCombined(int setIndex, int baseBinding, ReadOnlySpan imageInfo, DescriptorType type) + { + if (imageInfo.Length == 0) + { + return; + } + + fixed (DescriptorImageInfo* pImageInfo = imageInfo) + { + for (int i = 0; i < imageInfo.Length; i++) + { + bool nonNull = imageInfo[i].ImageView.Handle != 0 && imageInfo[i].Sampler.Handle != 0; + if (nonNull) + { + int count = 1; + + while (i + count < imageInfo.Length && + imageInfo[i + count].ImageView.Handle != 0 && + imageInfo[i + count].Sampler.Handle != 0) + { + count++; + } + + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)(baseBinding + i), + DescriptorType = DescriptorType.CombinedImageSampler, + DescriptorCount = (uint)count, + PImageInfo = pImageInfo, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + + i += count - 1; + } + } + } + } + + public unsafe void UpdateBufferImage(int setIndex, int bindingIndex, BufferView texelBufferView, DescriptorType type) + { + if (texelBufferView.Handle != 0UL) + { + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)bindingIndex, + DescriptorType = type, + DescriptorCount = 1, + PTexelBufferView = &texelBufferView, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + } + + public unsafe void UpdateBufferImages(int setIndex, int baseBinding, ReadOnlySpan texelBufferView, DescriptorType type) + { + if (texelBufferView.Length == 0) + { + return; + } + + fixed (BufferView* pTexelBufferView = texelBufferView) + { + for (uint i = 0; i < texelBufferView.Length;) + { + uint count = 1; + + if (texelBufferView[(int)i].Handle != 0UL) + { + while (i + count < texelBufferView.Length && texelBufferView[(int)(i + count)].Handle != 0UL) + { + count++; + } + + var writeDescriptorSet = new WriteDescriptorSet + { + SType = StructureType.WriteDescriptorSet, + DstSet = _descriptorSets[setIndex], + DstBinding = (uint)baseBinding + i, + DescriptorType = type, + DescriptorCount = count, + PTexelBufferView = pTexelBufferView + i, + }; + + _holder.Api.UpdateDescriptorSets(_holder.Device, 1, in writeDescriptorSet, 0, null); + } + + i += count; + } + } + } + + public readonly DescriptorSet[] GetSets() + { + return _descriptorSets; + } + + public void Dispose() + { + _holder?.FreeDescriptorSets(this); + _holder = null; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetManager.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetManager.cs new file mode 100644 index 000000000..32d2976b3 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetManager.cs @@ -0,0 +1,231 @@ +using Silk.NET.Vulkan; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class DescriptorSetManager : IDisposable + { + public const uint MaxSets = 8; + + public class DescriptorPoolHolder : IDisposable + { + public Vk Api { get; } + public Device Device { get; } + + private readonly DescriptorPool _pool; + private int _freeDescriptors; + private int _totalSets; + private int _setsInUse; + private bool _done; + + public unsafe DescriptorPoolHolder(Vk api, Device device, ReadOnlySpan poolSizes, bool updateAfterBind) + { + Api = api; + Device = device; + + foreach (var poolSize in poolSizes) + { + _freeDescriptors += (int)poolSize.DescriptorCount; + } + + fixed (DescriptorPoolSize* pPoolsSize = poolSizes) + { + var descriptorPoolCreateInfo = new DescriptorPoolCreateInfo + { + SType = StructureType.DescriptorPoolCreateInfo, + Flags = updateAfterBind ? DescriptorPoolCreateFlags.UpdateAfterBindBit : DescriptorPoolCreateFlags.None, + MaxSets = MaxSets, + PoolSizeCount = (uint)poolSizes.Length, + PPoolSizes = pPoolsSize, + }; + + Api.CreateDescriptorPool(device, in descriptorPoolCreateInfo, null, out _pool).ThrowOnError(); + } + } + + public unsafe DescriptorSetCollection AllocateDescriptorSets(ReadOnlySpan layouts, int consumedDescriptors) + { + TryAllocateDescriptorSets(layouts, consumedDescriptors, isTry: false, out var dsc); + return dsc; + } + + public bool TryAllocateDescriptorSets(ReadOnlySpan layouts, int consumedDescriptors, out DescriptorSetCollection dsc) + { + return TryAllocateDescriptorSets(layouts, consumedDescriptors, isTry: true, out dsc); + } + + private unsafe bool TryAllocateDescriptorSets( + ReadOnlySpan layouts, + int consumedDescriptors, + bool isTry, + out DescriptorSetCollection dsc) + { + Debug.Assert(!_done); + + DescriptorSet[] descriptorSets = new DescriptorSet[layouts.Length]; + + fixed (DescriptorSet* pDescriptorSets = descriptorSets) + { + fixed (DescriptorSetLayout* pLayouts = layouts) + { + var descriptorSetAllocateInfo = new DescriptorSetAllocateInfo + { + SType = StructureType.DescriptorSetAllocateInfo, + DescriptorPool = _pool, + DescriptorSetCount = (uint)layouts.Length, + PSetLayouts = pLayouts, + }; + + var result = Api.AllocateDescriptorSets(Device, &descriptorSetAllocateInfo, pDescriptorSets); + if (isTry && result == Result.ErrorOutOfPoolMemory) + { + _totalSets = (int)MaxSets; + _done = true; + DestroyIfDone(); + dsc = default; + return false; + } + + result.ThrowOnError(); + } + } + + _freeDescriptors -= consumedDescriptors; + _totalSets += layouts.Length; + _setsInUse += layouts.Length; + + dsc = new DescriptorSetCollection(this, descriptorSets); + return true; + } + + public void FreeDescriptorSets(DescriptorSetCollection dsc) + { + _setsInUse -= dsc.SetsCount; + Debug.Assert(_setsInUse >= 0); + DestroyIfDone(); + } + + public bool CanFit(int setsCount, int descriptorsCount) + { + // Try to determine if an allocation with the given parameters will succeed. + // An allocation may fail if the sets count or descriptors count exceeds the available counts + // of the pool. + // Not getting that right is not fatal, it will just create a new pool and try again, + // but it is less efficient. + + if (_totalSets + setsCount <= MaxSets && _freeDescriptors >= descriptorsCount) + { + return true; + } + + _done = true; + DestroyIfDone(); + return false; + } + + private unsafe void DestroyIfDone() + { + if (_done && _setsInUse == 0) + { + Api.DestroyDescriptorPool(Device, _pool, null); + } + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + unsafe + { + Api.DestroyDescriptorPool(Device, _pool, null); + } + } + } + + public void Dispose() + { + GC.SuppressFinalize(this); + Dispose(true); + } + } + + private readonly Device _device; + private readonly DescriptorPoolHolder[] _currentPools; + + public DescriptorSetManager(Device device, int poolCount) + { + _device = device; + _currentPools = new DescriptorPoolHolder[poolCount]; + } + + public Auto AllocateDescriptorSet( + Vk api, + DescriptorSetLayout layout, + ReadOnlySpan poolSizes, + int poolIndex, + int consumedDescriptors, + bool updateAfterBind) + { + Span layouts = stackalloc DescriptorSetLayout[1]; + layouts[0] = layout; + return AllocateDescriptorSets(api, layouts, poolSizes, poolIndex, consumedDescriptors, updateAfterBind); + } + + public Auto AllocateDescriptorSets( + Vk api, + ReadOnlySpan layouts, + ReadOnlySpan poolSizes, + int poolIndex, + int consumedDescriptors, + bool updateAfterBind) + { + // If we fail the first time, just create a new pool and try again. + + var pool = GetPool(api, poolSizes, poolIndex, layouts.Length, consumedDescriptors, updateAfterBind); + if (!pool.TryAllocateDescriptorSets(layouts, consumedDescriptors, out var dsc)) + { + pool = GetPool(api, poolSizes, poolIndex, layouts.Length, consumedDescriptors, updateAfterBind); + dsc = pool.AllocateDescriptorSets(layouts, consumedDescriptors); + } + + return new Auto(dsc); + } + + private DescriptorPoolHolder GetPool( + Vk api, + ReadOnlySpan poolSizes, + int poolIndex, + int setsCount, + int descriptorsCount, + bool updateAfterBind) + { + ref DescriptorPoolHolder currentPool = ref _currentPools[poolIndex]; + + if (currentPool == null || !currentPool.CanFit(setsCount, descriptorsCount)) + { + currentPool = new DescriptorPoolHolder(api, _device, poolSizes, updateAfterBind); + } + + return currentPool; + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + for (int index = 0; index < _currentPools.Length; index++) + { + _currentPools[index]?.Dispose(); + _currentPools[index] = null; + } + } + } + + public void Dispose() + { + GC.SuppressFinalize(this); + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplate.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplate.cs new file mode 100644 index 000000000..b32139672 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplate.cs @@ -0,0 +1,210 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Numerics; +using System.Runtime.CompilerServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class DescriptorSetTemplate : IDisposable + { + /// + /// Renderdoc seems to crash when doing a templated uniform update with count > 1 on a push descriptor. + /// When this is true, consecutive buffers are always updated individually. + /// + private const bool RenderdocPushCountBug = true; + + private readonly VulkanRenderer _gd; + private readonly Device _device; + + public readonly DescriptorUpdateTemplate Template; + public readonly int Size; + + public unsafe DescriptorSetTemplate( + VulkanRenderer gd, + Device device, + ResourceBindingSegment[] segments, + PipelineLayoutCacheEntry plce, + PipelineBindPoint pbp, + int setIndex) + { + _gd = gd; + _device = device; + + // Create a template from the set usages. Assumes the descriptor set is updated in segment order then binding order. + + DescriptorUpdateTemplateEntry* entries = stackalloc DescriptorUpdateTemplateEntry[segments.Length]; + nuint structureOffset = 0; + + for (int seg = 0; seg < segments.Length; seg++) + { + ResourceBindingSegment segment = segments[seg]; + + int binding = segment.Binding; + int count = segment.Count; + + if (IsBufferType(segment.Type)) + { + entries[seg] = new DescriptorUpdateTemplateEntry() + { + DescriptorType = segment.Type.Convert(), + DstBinding = (uint)binding, + DescriptorCount = (uint)count, + Offset = structureOffset, + Stride = (nuint)Unsafe.SizeOf() + }; + + structureOffset += (nuint)(Unsafe.SizeOf() * count); + } + else if (IsBufferTextureType(segment.Type)) + { + entries[seg] = new DescriptorUpdateTemplateEntry() + { + DescriptorType = segment.Type.Convert(), + DstBinding = (uint)binding, + DescriptorCount = (uint)count, + Offset = structureOffset, + Stride = (nuint)Unsafe.SizeOf() + }; + + structureOffset += (nuint)(Unsafe.SizeOf() * count); + } + else + { + entries[seg] = new DescriptorUpdateTemplateEntry() + { + DescriptorType = segment.Type.Convert(), + DstBinding = (uint)binding, + DescriptorCount = (uint)count, + Offset = structureOffset, + Stride = (nuint)Unsafe.SizeOf() + }; + + structureOffset += (nuint)(Unsafe.SizeOf() * count); + } + } + + Size = (int)structureOffset; + + var info = new DescriptorUpdateTemplateCreateInfo() + { + SType = StructureType.DescriptorUpdateTemplateCreateInfo, + DescriptorUpdateEntryCount = (uint)segments.Length, + PDescriptorUpdateEntries = entries, + + TemplateType = DescriptorUpdateTemplateType.DescriptorSet, + DescriptorSetLayout = plce.DescriptorSetLayouts[setIndex], + PipelineBindPoint = pbp, + PipelineLayout = plce.PipelineLayout, + Set = (uint)setIndex, + }; + + DescriptorUpdateTemplate result; + gd.Api.CreateDescriptorUpdateTemplate(device, &info, null, &result).ThrowOnError(); + + Template = result; + } + + public unsafe DescriptorSetTemplate( + VulkanRenderer gd, + Device device, + ResourceDescriptorCollection descriptors, + long updateMask, + PipelineLayoutCacheEntry plce, + PipelineBindPoint pbp, + int setIndex) + { + _gd = gd; + _device = device; + + // Create a template from the set usages. Assumes the descriptor set is updated in segment order then binding order. + int segmentCount = BitOperations.PopCount((ulong)updateMask); + + DescriptorUpdateTemplateEntry* entries = stackalloc DescriptorUpdateTemplateEntry[segmentCount]; + int entry = 0; + nuint structureOffset = 0; + + void AddBinding(int binding, int count) + { + entries[entry++] = new DescriptorUpdateTemplateEntry() + { + DescriptorType = DescriptorType.UniformBuffer, + DstBinding = (uint)binding, + DescriptorCount = (uint)count, + Offset = structureOffset, + Stride = (nuint)Unsafe.SizeOf() + }; + + structureOffset += (nuint)(Unsafe.SizeOf() * count); + } + + int startBinding = 0; + int bindingCount = 0; + + foreach (ResourceDescriptor descriptor in descriptors.Descriptors) + { + for (int i = 0; i < descriptor.Count; i++) + { + int binding = descriptor.Binding + i; + + if ((updateMask & (1L << binding)) != 0) + { + if (bindingCount > 0 && (RenderdocPushCountBug || startBinding + bindingCount != binding)) + { + AddBinding(startBinding, bindingCount); + + bindingCount = 0; + } + + if (bindingCount == 0) + { + startBinding = binding; + } + + bindingCount++; + } + } + } + + if (bindingCount > 0) + { + AddBinding(startBinding, bindingCount); + } + + Size = (int)structureOffset; + + var info = new DescriptorUpdateTemplateCreateInfo() + { + SType = StructureType.DescriptorUpdateTemplateCreateInfo, + DescriptorUpdateEntryCount = (uint)entry, + PDescriptorUpdateEntries = entries, + + TemplateType = DescriptorUpdateTemplateType.PushDescriptorsKhr, + DescriptorSetLayout = plce.DescriptorSetLayouts[setIndex], + PipelineBindPoint = pbp, + PipelineLayout = plce.PipelineLayout, + Set = (uint)setIndex, + }; + + DescriptorUpdateTemplate result; + gd.Api.CreateDescriptorUpdateTemplate(device, &info, null, &result).ThrowOnError(); + + Template = result; + } + + private static bool IsBufferType(ResourceType type) + { + return type == ResourceType.UniformBuffer || type == ResourceType.StorageBuffer; + } + + private static bool IsBufferTextureType(ResourceType type) + { + return type == ResourceType.BufferTexture || type == ResourceType.BufferImage; + } + + public unsafe void Dispose() + { + _gd.Api.DestroyDescriptorUpdateTemplate(_device, Template, null); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplateUpdater.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplateUpdater.cs new file mode 100644 index 000000000..3470ec072 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetTemplateUpdater.cs @@ -0,0 +1,77 @@ +using Ryujinx.Common; +using Silk.NET.Vulkan; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + ref struct DescriptorSetTemplateWriter + { + private Span _data; + + public DescriptorSetTemplateWriter(Span data) + { + _data = data; + } + + public void Push(ReadOnlySpan values) where T : unmanaged + { + Span target = MemoryMarshal.Cast(_data); + + values.CopyTo(target); + + _data = _data[(Unsafe.SizeOf() * values.Length)..]; + } + } + + unsafe class DescriptorSetTemplateUpdater : IDisposable + { + private const int SizeGranularity = 512; + + private DescriptorSetTemplate _activeTemplate; + private NativeArray _data; + + private void EnsureSize(int size) + { + if (_data == null || _data.Length < size) + { + _data?.Dispose(); + + int dataSize = BitUtils.AlignUp(size, SizeGranularity); + _data = new NativeArray(dataSize); + } + } + + public DescriptorSetTemplateWriter Begin(DescriptorSetTemplate template) + { + _activeTemplate = template; + + EnsureSize(template.Size); + + return new DescriptorSetTemplateWriter(new Span(_data.Pointer, template.Size)); + } + + public DescriptorSetTemplateWriter Begin(int maxSize) + { + EnsureSize(maxSize); + + return new DescriptorSetTemplateWriter(new Span(_data.Pointer, maxSize)); + } + + public void Commit(VulkanRenderer gd, Device device, DescriptorSet set) + { + gd.Api.UpdateDescriptorSetWithTemplate(device, set, _activeTemplate.Template, _data.Pointer); + } + + public void CommitPushDescriptor(VulkanRenderer gd, CommandBufferScoped cbs, DescriptorSetTemplate template, PipelineLayout layout) + { + gd.PushDescriptorApi.CmdPushDescriptorSetWithTemplate(cbs.CommandBuffer, template.Template, layout, 0, _data.Pointer); + } + + public void Dispose() + { + _data?.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetUpdater.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetUpdater.cs new file mode 100644 index 000000000..ed7098a93 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DescriptorSetUpdater.cs @@ -0,0 +1,1190 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Silk.NET.Vulkan; +using System; +using System.Buffers; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using CompareOp = Ryujinx.Graphics.GAL.CompareOp; +using Format = Ryujinx.Graphics.GAL.Format; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class DescriptorSetUpdater + { + private const ulong StorageBufferMaxMirrorable = 0x2000; + + private const int ArrayGrowthSize = 16; + + private record struct BufferRef + { + public Auto Buffer; + public int Offset; + public bool Write; + + public BufferRef(Auto buffer) + { + Buffer = buffer; + Offset = 0; + Write = true; + } + + public BufferRef(Auto buffer, ref BufferRange range) + { + Buffer = buffer; + Offset = range.Offset; + Write = range.Write; + } + } + + private record struct TextureRef + { + public ShaderStage Stage; + public TextureView View; + public Auto ImageView; + public Auto Sampler; + + public TextureRef(ShaderStage stage, TextureView view, Auto imageView, Auto sampler) + { + Stage = stage; + View = view; + ImageView = imageView; + Sampler = sampler; + } + } + + private record struct ImageRef + { + public ShaderStage Stage; + public TextureView View; + public Auto ImageView; + + public ImageRef(ShaderStage stage, TextureView view, Auto imageView) + { + Stage = stage; + View = view; + ImageView = imageView; + } + } + + private readonly record struct ArrayRef(ShaderStage Stage, T Array); + + private readonly VulkanRenderer _gd; + private readonly Device _device; + private ShaderCollection _program; + + private readonly BufferRef[] _uniformBufferRefs; + private readonly BufferRef[] _storageBufferRefs; + private readonly TextureRef[] _textureRefs; + private readonly ImageRef[] _imageRefs; + private readonly TextureBuffer[] _bufferTextureRefs; + private readonly TextureBuffer[] _bufferImageRefs; + + private ArrayRef[] _textureArrayRefs; + private ArrayRef[] _imageArrayRefs; + + private ArrayRef[] _textureArrayExtraRefs; + private ArrayRef[] _imageArrayExtraRefs; + + private readonly DescriptorBufferInfo[] _uniformBuffers; + private readonly DescriptorBufferInfo[] _storageBuffers; + private readonly DescriptorImageInfo[] _textures; + private readonly DescriptorImageInfo[] _images; + private readonly BufferView[] _bufferTextures; + private readonly BufferView[] _bufferImages; + + private readonly DescriptorSetTemplateUpdater _templateUpdater; + + private BitMapStruct> _uniformSet; + private BitMapStruct> _storageSet; + private BitMapStruct> _uniformMirrored; + private BitMapStruct> _storageMirrored; + private readonly int[] _uniformSetPd; + private int _pdSequence = 1; + + private bool _updateDescriptorCacheCbIndex; + + [Flags] + private enum DirtyFlags + { + None = 0, + Uniform = 1 << 0, + Storage = 1 << 1, + Texture = 1 << 2, + Image = 1 << 3, + All = Uniform | Storage | Texture | Image, + } + + private DirtyFlags _dirty; + + private readonly BufferHolder _dummyBuffer; + private readonly TextureView _dummyTexture; + private readonly SamplerHolder _dummySampler; + + public List FeedbackLoopHazards { get; private set; } + + public DescriptorSetUpdater(VulkanRenderer gd, Device device) + { + _gd = gd; + _device = device; + + // Some of the bindings counts needs to be multiplied by 2 because we have buffer and + // regular textures/images interleaved on the same descriptor set. + + _uniformBufferRefs = new BufferRef[Constants.MaxUniformBufferBindings]; + _storageBufferRefs = new BufferRef[Constants.MaxStorageBufferBindings]; + _textureRefs = new TextureRef[Constants.MaxTextureBindings * 2]; + _imageRefs = new ImageRef[Constants.MaxImageBindings * 2]; + _bufferTextureRefs = new TextureBuffer[Constants.MaxTextureBindings * 2]; + _bufferImageRefs = new TextureBuffer[Constants.MaxImageBindings * 2]; + + _textureArrayRefs = Array.Empty>(); + _imageArrayRefs = Array.Empty>(); + + _textureArrayExtraRefs = Array.Empty>(); + _imageArrayExtraRefs = Array.Empty>(); + + _uniformBuffers = new DescriptorBufferInfo[Constants.MaxUniformBufferBindings]; + _storageBuffers = new DescriptorBufferInfo[Constants.MaxStorageBufferBindings]; + _textures = new DescriptorImageInfo[Constants.MaxTexturesPerStage]; + _images = new DescriptorImageInfo[Constants.MaxImagesPerStage]; + _bufferTextures = new BufferView[Constants.MaxTexturesPerStage]; + _bufferImages = new BufferView[Constants.MaxImagesPerStage]; + + _uniformSetPd = new int[Constants.MaxUniformBufferBindings]; + + var initialImageInfo = new DescriptorImageInfo + { + ImageLayout = ImageLayout.General, + }; + + _textures.AsSpan().Fill(initialImageInfo); + _images.AsSpan().Fill(initialImageInfo); + + if (gd.Capabilities.SupportsNullDescriptors) + { + // If null descriptors are supported, we can pass null as the handle. + _dummyBuffer = null; + } + else + { + // If null descriptors are not supported, we need to pass the handle of a dummy buffer on unused bindings. + _dummyBuffer = gd.BufferManager.Create(gd, 0x10000, forConditionalRendering: false, baseType: BufferAllocationType.DeviceLocal); + } + + _dummyTexture = gd.CreateTextureView(new TextureCreateInfo( + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 4, + Format.R8G8B8A8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha)); + + _dummySampler = (SamplerHolder)gd.CreateSampler(new SamplerCreateInfo( + MinFilter.Nearest, + MagFilter.Nearest, + false, + AddressMode.Repeat, + AddressMode.Repeat, + AddressMode.Repeat, + CompareMode.None, + CompareOp.Always, + new ColorF(0, 0, 0, 0), + 0, + 0, + 0, + 1f)); + + _templateUpdater = new(); + } + + public void Initialize(bool isMainPipeline) + { + MemoryOwner dummyTextureData = MemoryOwner.RentCleared(4); + _dummyTexture.SetData(dummyTextureData); + + if (isMainPipeline) + { + FeedbackLoopHazards = new(); + } + } + + private static bool BindingOverlaps(ref DescriptorBufferInfo info, int bindingOffset, int offset, int size) + { + return offset < bindingOffset + (int)info.Range && (offset + size) > bindingOffset; + } + + internal void Rebind(Auto buffer, int offset, int size) + { + if (_program == null) + { + return; + } + + // Check stage bindings + + _uniformMirrored.Union(_uniformSet).SignalSet((int binding, int count) => + { + for (int i = 0; i < count; i++) + { + ref BufferRef bufferRef = ref _uniformBufferRefs[binding]; + if (bufferRef.Buffer == buffer) + { + ref DescriptorBufferInfo info = ref _uniformBuffers[binding]; + int bindingOffset = bufferRef.Offset; + + if (BindingOverlaps(ref info, bindingOffset, offset, size)) + { + _uniformSet.Clear(binding); + _uniformSetPd[binding] = 0; + SignalDirty(DirtyFlags.Uniform); + } + } + + binding++; + } + }); + + _storageMirrored.Union(_storageSet).SignalSet((int binding, int count) => + { + for (int i = 0; i < count; i++) + { + ref BufferRef bufferRef = ref _storageBufferRefs[binding]; + if (bufferRef.Buffer == buffer) + { + ref DescriptorBufferInfo info = ref _storageBuffers[binding]; + int bindingOffset = bufferRef.Offset; + + if (BindingOverlaps(ref info, bindingOffset, offset, size)) + { + _storageSet.Clear(binding); + SignalDirty(DirtyFlags.Storage); + } + } + + binding++; + } + }); + } + + public void InsertBindingBarriers(CommandBufferScoped cbs) + { + if ((FeedbackLoopHazards?.Count ?? 0) > 0) + { + // Clear existing hazards - they will be rebuilt. + + foreach (TextureView hazard in FeedbackLoopHazards) + { + hazard.DecrementHazardUses(); + } + + FeedbackLoopHazards.Clear(); + } + + foreach (ResourceBindingSegment segment in _program.BindingSegments[PipelineBase.TextureSetIndex]) + { + if (segment.Type == ResourceType.TextureAndSampler) + { + if (!segment.IsArray) + { + for (int i = 0; i < segment.Count; i++) + { + ref var texture = ref _textureRefs[segment.Binding + i]; + texture.View?.PrepareForUsage(cbs, texture.Stage.ConvertToPipelineStageFlags(), FeedbackLoopHazards); + } + } + else + { + ref var arrayRef = ref _textureArrayRefs[segment.Binding]; + PipelineStageFlags stageFlags = arrayRef.Stage.ConvertToPipelineStageFlags(); + arrayRef.Array?.QueueWriteToReadBarriers(cbs, stageFlags); + } + } + } + + foreach (ResourceBindingSegment segment in _program.BindingSegments[PipelineBase.ImageSetIndex]) + { + if (segment.Type == ResourceType.Image) + { + if (!segment.IsArray) + { + for (int i = 0; i < segment.Count; i++) + { + ref var image = ref _imageRefs[segment.Binding + i]; + image.View?.PrepareForUsage(cbs, image.Stage.ConvertToPipelineStageFlags(), FeedbackLoopHazards); + } + } + else + { + ref var arrayRef = ref _imageArrayRefs[segment.Binding]; + PipelineStageFlags stageFlags = arrayRef.Stage.ConvertToPipelineStageFlags(); + arrayRef.Array?.QueueWriteToReadBarriers(cbs, stageFlags); + } + } + } + + for (int setIndex = PipelineBase.DescriptorSetLayouts; setIndex < _program.BindingSegments.Length; setIndex++) + { + var bindingSegments = _program.BindingSegments[setIndex]; + + if (bindingSegments.Length == 0) + { + continue; + } + + ResourceBindingSegment segment = bindingSegments[0]; + + if (segment.IsArray) + { + if (segment.Type == ResourceType.Texture || + segment.Type == ResourceType.Sampler || + segment.Type == ResourceType.TextureAndSampler || + segment.Type == ResourceType.BufferTexture) + { + ref var arrayRef = ref _textureArrayExtraRefs[setIndex - PipelineBase.DescriptorSetLayouts]; + PipelineStageFlags stageFlags = arrayRef.Stage.ConvertToPipelineStageFlags(); + arrayRef.Array?.QueueWriteToReadBarriers(cbs, stageFlags); + } + else if (segment.Type == ResourceType.Image || segment.Type == ResourceType.BufferImage) + { + ref var arrayRef = ref _imageArrayExtraRefs[setIndex - PipelineBase.DescriptorSetLayouts]; + PipelineStageFlags stageFlags = arrayRef.Stage.ConvertToPipelineStageFlags(); + arrayRef.Array?.QueueWriteToReadBarriers(cbs, stageFlags); + } + } + } + } + + public void AdvancePdSequence() + { + if (++_pdSequence == 0) + { + _pdSequence = 1; + } + } + + public void SetProgram(CommandBufferScoped cbs, ShaderCollection program, bool isBound) + { + if (!program.HasSameLayout(_program)) + { + // When the pipeline layout changes, push descriptor bindings are invalidated. + + AdvancePdSequence(); + } + + _program = program; + _updateDescriptorCacheCbIndex = true; + _dirty = DirtyFlags.All; + } + + public void SetImage(CommandBufferScoped cbs, ShaderStage stage, int binding, ITexture image) + { + if (image is TextureBuffer imageBuffer) + { + _bufferImageRefs[binding] = imageBuffer; + } + else if (image is TextureView view) + { + ref ImageRef iRef = ref _imageRefs[binding]; + + iRef.View?.ClearUsage(FeedbackLoopHazards); + view?.PrepareForUsage(cbs, stage.ConvertToPipelineStageFlags(), FeedbackLoopHazards); + + iRef = new(stage, view, view.GetIdentityImageView()); + } + else + { + _imageRefs[binding] = default; + _bufferImageRefs[binding] = null; + } + + SignalDirty(DirtyFlags.Image); + } + + public void SetImage(int binding, Auto image) + { + _imageRefs[binding] = new(ShaderStage.Compute, null, image); + + SignalDirty(DirtyFlags.Image); + } + + public void SetStorageBuffers(CommandBuffer commandBuffer, ReadOnlySpan buffers) + { + for (int i = 0; i < buffers.Length; i++) + { + var assignment = buffers[i]; + var buffer = assignment.Range; + int index = assignment.Binding; + + Auto vkBuffer = buffer.Handle == BufferHandle.Null + ? null + : _gd.BufferManager.GetBuffer(commandBuffer, buffer.Handle, buffer.Write, isSSBO: true); + + ref BufferRef currentBufferRef = ref _storageBufferRefs[index]; + + DescriptorBufferInfo info = new() + { + Offset = (ulong)buffer.Offset, + Range = (ulong)buffer.Size, + }; + + var newRef = new BufferRef(vkBuffer, ref buffer); + + ref DescriptorBufferInfo currentInfo = ref _storageBuffers[index]; + + if (!currentBufferRef.Equals(newRef) || currentInfo.Range != info.Range) + { + _storageSet.Clear(index); + + currentInfo = info; + currentBufferRef = newRef; + } + } + + SignalDirty(DirtyFlags.Storage); + } + + public void SetStorageBuffers(CommandBuffer commandBuffer, int first, ReadOnlySpan> buffers) + { + for (int i = 0; i < buffers.Length; i++) + { + var vkBuffer = buffers[i]; + int index = first + i; + + ref BufferRef currentBufferRef = ref _storageBufferRefs[index]; + + DescriptorBufferInfo info = new() + { + Offset = 0, + Range = Vk.WholeSize, + }; + + BufferRef newRef = new(vkBuffer); + + ref DescriptorBufferInfo currentInfo = ref _storageBuffers[index]; + + if (!currentBufferRef.Equals(newRef) || currentInfo.Range != info.Range) + { + _storageSet.Clear(index); + + currentInfo = info; + currentBufferRef = newRef; + } + } + + SignalDirty(DirtyFlags.Storage); + } + + public void SetTextureAndSampler( + CommandBufferScoped cbs, + ShaderStage stage, + int binding, + ITexture texture, + ISampler sampler) + { + if (texture is TextureBuffer textureBuffer) + { + _bufferTextureRefs[binding] = textureBuffer; + } + else if (texture is TextureView view) + { + ref TextureRef iRef = ref _textureRefs[binding]; + + iRef.View?.ClearUsage(FeedbackLoopHazards); + view?.PrepareForUsage(cbs, stage.ConvertToPipelineStageFlags(), FeedbackLoopHazards); + + iRef = new(stage, view, view.GetImageView(), ((SamplerHolder)sampler)?.GetSampler()); + } + else + { + _textureRefs[binding] = default; + _bufferTextureRefs[binding] = null; + } + + SignalDirty(DirtyFlags.Texture); + } + + public void SetTextureAndSamplerIdentitySwizzle( + CommandBufferScoped cbs, + ShaderStage stage, + int binding, + ITexture texture, + ISampler sampler) + { + if (texture is TextureView view) + { + view.Storage.QueueWriteToReadBarrier(cbs, AccessFlags.ShaderReadBit, stage.ConvertToPipelineStageFlags()); + + _textureRefs[binding] = new(stage, view, view.GetIdentityImageView(), ((SamplerHolder)sampler)?.GetSampler()); + + SignalDirty(DirtyFlags.Texture); + } + else + { + SetTextureAndSampler(cbs, stage, binding, texture, sampler); + } + } + + public void SetTextureArray(CommandBufferScoped cbs, ShaderStage stage, int binding, ITextureArray array) + { + ref ArrayRef arrayRef = ref GetArrayRef(ref _textureArrayRefs, binding, ArrayGrowthSize); + + if (arrayRef.Stage != stage || arrayRef.Array != array) + { + arrayRef.Array?.DecrementBindCount(); + + if (array is TextureArray textureArray) + { + textureArray.IncrementBindCount(); + textureArray.QueueWriteToReadBarriers(cbs, stage.ConvertToPipelineStageFlags()); + } + + arrayRef = new ArrayRef(stage, array as TextureArray); + + SignalDirty(DirtyFlags.Texture); + } + } + + public void SetTextureArraySeparate(CommandBufferScoped cbs, ShaderStage stage, int setIndex, ITextureArray array) + { + ref ArrayRef arrayRef = ref GetArrayRef(ref _textureArrayExtraRefs, setIndex - PipelineBase.DescriptorSetLayouts); + + if (arrayRef.Stage != stage || arrayRef.Array != array) + { + arrayRef.Array?.DecrementBindCount(); + + if (array is TextureArray textureArray) + { + textureArray.IncrementBindCount(); + textureArray.QueueWriteToReadBarriers(cbs, stage.ConvertToPipelineStageFlags()); + } + + arrayRef = new ArrayRef(stage, array as TextureArray); + + SignalDirty(DirtyFlags.Texture); + } + } + + public void SetImageArray(CommandBufferScoped cbs, ShaderStage stage, int binding, IImageArray array) + { + ref ArrayRef arrayRef = ref GetArrayRef(ref _imageArrayRefs, binding, ArrayGrowthSize); + + if (arrayRef.Stage != stage || arrayRef.Array != array) + { + arrayRef.Array?.DecrementBindCount(); + + if (array is ImageArray imageArray) + { + imageArray.IncrementBindCount(); + imageArray.QueueWriteToReadBarriers(cbs, stage.ConvertToPipelineStageFlags()); + } + + arrayRef = new ArrayRef(stage, array as ImageArray); + + SignalDirty(DirtyFlags.Image); + } + } + + public void SetImageArraySeparate(CommandBufferScoped cbs, ShaderStage stage, int setIndex, IImageArray array) + { + ref ArrayRef arrayRef = ref GetArrayRef(ref _imageArrayExtraRefs, setIndex - PipelineBase.DescriptorSetLayouts); + + if (arrayRef.Stage != stage || arrayRef.Array != array) + { + arrayRef.Array?.DecrementBindCount(); + + if (array is ImageArray imageArray) + { + imageArray.IncrementBindCount(); + imageArray.QueueWriteToReadBarriers(cbs, stage.ConvertToPipelineStageFlags()); + } + + arrayRef = new ArrayRef(stage, array as ImageArray); + + SignalDirty(DirtyFlags.Image); + } + } + + private static ref ArrayRef GetArrayRef(ref ArrayRef[] array, int index, int growthSize = 1) + { + ArgumentOutOfRangeException.ThrowIfNegative(index); + + if (array.Length <= index) + { + Array.Resize(ref array, index + growthSize); + } + + return ref array[index]; + } + + public void SetUniformBuffers(CommandBuffer commandBuffer, ReadOnlySpan buffers) + { + for (int i = 0; i < buffers.Length; i++) + { + var assignment = buffers[i]; + var buffer = assignment.Range; + int index = assignment.Binding; + + Auto vkBuffer = buffer.Handle == BufferHandle.Null + ? null + : _gd.BufferManager.GetBuffer(commandBuffer, buffer.Handle, false); + + ref BufferRef currentBufferRef = ref _uniformBufferRefs[index]; + + DescriptorBufferInfo info = new() + { + Offset = (ulong)buffer.Offset, + Range = (ulong)buffer.Size, + }; + + BufferRef newRef = new(vkBuffer, ref buffer); + + ref DescriptorBufferInfo currentInfo = ref _uniformBuffers[index]; + + if (!currentBufferRef.Equals(newRef) || currentInfo.Range != info.Range) + { + _uniformSet.Clear(index); + _uniformSetPd[index] = 0; + + currentInfo = info; + currentBufferRef = newRef; + } + } + + SignalDirty(DirtyFlags.Uniform); + } + + private void SignalDirty(DirtyFlags flag) + { + _dirty |= flag; + } + + public void UpdateAndBindDescriptorSets(CommandBufferScoped cbs, PipelineBindPoint pbp) + { + if ((_dirty & DirtyFlags.All) == 0) + { + return; + } + + var program = _program; + + if (_dirty.HasFlag(DirtyFlags.Uniform)) + { + if (program.UsePushDescriptors) + { + UpdateAndBindUniformBufferPd(cbs); + } + else + { + UpdateAndBind(cbs, program, PipelineBase.UniformSetIndex, pbp); + } + } + + if (_dirty.HasFlag(DirtyFlags.Storage)) + { + UpdateAndBind(cbs, program, PipelineBase.StorageSetIndex, pbp); + } + + if (_dirty.HasFlag(DirtyFlags.Texture)) + { + if (program.UpdateTexturesWithoutTemplate) + { + UpdateAndBindTexturesWithoutTemplate(cbs, program, pbp); + } + else + { + UpdateAndBind(cbs, program, PipelineBase.TextureSetIndex, pbp); + } + } + + if (_dirty.HasFlag(DirtyFlags.Image)) + { + UpdateAndBind(cbs, program, PipelineBase.ImageSetIndex, pbp); + } + + if (program.BindingSegments.Length > PipelineBase.DescriptorSetLayouts) + { + // Program is using extra sets, we need to bind those too. + + BindExtraSets(cbs, program, pbp); + } + + _dirty = DirtyFlags.None; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool UpdateBuffer( + CommandBufferScoped cbs, + ref DescriptorBufferInfo info, + ref BufferRef buffer, + Auto dummyBuffer, + bool mirrorable) + { + int offset = buffer.Offset; + bool mirrored = false; + + if (mirrorable) + { + info.Buffer = buffer.Buffer?.GetMirrorable(cbs, ref offset, (int)info.Range, out mirrored).Value ?? default; + } + else + { + info.Buffer = buffer.Buffer?.Get(cbs, offset, (int)info.Range, buffer.Write).Value ?? default; + } + + info.Offset = (ulong)offset; + + // The spec requires that buffers with null handle have offset as 0 and range as VK_WHOLE_SIZE. + if (info.Buffer.Handle == 0) + { + info.Buffer = dummyBuffer?.Get(cbs).Value ?? default; + info.Offset = 0; + info.Range = Vk.WholeSize; + } + + return mirrored; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void UpdateAndBind(CommandBufferScoped cbs, ShaderCollection program, int setIndex, PipelineBindPoint pbp) + { + var bindingSegments = program.BindingSegments[setIndex]; + + if (bindingSegments.Length == 0) + { + return; + } + + var dummyBuffer = _dummyBuffer?.GetBuffer(); + + if (_updateDescriptorCacheCbIndex) + { + _updateDescriptorCacheCbIndex = false; + program.UpdateDescriptorCacheCommandBufferIndex(cbs.CommandBufferIndex); + } + + var dsc = program.GetNewDescriptorSetCollection(setIndex, out var isNew).Get(cbs); + + if (!program.HasMinimalLayout) + { + if (isNew) + { + Initialize(cbs, setIndex, dsc); + } + } + + DescriptorSetTemplate template = program.Templates[setIndex]; + + DescriptorSetTemplateWriter tu = _templateUpdater.Begin(template); + + foreach (ResourceBindingSegment segment in bindingSegments) + { + int binding = segment.Binding; + int count = segment.Count; + + if (setIndex == PipelineBase.UniformSetIndex) + { + for (int i = 0; i < count; i++) + { + int index = binding + i; + + if (_uniformSet.Set(index)) + { + ref BufferRef buffer = ref _uniformBufferRefs[index]; + + bool mirrored = UpdateBuffer(cbs, ref _uniformBuffers[index], ref buffer, dummyBuffer, true); + + _uniformMirrored.Set(index, mirrored); + } + } + + ReadOnlySpan uniformBuffers = _uniformBuffers; + + tu.Push(uniformBuffers.Slice(binding, count)); + } + else if (setIndex == PipelineBase.StorageSetIndex) + { + for (int i = 0; i < count; i++) + { + int index = binding + i; + + ref BufferRef buffer = ref _storageBufferRefs[index]; + + if (_storageSet.Set(index)) + { + ref var info = ref _storageBuffers[index]; + + bool mirrored = UpdateBuffer(cbs, + ref info, + ref _storageBufferRefs[index], + dummyBuffer, + !buffer.Write && info.Range <= StorageBufferMaxMirrorable); + + _storageMirrored.Set(index, mirrored); + } + } + + ReadOnlySpan storageBuffers = _storageBuffers; + + tu.Push(storageBuffers.Slice(binding, count)); + } + else if (setIndex == PipelineBase.TextureSetIndex) + { + if (!segment.IsArray) + { + if (segment.Type != ResourceType.BufferTexture) + { + Span textures = _textures; + + for (int i = 0; i < count; i++) + { + ref var texture = ref textures[i]; + ref var refs = ref _textureRefs[binding + i]; + + texture.ImageView = refs.ImageView?.Get(cbs).Value ?? default; + texture.Sampler = refs.Sampler?.Get(cbs).Value ?? default; + + if (texture.ImageView.Handle == 0) + { + texture.ImageView = _dummyTexture.GetImageView().Get(cbs).Value; + } + + if (texture.Sampler.Handle == 0) + { + texture.Sampler = _dummySampler.GetSampler().Get(cbs).Value; + } + } + + tu.Push(textures[..count]); + } + else + { + Span bufferTextures = _bufferTextures; + + for (int i = 0; i < count; i++) + { + bufferTextures[i] = _bufferTextureRefs[binding + i]?.GetBufferView(cbs, false) ?? default; + } + + tu.Push(bufferTextures[..count]); + } + } + else + { + if (segment.Type != ResourceType.BufferTexture) + { + tu.Push(_textureArrayRefs[binding].Array.GetImageInfos(_gd, cbs, _dummyTexture, _dummySampler)); + } + else + { + tu.Push(_textureArrayRefs[binding].Array.GetBufferViews(cbs)); + } + } + } + else if (setIndex == PipelineBase.ImageSetIndex) + { + if (!segment.IsArray) + { + if (segment.Type != ResourceType.BufferImage) + { + Span images = _images; + + for (int i = 0; i < count; i++) + { + images[i].ImageView = _imageRefs[binding + i].ImageView?.Get(cbs).Value ?? default; + } + + tu.Push(images[..count]); + } + else + { + Span bufferImages = _bufferImages; + + for (int i = 0; i < count; i++) + { + bufferImages[i] = _bufferImageRefs[binding + i]?.GetBufferView(cbs, true) ?? default; + } + + tu.Push(bufferImages[..count]); + } + } + else + { + if (segment.Type != ResourceType.BufferTexture) + { + tu.Push(_imageArrayRefs[binding].Array.GetImageInfos(_gd, cbs, _dummyTexture)); + } + else + { + tu.Push(_imageArrayRefs[binding].Array.GetBufferViews(cbs)); + } + } + } + } + + var sets = dsc.GetSets(); + _templateUpdater.Commit(_gd, _device, sets[0]); + + _gd.Api.CmdBindDescriptorSets(cbs.CommandBuffer, pbp, _program.PipelineLayout, (uint)setIndex, 1, sets, 0, ReadOnlySpan.Empty); + } + + private void UpdateAndBindTexturesWithoutTemplate(CommandBufferScoped cbs, ShaderCollection program, PipelineBindPoint pbp) + { + int setIndex = PipelineBase.TextureSetIndex; + var bindingSegments = program.BindingSegments[setIndex]; + + if (bindingSegments.Length == 0) + { + return; + } + + if (_updateDescriptorCacheCbIndex) + { + _updateDescriptorCacheCbIndex = false; + program.UpdateDescriptorCacheCommandBufferIndex(cbs.CommandBufferIndex); + } + + var dsc = program.GetNewDescriptorSetCollection(setIndex, out _).Get(cbs); + + foreach (ResourceBindingSegment segment in bindingSegments) + { + int binding = segment.Binding; + int count = segment.Count; + + if (!segment.IsArray) + { + if (segment.Type != ResourceType.BufferTexture) + { + Span textures = _textures; + + for (int i = 0; i < count; i++) + { + ref var texture = ref textures[i]; + ref var refs = ref _textureRefs[binding + i]; + + texture.ImageView = refs.ImageView?.Get(cbs).Value ?? default; + texture.Sampler = refs.Sampler?.Get(cbs).Value ?? default; + + if (texture.ImageView.Handle == 0) + { + texture.ImageView = _dummyTexture.GetImageView().Get(cbs).Value; + } + + if (texture.Sampler.Handle == 0) + { + texture.Sampler = _dummySampler.GetSampler().Get(cbs).Value; + } + } + + dsc.UpdateImages(0, binding, textures[..count], DescriptorType.CombinedImageSampler); + } + else + { + Span bufferTextures = _bufferTextures; + + for (int i = 0; i < count; i++) + { + bufferTextures[i] = _bufferTextureRefs[binding + i]?.GetBufferView(cbs, false) ?? default; + } + + dsc.UpdateBufferImages(0, binding, bufferTextures[..count], DescriptorType.UniformTexelBuffer); + } + } + else + { + if (segment.Type != ResourceType.BufferTexture) + { + dsc.UpdateImages(0, binding, _textureArrayRefs[binding].Array.GetImageInfos(_gd, cbs, _dummyTexture, _dummySampler), DescriptorType.CombinedImageSampler); + } + else + { + dsc.UpdateBufferImages(0, binding, _textureArrayRefs[binding].Array.GetBufferViews(cbs), DescriptorType.UniformTexelBuffer); + } + } + } + + var sets = dsc.GetSets(); + + _gd.Api.CmdBindDescriptorSets(cbs.CommandBuffer, pbp, _program.PipelineLayout, (uint)setIndex, 1, sets, 0, ReadOnlySpan.Empty); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void UpdateAndBindUniformBufferPd(CommandBufferScoped cbs) + { + int sequence = _pdSequence; + var bindingSegments = _program.BindingSegments[PipelineBase.UniformSetIndex]; + var dummyBuffer = _dummyBuffer?.GetBuffer(); + + long updatedBindings = 0; + DescriptorSetTemplateWriter writer = _templateUpdater.Begin(32 * Unsafe.SizeOf()); + + foreach (ResourceBindingSegment segment in bindingSegments) + { + int binding = segment.Binding; + int count = segment.Count; + + ReadOnlySpan uniformBuffers = _uniformBuffers; + + for (int i = 0; i < count; i++) + { + int index = binding + i; + + if (_uniformSet.Set(index)) + { + ref BufferRef buffer = ref _uniformBufferRefs[index]; + + bool mirrored = UpdateBuffer(cbs, ref _uniformBuffers[index], ref buffer, dummyBuffer, true); + + _uniformMirrored.Set(index, mirrored); + } + + if (_uniformSetPd[index] != sequence) + { + // Need to set this push descriptor (even if the buffer binding has not changed) + + _uniformSetPd[index] = sequence; + updatedBindings |= 1L << index; + + writer.Push(MemoryMarshal.CreateReadOnlySpan(ref _uniformBuffers[index], 1)); + } + } + } + + if (updatedBindings > 0) + { + DescriptorSetTemplate template = _program.GetPushDescriptorTemplate(updatedBindings); + _templateUpdater.CommitPushDescriptor(_gd, cbs, template, _program.PipelineLayout); + } + } + + private void Initialize(CommandBufferScoped cbs, int setIndex, DescriptorSetCollection dsc) + { + // We don't support clearing texture descriptors currently. + if (setIndex != PipelineBase.UniformSetIndex && setIndex != PipelineBase.StorageSetIndex) + { + return; + } + + var dummyBuffer = _dummyBuffer?.GetBuffer().Get(cbs).Value ?? default; + + foreach (ResourceBindingSegment segment in _program.ClearSegments[setIndex]) + { + dsc.InitializeBuffers(0, segment.Binding, segment.Count, segment.Type.Convert(), dummyBuffer); + } + } + + private void BindExtraSets(CommandBufferScoped cbs, ShaderCollection program, PipelineBindPoint pbp) + { + for (int setIndex = PipelineBase.DescriptorSetLayouts; setIndex < program.BindingSegments.Length; setIndex++) + { + var bindingSegments = program.BindingSegments[setIndex]; + + if (bindingSegments.Length == 0) + { + continue; + } + + ResourceBindingSegment segment = bindingSegments[0]; + + if (segment.IsArray) + { + DescriptorSet[] sets = null; + + if (segment.Type == ResourceType.Texture || + segment.Type == ResourceType.Sampler || + segment.Type == ResourceType.TextureAndSampler || + segment.Type == ResourceType.BufferTexture) + { + sets = _textureArrayExtraRefs[setIndex - PipelineBase.DescriptorSetLayouts].Array.GetDescriptorSets( + _device, + cbs, + _templateUpdater, + program, + setIndex, + _dummyTexture, + _dummySampler); + } + else if (segment.Type == ResourceType.Image || segment.Type == ResourceType.BufferImage) + { + sets = _imageArrayExtraRefs[setIndex - PipelineBase.DescriptorSetLayouts].Array.GetDescriptorSets( + _device, + cbs, + _templateUpdater, + program, + setIndex, + _dummyTexture); + } + + if (sets != null) + { + _gd.Api.CmdBindDescriptorSets(cbs.CommandBuffer, pbp, _program.PipelineLayout, (uint)setIndex, 1, sets, 0, ReadOnlySpan.Empty); + } + } + } + } + + public void SignalCommandBufferChange() + { + _updateDescriptorCacheCbIndex = true; + _dirty = DirtyFlags.All; + + _uniformSet.Clear(); + _storageSet.Clear(); + AdvancePdSequence(); + } + + public void ForceTextureDirty() + { + SignalDirty(DirtyFlags.Texture); + } + + public void ForceImageDirty() + { + SignalDirty(DirtyFlags.Image); + } + + private static void SwapBuffer(BufferRef[] list, Auto from, Auto to) + { + for (int i = 0; i < list.Length; i++) + { + if (list[i].Buffer == from) + { + list[i].Buffer = to; + } + } + } + + public void SwapBuffer(Auto from, Auto to) + { + SwapBuffer(_uniformBufferRefs, from, to); + SwapBuffer(_storageBufferRefs, from, to); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + _dummyTexture.Dispose(); + _dummySampler.Dispose(); + _templateUpdater.Dispose(); + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBuffer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBuffer.cs new file mode 100644 index 000000000..7f8a2a735 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBuffer.cs @@ -0,0 +1,26 @@ +using Silk.NET.Vulkan; +using System; +using Buffer = Silk.NET.Vulkan.Buffer; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableBuffer : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public Buffer Value { get; } + + public DisposableBuffer(Vk api, Device device, Buffer buffer) + { + _api = api; + _device = device; + Value = buffer; + } + + public void Dispose() + { + _api.DestroyBuffer(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBufferView.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBufferView.cs new file mode 100644 index 000000000..3fd230918 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableBufferView.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableBufferView : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public BufferView Value { get; } + + public DisposableBufferView(Vk api, Device device, BufferView bufferView) + { + _api = api; + _device = device; + Value = bufferView; + } + + public void Dispose() + { + _api.DestroyBufferView(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableFramebuffer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableFramebuffer.cs new file mode 100644 index 000000000..895df9cbf --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableFramebuffer.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableFramebuffer : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public Framebuffer Value { get; } + + public DisposableFramebuffer(Vk api, Device device, Framebuffer framebuffer) + { + _api = api; + _device = device; + Value = framebuffer; + } + + public void Dispose() + { + _api.DestroyFramebuffer(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImage.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImage.cs new file mode 100644 index 000000000..83af7ef4b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImage.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableImage : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public Image Value { get; } + + public DisposableImage(Vk api, Device device, Image image) + { + _api = api; + _device = device; + Value = image; + } + + public void Dispose() + { + _api.DestroyImage(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImageView.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImageView.cs new file mode 100644 index 000000000..e0cf2ad0f --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableImageView.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableImageView : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public ImageView Value { get; } + + public DisposableImageView(Vk api, Device device, ImageView imageView) + { + _api = api; + _device = device; + Value = imageView; + } + + public void Dispose() + { + _api.DestroyImageView(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableMemory.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableMemory.cs new file mode 100644 index 000000000..4ff628a57 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableMemory.cs @@ -0,0 +1,24 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableMemory : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + private readonly DeviceMemory _memory; + + public DisposableMemory(Vk api, Device device, DeviceMemory memory) + { + _api = api; + _device = device; + _memory = memory; + } + + public void Dispose() + { + _api.FreeMemory(_device, _memory, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposablePipeline.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposablePipeline.cs new file mode 100644 index 000000000..e5f5a9743 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposablePipeline.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposablePipeline : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public Pipeline Value { get; } + + public DisposablePipeline(Vk api, Device device, Pipeline pipeline) + { + _api = api; + _device = device; + Value = pipeline; + } + + public void Dispose() + { + _api.DestroyPipeline(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableRenderPass.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableRenderPass.cs new file mode 100644 index 000000000..102e21f28 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableRenderPass.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableRenderPass : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public RenderPass Value { get; } + + public DisposableRenderPass(Vk api, Device device, RenderPass renderPass) + { + _api = api; + _device = device; + Value = renderPass; + } + + public void Dispose() + { + _api.DestroyRenderPass(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableSampler.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableSampler.cs new file mode 100644 index 000000000..5e3caf64a --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/DisposableSampler.cs @@ -0,0 +1,25 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct DisposableSampler : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + + public Sampler Value { get; } + + public DisposableSampler(Vk api, Device device, Sampler sampler) + { + _api = api; + _device = device; + Value = sampler; + } + + public void Dispose() + { + _api.DestroySampler(_device, Value, Span.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/AreaScalingFilter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/AreaScalingFilter.cs new file mode 100644 index 000000000..a51956694 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/AreaScalingFilter.cs @@ -0,0 +1,101 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; +using Extent2D = Ryujinx.Graphics.GAL.Extents2D; +using Format = Silk.NET.Vulkan.Format; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Effects +{ + internal class AreaScalingFilter : IScalingFilter + { + private readonly VulkanRenderer _renderer; + private PipelineHelperShader _pipeline; + private ISampler _sampler; + private ShaderCollection _scalingProgram; + private Device _device; + + public float Level { get; set; } + + public AreaScalingFilter(VulkanRenderer renderer, Device device) + { + _device = device; + _renderer = renderer; + + Initialize(); + } + + public void Dispose() + { + _pipeline.Dispose(); + _scalingProgram.Dispose(); + _sampler.Dispose(); + } + + public void Initialize() + { + _pipeline = new PipelineHelperShader(_renderer, _device); + + _pipeline.Initialize(); + + var scalingShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.spv"); + + var scalingResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + _sampler = _renderer.CreateSampler(SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _scalingProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(scalingShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, scalingResourceLayout); + } + + public void Run( + TextureView view, + CommandBufferScoped cbs, + Auto destinationTexture, + Format format, + int width, + int height, + Extent2D source, + Extent2D destination) + { + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_scalingProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _sampler); + + ReadOnlySpan dimensionsBuffer = stackalloc float[] + { + source.X1, + source.X2, + source.Y1, + source.Y2, + destination.X1, + destination.X2, + destination.Y1, + destination.Y2, + }; + + int rangeSize = dimensionsBuffer.Length * sizeof(float); + using var buffer = _renderer.BufferManager.ReserveOrCreate(_renderer, cbs, rangeSize); + buffer.Holder.SetDataUnchecked(buffer.Offset, dimensionsBuffer); + + int threadGroupWorkRegionDim = 16; + int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, buffer.Range) }); + _pipeline.SetImage(0, destinationTexture); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FsrScalingFilter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FsrScalingFilter.cs new file mode 100644 index 000000000..801d29418 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FsrScalingFilter.cs @@ -0,0 +1,172 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; +using Extent2D = Ryujinx.Graphics.GAL.Extents2D; +using Format = Silk.NET.Vulkan.Format; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Effects +{ + internal class FsrScalingFilter : IScalingFilter + { + private readonly VulkanRenderer _renderer; + private PipelineHelperShader _pipeline; + private ISampler _sampler; + private ShaderCollection _scalingProgram; + private ShaderCollection _sharpeningProgram; + private float _sharpeningLevel = 1; + private Device _device; + private TextureView _intermediaryTexture; + + public float Level + { + get => _sharpeningLevel; + set + { + _sharpeningLevel = MathF.Max(0.01f, value); + } + } + + public FsrScalingFilter(VulkanRenderer renderer, Device device) + { + _device = device; + _renderer = renderer; + + Initialize(); + } + + public void Dispose() + { + _pipeline.Dispose(); + _scalingProgram.Dispose(); + _sharpeningProgram.Dispose(); + _sampler.Dispose(); + _intermediaryTexture?.Dispose(); + } + + public void Initialize() + { + _pipeline = new PipelineHelperShader(_renderer, _device); + + _pipeline.Initialize(); + + var scalingShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.spv"); + var sharpeningShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.spv"); + + var scalingResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + var sharpeningResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 3) + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 4) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + _sampler = _renderer.CreateSampler(SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _scalingProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(scalingShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, scalingResourceLayout); + + _sharpeningProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(sharpeningShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, sharpeningResourceLayout); + } + + public void Run( + TextureView view, + CommandBufferScoped cbs, + Auto destinationTexture, + Format format, + int width, + int height, + Extent2D source, + Extent2D destination) + { + if (_intermediaryTexture == null + || _intermediaryTexture.Info.Width != width + || _intermediaryTexture.Info.Height != height + || !_intermediaryTexture.Info.Equals(view.Info)) + { + var originalInfo = view.Info; + + var info = new TextureCreateInfo( + width, + height, + originalInfo.Depth, + originalInfo.Levels, + originalInfo.Samples, + originalInfo.BlockWidth, + originalInfo.BlockHeight, + originalInfo.BytesPerPixel, + originalInfo.Format, + originalInfo.DepthStencilMode, + originalInfo.Target, + originalInfo.SwizzleR, + originalInfo.SwizzleG, + originalInfo.SwizzleB, + originalInfo.SwizzleA); + _intermediaryTexture?.Dispose(); + _intermediaryTexture = _renderer.CreateTexture(info) as TextureView; + } + + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_scalingProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _sampler); + + float srcWidth = Math.Abs(source.X2 - source.X1); + float srcHeight = Math.Abs(source.Y2 - source.Y1); + float scaleX = srcWidth / view.Width; + float scaleY = srcHeight / view.Height; + + ReadOnlySpan dimensionsBuffer = stackalloc float[] + { + source.X1, + source.X2, + source.Y1, + source.Y2, + destination.X1, + destination.X2, + destination.Y1, + destination.Y2, + scaleX, + scaleY, + }; + + int rangeSize = dimensionsBuffer.Length * sizeof(float); + using var buffer = _renderer.BufferManager.ReserveOrCreate(_renderer, cbs, rangeSize); + buffer.Holder.SetDataUnchecked(buffer.Offset, dimensionsBuffer); + + ReadOnlySpan sharpeningBufferData = stackalloc float[] { 1.5f - (Level * 0.01f * 1.5f) }; + using var sharpeningBuffer = _renderer.BufferManager.ReserveOrCreate(_renderer, cbs, sizeof(float)); + sharpeningBuffer.Holder.SetDataUnchecked(sharpeningBuffer.Offset, sharpeningBufferData); + + int threadGroupWorkRegionDim = 16; + int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, buffer.Range) }); + _pipeline.SetImage(ShaderStage.Compute, 0, _intermediaryTexture.GetView(FormatTable.ConvertRgba8SrgbToUnorm(view.Info.Format))); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + // Sharpening pass + _pipeline.SetProgram(_sharpeningProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _intermediaryTexture, _sampler); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(4, sharpeningBuffer.Range) }); + _pipeline.SetImage(0, destinationTexture); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FxaaPostProcessingEffect.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FxaaPostProcessingEffect.cs new file mode 100644 index 000000000..69cecec97 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/FxaaPostProcessingEffect.cs @@ -0,0 +1,88 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Effects +{ + internal class FxaaPostProcessingEffect : IPostProcessingEffect + { + private readonly VulkanRenderer _renderer; + private ISampler _samplerLinear; + private ShaderCollection _shaderProgram; + + private readonly PipelineHelperShader _pipeline; + private TextureView _texture; + + public FxaaPostProcessingEffect(VulkanRenderer renderer, Device device) + { + _renderer = renderer; + _pipeline = new PipelineHelperShader(renderer, device); + + Initialize(); + } + + public void Dispose() + { + _shaderProgram.Dispose(); + _pipeline.Dispose(); + _samplerLinear.Dispose(); + _texture?.Dispose(); + } + + private void Initialize() + { + _pipeline.Initialize(); + + var shader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/Fxaa.spv"); + + var resourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + _samplerLinear = _renderer.CreateSampler(SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _shaderProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(shader, ShaderStage.Compute, TargetLanguage.Spirv), + }, resourceLayout); + } + + public TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height) + { + if (_texture == null || _texture.Width != view.Width || _texture.Height != view.Height) + { + _texture?.Dispose(); + _texture = _renderer.CreateTexture(view.Info) as TextureView; + } + + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_shaderProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + + ReadOnlySpan resolutionBuffer = stackalloc float[] { view.Width, view.Height }; + int rangeSize = resolutionBuffer.Length * sizeof(float); + using var buffer = _renderer.BufferManager.ReserveOrCreate(_renderer, cbs, rangeSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, resolutionBuffer); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, buffer.Range) }); + + var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize); + var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize); + + _pipeline.SetImage(ShaderStage.Compute, 0, _texture.GetView(FormatTable.ConvertRgba8SrgbToUnorm(view.Info.Format))); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + + return _texture; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IPostProcessingEffect.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IPostProcessingEffect.cs new file mode 100644 index 000000000..de3a2a8ea --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IPostProcessingEffect.cs @@ -0,0 +1,10 @@ +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Effects +{ + internal interface IPostProcessingEffect : IDisposable + { + const int LocalGroupSize = 64; + TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height); + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IScalingFilter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IScalingFilter.cs new file mode 100644 index 000000000..a0e35ac6c --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/IScalingFilter.cs @@ -0,0 +1,20 @@ +using Silk.NET.Vulkan; +using System; +using Extent2D = Ryujinx.Graphics.GAL.Extents2D; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Effects +{ + internal interface IScalingFilter : IDisposable + { + float Level { get; set; } + void Run( + TextureView view, + CommandBufferScoped cbs, + Auto destinationTexture, + Format format, + int width, + int height, + Extent2D source, + Extent2D destination); + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.glsl b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.glsl new file mode 100644 index 000000000..e34dd77dd --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.glsl @@ -0,0 +1,122 @@ +// Scaling + +#version 430 core +layout (local_size_x = 16, local_size_y = 16) in; +layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; +layout( binding = 1, set = 2) uniform sampler2D Source; +layout( binding = 2 ) uniform dimensions{ + float srcX0; + float srcX1; + float srcY0; + float srcY1; + float dstX0; + float dstX1; + float dstY0; + float dstY1; +}; + +/***** Area Sampling *****/ + +// By Sam Belliveau and Filippo Tarpini. Public Domain license. +// Effectively a more accurate sharp bilinear filter when upscaling, +// that also works as a mathematically perfect downscale filter. +// https://entropymine.com/imageworsener/pixelmixing/ +// https://github.com/obsproject/obs-studio/pull/1715 +// https://legacy.imagemagick.org/Usage/filter/ +vec4 AreaSampling(vec2 xy) +{ + // Determine the sizes of the source and target images. + vec2 source_size = vec2(abs(srcX1 - srcX0), abs(srcY1 - srcY0)); + vec2 target_size = vec2(abs(dstX1 - dstX0), abs(dstY1 - dstY0)); + vec2 inverted_target_size = vec2(1.0) / target_size; + + // Compute the top-left and bottom-right corners of the target pixel box. + vec2 t_beg = floor(xy - vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1)); + vec2 t_end = t_beg + vec2(1.0, 1.0); + + // Convert the target pixel box to source pixel box. + vec2 beg = t_beg * inverted_target_size * source_size; + vec2 end = t_end * inverted_target_size * source_size; + + // Compute the top-left and bottom-right corners of the pixel box. + ivec2 f_beg = ivec2(beg); + ivec2 f_end = ivec2(end); + + // Compute how much of the start and end pixels are covered horizontally & vertically. + float area_w = 1.0 - fract(beg.x); + float area_n = 1.0 - fract(beg.y); + float area_e = fract(end.x); + float area_s = fract(end.y); + + // Compute the areas of the corner pixels in the pixel box. + float area_nw = area_n * area_w; + float area_ne = area_n * area_e; + float area_sw = area_s * area_w; + float area_se = area_s * area_e; + + // Initialize the color accumulator. + vec4 avg_color = vec4(0.0, 0.0, 0.0, 0.0); + + // Accumulate corner pixels. + avg_color += area_nw * texelFetch(Source, ivec2(f_beg.x, f_beg.y), 0); + avg_color += area_ne * texelFetch(Source, ivec2(f_end.x, f_beg.y), 0); + avg_color += area_sw * texelFetch(Source, ivec2(f_beg.x, f_end.y), 0); + avg_color += area_se * texelFetch(Source, ivec2(f_end.x, f_end.y), 0); + + // Determine the size of the pixel box. + int x_range = int(f_end.x - f_beg.x - 0.5); + int y_range = int(f_end.y - f_beg.y - 0.5); + + // Accumulate top and bottom edge pixels. + for (int x = f_beg.x + 1; x <= f_beg.x + x_range; ++x) + { + avg_color += area_n * texelFetch(Source, ivec2(x, f_beg.y), 0); + avg_color += area_s * texelFetch(Source, ivec2(x, f_end.y), 0); + } + + // Accumulate left and right edge pixels and all the pixels in between. + for (int y = f_beg.y + 1; y <= f_beg.y + y_range; ++y) + { + avg_color += area_w * texelFetch(Source, ivec2(f_beg.x, y), 0); + avg_color += area_e * texelFetch(Source, ivec2(f_end.x, y), 0); + + for (int x = f_beg.x + 1; x <= f_beg.x + x_range; ++x) + { + avg_color += texelFetch(Source, ivec2(x, y), 0); + } + } + + // Compute the area of the pixel box that was sampled. + float area_corners = area_nw + area_ne + area_sw + area_se; + float area_edges = float(x_range) * (area_n + area_s) + float(y_range) * (area_w + area_e); + float area_center = float(x_range) * float(y_range); + + // Return the normalized average color. + return avg_color / (area_corners + area_edges + area_center); +} + +float insideBox(vec2 v, vec2 bLeft, vec2 tRight) { + vec2 s = step(bLeft, v) - step(tRight, v); + return s.x * s.y; +} + +vec2 translateDest(vec2 pos) { + vec2 translatedPos = vec2(pos.x, pos.y); + translatedPos.x = dstX1 < dstX0 ? dstX1 - translatedPos.x : translatedPos.x; + translatedPos.y = dstY0 < dstY1 ? dstY1 + dstY0 - translatedPos.y - 1 : translatedPos.y; + return translatedPos; +} + +void main() +{ + vec2 bLeft = vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1); + vec2 tRight = vec2(dstX1 > dstX0 ? dstX1 : dstX0, dstY1 > dstY0 ? dstY1 : dstY0); + ivec2 loc = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y); + if (insideBox(loc, bLeft, tRight) == 0) { + imageStore(imgOutput, loc, vec4(0, 0, 0, 1)); + return; + } + + vec4 outColor = AreaSampling(loc); + imageStore(imgOutput, ivec2(translateDest(loc)), vec4(outColor.rgb, 1)); +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/AreaScaling.spv new file mode 100644 index 0000000000000000000000000000000000000000..7d097280f0c781e90318832e0a3cc8463a8beb08 GIT binary patch literal 12428 zcmaKx34C2uwZ?DTq%Ebi6lf_FN<*2c$UImXT84sF3e=&fn5M}kMAM`uX`!f?6tOrI zMMXtLsY5{!P(ej}PjQ~-c|t@{oW+4j!T0~)oR7CTzxR6w7W@0Yz1G@m?X%Ch_l8Lm z7fxw3CN#Eh?9f;=vC+C_H724o&?dLWWy@DCKV*2M=a3_hI#h=ljY%y(eP-g*&e(>n z?dltF;!!4cL|2=I?O2)qllaGFZiciqrZtu=?_9QG#j@q6c6P5@*Eu}0vA@~b-#5_g z?CBe7c8~OJXzDk$@ar4s?qApPSA4XwDgRv&AYH_sobxV?>>)7N_ef@{u^ z4|NR;_jipnPiYR1REwHtHgoF+hw*D~%moh*t{>`db`JMl+-zyPq4o5wZBokMz;L6r zt7=}uL*1*~cz0UOeN;QF(A-nBb4x$(l3MBKKJ^TbJRzU=Pp$NuJgz+<9|uLhRz7>0 z2luk4Co_(84fQrh($mz&612X74b7pEW>051*@|gvoMuetnr3gs`&zs?(9`0R8vTsU zy4Dcb8SnMQ;PZ>Pm$i7}+8U$qF7AEjh1S;A*c{I3ly3>=Y{{<-XW!*lg%4cV;#WV; zy;p6GYr==e=eIu2$M5Af`G(%k?!o>+>o(>l#+9vmKF;rgU)0HA=^f8`AAIBEd{X0n z=P`Sq%MIUH;p4&M`sBtcaI4Sm!J&cX5Y_A7-dG*I+0)x>_0--Nh}PX47-XZk&U<|j2Cf%gw~lVfV@x$o`oT-HChrmO#qfenM* zT_c?4Gft^;P34fT>l*4>Tk-8`eETQx_QuZezO}t8){m@PKmN?|?4S*`YekcHphQcn-)O7GS&LZ|!pD7Qo#nHO=|$7@c>2I{5E2Mm7C9n3=)oyga6# zdu9%``9xWde*d4j{_gl#i~fr%?jEYmhPw|7unE|~72mVs#?J@q>pGe50DFb+2X;Tg z4+I+%?mO3eAASVb{=<(2+grGwLs`#pDb{F=p16oC`kjoaX}hq}az<-ClhJu&+-u(h z_Gc~5Z)9{{-QLGqI{)EvZTY4a`(J)Jd;(_t`xuS)e(CQz*I{0Zm{GqTQ!8yZW^O>! z-(2MtK93!|kqP}=>jRaOn)gla+4ge&7Dj8aztnOsnCnD){}+kJAl?`K{|)EgmTo?| z&w`rsk5zaKZoRYEwEwGK_tJZ3%uW(6ea$KN`SHqr%z>ACv6MOMTZnG{oEz6Y2+jA! zB4(`LXHQM@J{-;c2e#H{XvIE+;@NV2pAFx~?u*>>@VOfQPQmr}eO>zBQ{#UvxaY_B zdHjv{on3O@*(LX#9q#$@{Tyz*@8*&(sQ41P^4a?o>-oI;%p6Zo?%ADa>hbw><>PY} zpU~1vmJF4^ZOR#b3?$@swz1%P7 zeue!OvtMy?-wUp<{~I)Q>%E6jZolR<-aLuF4{Ur{+waiS6E830&6EB5J=isp?+@Vm z{{In8JwAV`d=|5A^86W1J^S$&uyN|{$KM#e+z;pO$Nq`gk2tyi4z92N4>WcA@qHt= zA9EUSp2R-@HomOwK{WNm%gcE4q>qQdu95TeFxWj(H{SQ5n(IBr_#e#AiA}6&jxn3$ z9i{$KxSu~0;A)AP2sY+&k{A94&Al!351+J!V z>m6MAIyVjMbKsosW4U>pn+~=|=X^KIk7LI7>JDIk@0!mz=Wnad{|vD6*D>$?-VyBO z{dR69rsn+;yPltmyI_7MrjOZR`{-ugIk}&ak2WT>V$BiX-N0qfyTjG2-OmmAan$Gc zjy=HB8P%Qlb4D$7>A@rt&8rRf{(fJ)Q#EUIy>10lxi@BC6U_tPG&%X#tf=wx3htVHHQrz2 z{vOVHe&6u7e98SSAMSdiHNLsVx74`5<;!?~%a{D>8uzz+Ilr~wexKP^aKDrKyFLE$ z`)b_Z?dAMjZg$C!FSz}!s`1q|K2Y(zANL~0_oKgIJe!`i1(R>i+T3NSdOfmYb^Qp2b*s{Ecp%qtHPXcF6 z=kwk^1Z+Ldx1M)KwXAm-*!9ZK{lnqv@j0^cDL-3}LQ_voM}v)1Pff>wQn-=_47hrHo>uvkd;N4Y^?cua2G}@tKR?c9^z!q= zxwEimV!n^W$$bvEzWz!yb?aZjDEIT;oW`3c@y`MqU)J_)xO(E{WxRQEZ=M5ojpSPe zHs4|_KIekf9@qDmx>h4D;qnG>P+zYTZnEi;8do|eHi?OWV1zwHW z&v}e;`!T2S=1F`v*!Z%x9=Lkq8ivQ`u98aaRGgWV%_&#z~$7xNrB=lNB;0Q3A# z#hiDo(-_swtaH=ws22a1fL%A=EnW&%)AwRVd3-Mc z8z0}7gVpqX8Kc~LeJ10jn0Ys0#!qH^1*V?&%4J~h%QS3!Zv6YnE8$Bq=gmFJsFvKX z0-O7CEV*9|R!{ENfRo#~yx(36Hn;QU-pr_$+*g3@v8?-baP{PVJvh0YGtU-A_2hX2 z*gR#8Z-lES&zr!>UIZw6aq_*=lEm}kg6eJiGqy75<4TKwMzFXP`1x7NhJ z1FVm_@mDjdCH|dYYYKlCcoa+gwP1bJjlYKRS}gwW2AA>gftTyQ7p{-G@mm?y#M>CJ z$CB&)VEdEb$aph$6Xu@Z!uUbVGn;$22lzvk9-j|aKDjUY+={8^_t0%%juV; zV6NB6+hRM$k5-z#6R{5L3hHp~W0>>SYprUxWA-yv`Fy<6^i8j7>Gck<_aeQ10$ld` zUvQtfve*BHtEbmbg58g@*H6LK)9amJnu--fH{n_kt@>vzDO@$~v#aM|nk;P$bp^6`9rA5%}SKLFQz z{UMrqdi@dDIQ8`UW3cO`*PnpZ^sV>$Q#9wTH@*H0T=x2NxSGD{RV{V?0$lIqZZ!4y z{1RNhxA&l_Z{k_8|6gHuW9t45Vlv~eG4;IHe^Y7qGWz*s+;1^;YrCuBS@(DF`X2rs zO+7w;09#vmrv3<5Pi=n!8>gPy{tQ-6ZGQo)m9_m9u5NAjG0L;<-@x@f`8%5WXtgK* z0IThZnd5#&HNQLklhN;9jt{UG54Je};d1_AM&}*9x9-&=m^o6zzrghx{*9)dm`B0x zYifE7tdIJw)VZATKbX0l^ZTXz0c^aci7jkC@2MY#n;GOcO5r{mS#Kh|tZ@?D8q53B zhNhn9pdDyfKYdMymwoL3FZ-GSub<7CXzJ-}N3e0~>1!6)-05p4u$sR0=X+-~=dCS$?E)_Q znhjUeH+`vjh906H&yb_>o}oF+*+y{JvAK+wvU{TMPPl@b3UE~ zHh0d)Az(Fq>*wRiXwF-kYk2Mt1()aJFu0n&IUj22>u~Uvs%HB-0$%oYB;0de_H`7R zdipvVT=sPgTp#uHbu8H2>FYSKn!feEo`UAQwWY7c;Igl$!qxOmUuwA@CxGks<3xS2 zoQadbYB{I=ZB5N{xP;L&;OIF$nei0N%lT8QdB^O-X<{rjECbhTI2}zrF=v3w_w&=> z`l#oeJ{@ds=RBwKrIhXC2cx&O) z3D-~EeOkq+roZ>83+%bb_mwqpwdCst=X2P)dfmn(@9DN5G!b^syf9o`-J$TT|-25Uh`Sd@cf;H$EG|`l;vty<7}d zFU`NLTEk1wtwW#3IDhi|KN#=fPMFsWEdLK?W`TDs@GP)re`m)0znDwNHwA9JjxUEB z6Fv$z-*jf3AA|F6>u;a>Iv?LnVAlx06kf)@0?xm!Sbd$(-}f#9n_D06g>|VdVIR&U z=9OTze5Tm@X1IEMF0Xv@y-}Z6p{eJdzZz_uy89{5e%jw0%xf1c`#BrzzU|7GecM8u z{&MSkYmMJf@ziz&+&pFduR~Lh&+99nvi>X4)cyQ-y*Gf3Q+J=NL(LrXnSUdA9^;{y z_rX1Q6PkK_uBv>FD16?Grk=kgzXfcZx_?W48>5$>CC*)qU5mLd;^clixW4{7(A3lC zHDK#5_u`#!_58Ma7uYy;`*eTRdL>$-k&ZUxtKz8g(F@$xd>JlUi7fL$Z`-V07X zhZay@+oWlAe#CZ=f$(}A+T}k#(9=M47T_39Nr37&;7X#Y^-{;kAT&elEXdo z8BlZ2?aLnQCF^_)TwmvQH1)*G%Xsr-zdsIkjpVxnoP5U1_0ivF=M&Yu&%M~^{(k@k C(9DGZ literal 0 HcmV?d00001 diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.glsl b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.glsl new file mode 100644 index 000000000..5eb74b3d1 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.glsl @@ -0,0 +1,3945 @@ +// Scaling + +#version 430 core +layout (local_size_x = 64) in; +layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; +layout( binding = 1, set = 2) uniform sampler2D Source; +layout( binding = 2 ) uniform dimensions{ + float srcX0; + float srcX1; + float srcY0; + float srcY1; + float dstX0; + float dstX1; + float dstY0; + float dstY1; + float scaleX; + float scaleY; +}; + +#define A_GPU 1 +#define A_GLSL 1 +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). +// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} + A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. + // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);} + AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);} + AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);} + AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). + AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 + #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));} + AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));} + AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));} + AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));} +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define A_INFP_H AH1_AW1((uint16_t)0x7c00u) + #define A_INFN_H AH1_AW1((uint16_t)0xfc00u) + #else + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) + #endif + +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));} + AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));} + AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));} + AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [FIS] FLOAT INTEGER SORTABLE +//------------------------------------------------------------------------------------------------------------------------------ +// Float to integer sortable. +// - If sign bit=0, flip the sign bit (positives). +// - If sign bit=1, flip all bits (negatives). +// Integer sortable to float. +// - If sign bit=1, flip the sign bit (positives). +// - If sign bit=0, flip all bits (negatives). +// Has nice side effects. +// - Larger integers are more positive values. +// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +// Burns 3 ops for conversion {shift,or,xor}. +//============================================================================================================================== + AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} + AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value). + AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} + AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));} + AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));} +//------------------------------------------------------------------------------------------------------------------------------ + AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [PERM] V_PERM_B32 +//------------------------------------------------------------------------------------------------------------------------------ +// Support for V_PERM_B32 started in the 3rd generation of GCN. +//------------------------------------------------------------------------------------------------------------------------------ +// yyyyxxxx - The 'i' input. +// 76543210 +// ======== +// HGFEDCBA - Naming on permutation. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure compiler optimizes this. +//============================================================================================================================== + #ifdef A_HALF + AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);} + AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);} + AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);} + AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);} + AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);} + AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BUC] BYTE UNSIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation. +// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// OPCODE NOTES +// ============ +// GCN does not do UNORM or SNORM for bytes in opcodes. +// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float. +// - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer). +// V_PERM_B32 does byte packing with ability to zero fill bytes as well. +// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. +// ==== ===== +// 0 : 0 +// 1 : 1 +// ... +// 255 : 255 +// : 256 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : 0 +// 1 : 1/512 +// 2 : 1/256 +// ... +// 64 : 1/8 +// 128 : 1/4 +// 255 : 255/512 +// : 1/2 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES +// ============================================ +// r=ABuc0FromU1(i) +// V_CVT_F32_UBYTE0 r,i +// -------------------------------------------- +// r=ABuc0ToU1(d,i) +// V_CVT_PKACCUM_U8_F32 r,i,0,d +// -------------------------------------------- +// d=ABuc0FromU2(i) +// Where 'k0' is an SGPR with 0x0E0A +// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits +// V_PERM_B32 d,i.x,i.y,k0 +// V_PK_FMA_F16 d,d,k1.x,0 +// -------------------------------------------- +// r=ABuc0ToU2(d,i) +// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +// Where 'k1' is an SGPR with 0x???? +// Where 'k2' is an SGPR with 0x???? +// V_PK_FMA_F16 i,i,k0.x,0 +// V_PERM_B32 r.x,i,i,k1 +// V_PERM_B32 r.y,i,i,k2 +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BUC_32 (255.0) + #define A_BUC_16 (255.0/512.0) +//============================================================================================================================== + #if 1 + // Designed to be one V_CVT_PKACCUM_U8_F32. + // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32. + AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));} + AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));} + AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));} + AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed to be one V_CVT_F32_UBYTE*. + AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);} + AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);} + AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);} + AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 3 ops to do SOA to AOS and conversion. + AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 2 ops to do both AOS to SOA, and conversion. + AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);} + AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);} + AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);} + AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BSC] BYTE SIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Similar to [BUC]. +// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// ENCODING (without zero-based encoding) +// ======== +// 0 = unused (can be used to mean something else) +// 1 = lowest value +// 128 = exact zero center (zero based encoding +// 255 = highest value +//------------------------------------------------------------------------------------------------------------------------------ +// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero). +// This is useful if there is a desire for cleared values to decode as zero. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : -127/512 (unused) +// 1 : -126/512 +// 2 : -125/512 +// ... +// 128 : 0 +// ... +// 255 : 127/512 +// : 1/4 (just outside the encoding range) +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BSC_32 (127.0) + #define A_BSC_16 (127.0/512.0) +//============================================================================================================================== + #if 1 + AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));} + AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));} + AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));} + AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;} + AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;} + AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;} + AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;} + AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;} + AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;} + AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;} + AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;} + AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;} + AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). +//============================================================================================================================== + #ifdef A_HALF + // Minimize squared error across full positive range, 2 ops. + // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. + AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} + AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} + AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));} + AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));} +//------------------------------------------------------------------------------------------------------------------------------ + // Lower precision estimation, 1 op. + // Minimize squared error across {smallest normal to 16384.0}. + AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} + AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} + AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));} + AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));} +//------------------------------------------------------------------------------------------------------------------------------ + // Medium precision estimation, one Newton Raphson iteration, 3 ops. + AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} + AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} + AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));} + AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // Minimize squared error across {smallest normal to 16384.0}, 2 ops. + AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));} + AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));} + AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));} + AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// FLOAT APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN", +// - Idea dates back to SGI, then to Quake 3, etc. +// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +// - sqrt(x)=rsqrt(x)*x +// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x +// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +//------------------------------------------------------------------------------------------------------------------------------ +// These below are from perhaps less complete searching for optimal. +// Used FP16 normal range for testing with +4096 32-bit step size for sampling error. +// So these match up well with the half approximations. +//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));} + AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));} + AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));} + AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));} + AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));} + AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));} + AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));} + AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));} + AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));} + AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PQ APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// PQ is very close to x^(1/8). The functions below Use the fast float approximation method to do +// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%. +//============================================================================================================================== +// Helpers + AF1 Quart(AF1 a) { a = a * a; return a * a;} + AF1 Oct(AF1 a) { a = a * a; a = a * a; return a * a; } + AF2 Quart(AF2 a) { a = a * a; return a * a; } + AF2 Oct(AF2 a) { a = a * a; a = a * a; return a * a; } + AF3 Quart(AF3 a) { a = a * a; return a * a; } + AF3 Oct(AF3 a) { a = a * a; a = a * a; return a * a; } + AF4 Quart(AF4 a) { a = a * a; return a * a; } + AF4 Oct(AF4 a) { a = a * a; a = a * a; return a * a; } + //------------------------------------------------------------------------------------------------------------------------------ + AF1 APrxPQToGamma2(AF1 a) { return Quart(a); } + AF1 APrxPQToLinear(AF1 a) { return Oct(a); } + AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); } + AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); } + AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); } + AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxPQToGamma2(AF2 a) { return Quart(a); } + AF2 APrxPQToLinear(AF2 a) { return Oct(a); } + AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); } + AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); } + AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); } + AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxPQToGamma2(AF3 a) { return Quart(a); } + AF3 APrxPQToLinear(AF3 a) { return Oct(a); } + AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); } + AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); } + AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); } + AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxPQToGamma2(AF4 a) { return Quart(a); } + AF4 APrxPQToLinear(AF4 a) { return Oct(a); } + AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); } + AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); } + AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); } + AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +//============================================================================================================================== + #if 1 + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to 1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. + AF2 APSinF2(AF2 x){return x*abs(x)-x;} + AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT + AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);} + AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // For a packed {sin,cos} pair, + // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). + // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). + AH1 APSinH1(AH1 x){return x*abs(x)-x;} + AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA + AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} + AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND + AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [ZOL] ZERO ONE LOGIC +//------------------------------------------------------------------------------------------------------------------------------ +// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit. +//------------------------------------------------------------------------------------------------------------------------------ +// 0 := false +// 1 := true +//------------------------------------------------------------------------------------------------------------------------------ +// AndNot(x,y) -> !(x&y) .... One op. +// AndOr(x,y,z) -> (x&y)|z ... One op. +// GtZero(x) -> x>0.0 ..... One op. +// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss. +// Signed(x) -> x<0.0 ..... One op. +// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer. +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMIZATION NOTES +// ================== +// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'. +// For example 'a.xy*k.xx+k.yy'. +//============================================================================================================================== + #if 1 + AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);} + AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);} + AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);} + AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolNotU1(AU1 x){return x^AU1_(1);} + AU2 AZolNotU2(AU2 x){return x^AU2_(1);} + AU3 AZolNotU3(AU3 x){return x^AU3_(1);} + AU4 AZolNotU4(AU4 x){return x^AU4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);} + AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);} + AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);} + AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);} +//============================================================================================================================== + AU1 AZolF1ToU1(AF1 x){return AU1(x);} + AU2 AZolF2ToU2(AF2 x){return AU2(x);} + AU3 AZolF3ToU3(AF3 x){return AU3(x);} + AU4 AZolF4ToU4(AF4 x){return AU4(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled). + AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} + AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);} + AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);} + AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolU1ToF1(AU1 x){return AF1(x);} + AF2 AZolU2ToF2(AU2 x){return AF2(x);} + AF3 AZolU3ToF3(AU3 x){return AF3(x);} + AF4 AZolU4ToF4(AU4 x){return AF4(x);} +//============================================================================================================================== + AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} + AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);} + AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);} + AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} + AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} + AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} + AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} + AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);} + AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);} + AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} + AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));} + AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));} + AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} + AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;} + AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;} + AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} + AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);} + AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);} + AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} + AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;} + AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;} + AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} + AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));} + AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));} + AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} + AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} + AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} + AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);} + AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);} + AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);} + AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolNotW1(AW1 x){return x^AW1_(1);} + AW2 AZolNotW2(AW2 x){return x^AW2_(1);} + AW3 AZolNotW3(AW3 x){return x^AW3_(1);} + AW4 AZolNotW4(AW4 x){return x^AW4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);} + AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);} + AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);} + AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);} +//============================================================================================================================== + // Uses denormal trick. + AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} + AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));} + AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));} + AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + // AMD arch lacks a packed conversion opcode. + AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} + AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));} + AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} + AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} +//============================================================================================================================== + AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} + AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} + AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} + AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} + AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} + AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} + AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} + AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} + AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} + AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} + AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} + AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} + AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} + AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} + AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;} + AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} + AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} + AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} + AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} + AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;} + AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;} + AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} + AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} + AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} + AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COLOR CONVERSIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). +// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. +// These are branch free implementations. +// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. +//------------------------------------------------------------------------------------------------------------------------------ +// TRANSFER FUNCTIONS +// ================== +// 709 ..... Rec709 used for some HDTVs +// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native +// Pq ...... PQ native for HDR10 +// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type +// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) +// Three ... Gamma 3.0, less fast, but good for HDR. +//------------------------------------------------------------------------------------------------------------------------------ +// KEEPING TO SPEC +// =============== +// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +// Also there is a slight step in the transition regions. +// Precision of the coefficients in the spec being the likely cause. +// Main usage case of the sRGB code is to do the linear->sRGB converstion in a compute shader before store. +// This is to work around lack of hardware (typically only ROP does the conversion for free). +// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free). +// So this header keeps with the spec. +// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear. +// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear. +//------------------------------------------------------------------------------------------------------------------------------ +// FOR PQ +// ====== +// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. +// All constants are only specified to FP32 precision. +// External PQ source reference, +// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl +//------------------------------------------------------------------------------------------------------------------------------ +// PACKED VERSIONS +// =============== +// These are the A*H2() functions. +// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. +// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. +// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). +//------------------------------------------------------------------------------------------------------------------------------ +// NOTES +// ===== +// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. +//============================================================================================================================== + #if 1 + AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). + AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} + AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} + AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); + return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} + AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302)); + return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} + AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302)); + return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToTwoF1(AF1 c){return sqrt(c);} + AF2 AToTwoF2(AF2 c){return sqrt(c);} + AF3 AToTwoF3(AF3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} + AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));} + AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));} + #endif +//============================================================================================================================== + #if 1 + // Unfortunately median won't work here. + AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} + AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} + AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); + return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} + AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); + return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} + AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); + return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} +//------------------------------------------------------------------------------------------------------------------------------ + // Unfortunately median won't work here. + AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromTwoF1(AF1 c){return c*c;} + AF2 AFromTwoF2(AF2 c){return c*c;} + AF3 AFromTwoF3(AF3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromThreeF1(AF1 c){return c*c*c;} + AF2 AFromThreeF2(AF2 c){return c*c*c;} + AF3 AFromThreeF3(AF3 c){return c*c*c;} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} + AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} + AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToTwoH1(AH1 c){return sqrt(c);} + AH2 AToTwoH2(AH2 c){return sqrt(c);} + AH3 AToTwoH3(AH3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));} + AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));} + AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} + AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} + AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromTwoH1(AH1 c){return c*c;} + AH2 AFromTwoH2(AH2 c){return c*c;} + AH3 AFromTwoH3(AH3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromThreeH1(AH1 c){return c*c*c;} + AH2 AFromThreeH2(AH2 c){return c*c*c;} + AH3 AFromThreeH3(AH3 c){return c*c*c;} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +//============================================================================================================================== + #ifdef A_HALF + AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} + AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} + #endif +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. +//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + #define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//============================================================================================================================== + #define AAbsD1(a) abs(AD1(a)) + #define AAbsF1(a) abs(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ACosD1(a) cos(AD1(a)) + #define ACosF1(a) cos(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ADotD2(a,b) dot(AD2(a),AD2(b)) + #define ADotD3(a,b) dot(AD3(a),AD3(b)) + #define ADotD4(a,b) dot(AD4(a),AD4(b)) + #define ADotF2(a,b) dot(AF2(a),AF2(b)) + #define ADotF3(a,b) dot(AF3(a),AF3(b)) + #define ADotF4(a,b) dot(AF4(a),AF4(b)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AExp2D1(a) exp2(AD1(a)) + #define AExp2F1(a) exp2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AFloorD1(a) floor(AD1(a)) + #define AFloorF1(a) floor(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ALog2D1(a) log2(AD1(a)) + #define ALog2F1(a) log2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMaxD1(a,b) max(a,b) + #define AMaxF1(a,b) max(a,b) + #define AMaxL1(a,b) max(a,b) + #define AMaxU1(a,b) max(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMinD1(a,b) min(a,b) + #define AMinF1(a,b) min(a,b) + #define AMinL1(a,b) min(a,b) + #define AMinU1(a,b) min(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASinD1(a) sin(AD1(a)) + #define ASinF1(a) sin(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASqrtD1(a) sqrt(AD1(a)) + #define ASqrtF1(a) sqrt(AF1(a)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + #define APowD1(a,b) pow(AD1(a),AF1(b)) + #define APowF1(a,b) pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif + +#define FSR_EASU_F 1 +AU4 con0, con1, con2, con3; +float srcW, srcH, dstW, dstH; +vec2 bLeft, tRight; + +AF2 translate(AF2 pos) { + return AF2(pos.x * scaleX, pos.y * scaleY); +} + +void setBounds(vec2 bottomLeft, vec2 topRight) { + bLeft = bottomLeft; + tRight = topRight; +} + +AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(Source, translate(p), 0); return res; } +AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(Source, translate(p), 1); return res; } +AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(Source, translate(p), 2); return res; } + +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropiate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scalar uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanzcos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported) +// Each channel needs to be in the range[0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI. +//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. +// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. + con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +//If the an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. + v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compiler time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. + AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). + AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation)in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon need to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. +// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. + sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix.x = left 8x8 tile + // pix.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. + AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). +//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). + void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. +//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this compddation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif + + +float insideBox(vec2 v) { + vec2 s = step(bLeft, v) - step(tRight, v); + return s.x * s.y; +} + +AF2 translateDest(AF2 pos) { + AF2 translatedPos = AF2(pos.x, pos.y); + translatedPos.x = dstX1 < dstX0 ? dstX1 - translatedPos.x : translatedPos.x; + translatedPos.y = dstY0 < dstY1 ? dstY1 + dstY0 - translatedPos.y - 1 : translatedPos.y; + return translatedPos; +} + +void CurrFilter(AU2 pos) +{ + if((insideBox(vec2(pos.x, pos.y))) == 0) { + imageStore(imgOutput, ASU2(pos.x, pos.y), AF4(0,0,0,1)); + return; + } + AF3 c; + FsrEasuF(c, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3); + imageStore(imgOutput, ASU2(translateDest(pos)), AF4(c, 1)); +} + +void main() { + srcW = abs(srcX1 - srcX0); + srcH = abs(srcY1 - srcY0); + dstW = abs(dstX1 - dstX0); + dstH = abs(dstY1 - dstY0); + + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + + setBounds(vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1), + vec2(dstX1 > dstX0 ? dstX1 : dstX0, dstY1 > dstY0 ? dstY1 : dstY0)); + + // Upscaling + FsrEasuCon(con0, con1, con2, con3, + srcW, srcH, // Viewport size (top left aligned) in the input image which is to be scaled. + srcW, srcH, // The size of the input image. + dstW, dstH); // The output resolution. + + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrScaling.spv new file mode 100644 index 0000000000000000000000000000000000000000..c15b72ec6c278e354720e2518f556e6328973ab3 GIT binary patch literal 44672 zcma)_1-Mq@g7jIGXClw(+TYFxM02HJNVFut_Ao*j9+w>H+^Z|uZvM*oz=92=o`tabNR@1K0e z9n>V!r`J@UZmrGr=|6JJx)axJubB3}Q|;Yb+knT99WZL#hymk=Fk`Dzgg#rNj~VS8 zPuJQ3JZ{MN-lHds8a%G;6Tfb)?Xe9UJ%0S?k=qOzI^NiBt$yYlJ*NNgVf&+Wtn1FS zeaDU6Y{0k){reXE?1Vn1@!9PspUq33T??N*fAZ;F`s^{)XV%sMKlyArdepK9PgryP z^5DX${^9Snwjb@F(WBbevD|rvdrGzK13UQY;e$H3rvvusTZ3tbj~X*!{4T?X95QC~ z*zsGA>NkAikP+h;nD!z6FZ=G=r)v$3eaGRGhIERZt~Km`*_ychv5lTEo{l>QnsfO7 zwCzrRv$Q5)Yu03^0b`uaHJlk2H%G@rX~5U>VkGT=O*^nRv;i!R`e<;+HQlAXo`Z*v zZR-c2j~FtljmL^d4{T#E_(3Daw6Q0NbLrjdc-p3>9fyqXSmCCofvc@Q_22)sfY#u( zmwpcer{BZDV>rq5IRZRz_>OISWE=0)#z(dB&TTxojdy9|qroG%pp2a&=AsiHBW~v5 zSaF{Y{W$Sv;+a}!f}62)Ozy!GnAz%V#K(Lm(~cQFv5ijw4{z7owRI}EJSR`<;8V}Z z(>wUobMlN1-kg)&Tj#>LR}LRMr1$8F#dUWM`oSEYBNxz4y$AFeGH!g~cRsd``#|^B zCE!gbj2+u|_=xdC#&TV8v#|Dyr}}hly&@hkc7W%J*U+04f4kxzRQ%J5e^v2+SNw;H z|5$O~RLgp%t$3G;cdK~!icjB(cWccKA3S{I5T+FJX* z(xo-01@m$p7Y7gQxW%{mlJN1(9o}92gwyv!$E8ef-yH>o7t-i`gk%Kj02 z_yI$QB>%^i{gcL?{GV3#FB*IACjXa}{cHGvBlaJb{NGge?;3mZe_z>uXzab4{6ALq zX}EDunlyS;@=psd=cij^PyX(eeTK%K{4-YeS>VpUpV^(?*(&>76<+}Ee8zvl%D!mD zmx4P#@n5F0uTb&T;Ep%`YgG1iI`J;84dDZE_8i*;JgC8&f`>P_FZh54ZwVgS;BCMT z#Mte@2R67rctnGD2JheC-M|h+|2@H@8@vzLg4G9rJ3cRVZ4Ctfv{qeOgW(;YAG@}O zbh34A4X^lt6(3pg(G?$4@q;Qpw&LR|KB3}=RQ%A2A6D_n6`xY^V=I1q#ZRdCi4{Mo z;wM-9)QX>8@iQxacEvBK_(c`Jq~e!W{IZH)Uhyj`er3h4uK2YTzrNx(R{YkA-(K-M zDt>3h@2dFS6~CwA_g4J=ia%KKhb#VQ#h-+8-K@@Si0i0p>zNMjZLfV@y#Svyc68hJ zQWg73#ows-2NnOM;$K$$U+{6`#|{}kX#aM9Gx0(Br|YxhzKjU_7txzF{@$V}B?~2dB5548}JyXSJsralF zpS|L9R($S?&s*^&E53Bam#z4662Rq<^q zzDLFPs`x$?-?!oeDn6*0g&#w5n6+f@y z7gYSBieFOkODld;#c!$jZ56-0;&)X1&Whh%@p~)&NW~xT#JjYfg7X-R_u}rYSHV-C zD-Q0*ZxcE`FU-k9s-LdmZmk92!$%I=zWoTh`3$iX+MvdN9dYqoe%%i4^w_{51V)>W1GJH`@&%(_T6SQ&+4`ubui6%9yD&*C%MMq`_tIEw|}>Y|E{CQ z9=Q3~(G$k#>1VVDJNb2OJyh{WD*kB2AFKG|6@Q}QPgeY?ia%5FXDj}E#b2)Y8x{Xc z#Xm0i7R-RpX?-ox(X^r|tMN87RpZC{w&pX^`0+8bt<79$#uz($p>fK$wYdw8Qrg=5 zg*IKIEmUasXTHS>&1bS|OLA<^&#aEa`ZlLxpU*B1_7&a(9J|k9wU=Bwa@*hQPP=0> zUasAj+}~Bt1YWS={@z;tvxa}#@CD#=(Tn}`Ae;Hb^MY_`E!IA|@|%WM`%vpae#a|T zpAYWb$~*FOTxgT$Fm6S-arUj>N)1-e;duJMjn8;C1FMLRt;9~!8yAft+DbB zt!ryXuw#VF3urIl>FSps`jlo?r%U(NiEZ!Wmkl>T!!HuJ~+Gqf2Rdk?U- znK{fS_jf#M;r@+s(s{O%6>RQ@Ua`RAV7DE2OP)_QvC`cL2H zvp=#NWAVHm#$qnt;Qhggd(lO1wbl*)nHOWLHMNc<;qOj5_u&<}rk(GJaL+C4jdl*2 z=U=JaiY8Z&-!n~YslDH5=8WHeo7htO8E;O;m9cA~IZvh5uhGib1Dn`VJHFA3Prp~9 z>7(X+cwWAN#=oijZ3yK}<)6b{ulRoj_gwQln-7CF(sg>dH0JQ!nqLQA`g(53eJ^Wk z@E&06FlI$Edk!ywW}P`4zqyt}H`GJEcEi1&np^JYQSIS=9xZv#iu-vqcJurBu;jZp zJmcR3?kQz{{r$Yizp32Mi*oP(V*p+Y`djZo@M*y6#>;&lSJOVe!N$by`?_}fS3eA% z^^v~@_r4PQC-6<+D^i91_&Gq$`=7_!tajE1+OF+iD4qlM={?!cxUR8$AzDAjhTEP{ z@$)KvO~vmhxbFde-c3HapLfH3&+zkZ$set_pLa{UpLfG;{k&UpKkt^@&$}h}^KQxg zyjyZV?}j@+e%>v)pLa{{=iQR~c~{hamT@$o(%pMB8O z)BnC;Sp4`L1_V2wcxeq{7kI#XPkN3a$j6hRQ|0BW1si*%@;PkKE{=Fxto7-!A0*BYV zpE=C!b$oE6>+kg@?}>jOY=?k1r&UjmL&4q~${ds6>c1w=oQH$eyk{it2(WS93)1(I zaP`C;1y(bzUsJF;x4yS;f=W-2?K~s;AF;CFSWBUzw_r~@l ztvt4;!HIhcERXHC;KV%xmdExScmRj}JxeQ(?RVh)8{6}=@}B&qg7v=u_WRP0&_2rX z7{}ur)@*IoU|;4m)|`G8{yo?j`&)-r9@~pxb7@?{W;Uf-j*}_nh$a3FS&lHi7rPI@EvQ6U(|B z{%ink{~ZfW`{#;vi1o9p&nCRKea2{h{Vr|vAJN6h;k|t}4qrYeOpaN>=J1(pa$Esc z55KbEx!$jWU%{bnKl1eBGrqYwe9g&`e&zz(&pfordo5T!{JM%?-|&8n%)B>n7^9yy z`&LWe9nV@2c+JO=zI~2t-#&*;UpIl(!*8zmEfv4D;r*C-`?-z79LDLZ-FnnhkLS#y z9KIIjupY0qML4WyFumjs)8Y1-tz3#@KwLZXMPUEy$vszj6@^D{e&C9@3|FQ*M4s6XU(58lm z!TN_k0yZZ6F|cu|@o}))Gpvs_KEYuQ;~wSEZk_6>({pMy4qq#ASlf!UD;Ic`0`t|+ zcAcx!T8Ht!;m|+)DX=l&PlJt{nY#3UhGQBI$D*(H*nSI+&Aj?rvwB(c+HhZG&HnN( z^{-jrwZP8pI<%?jd9eQBzXKZ+{sP#z)b)F?b$K4@t39?C!LgZFUu#xR&E3I0Iebme zVV%>_uFGL<>(TmLL+up~TknrzpEG%XG{>tPGjga~tJ-TE<~gzPc^#~ux^>9QIyQj& zD(hGuZXFxarj9qk))D@8!_)7d+qRB6^?idQb-o3*PIbp&Y&nj1;M&z^rV_{TE{F5z zHDGRYyu;xbw3TDn81Aba!$xq&unBFR5q$v8T#Dc4utu+s4>=a)P|ukD3O0||lsM_}XB^|9`cIn=FFT-Mne?yIb`7u-5GrA?imf~_&@=Zhj{Q@a0KoynfpA%w$He_3Pt0%CnAsM%sqM*H#>^ntqispBy!>oCFSeaIeC^1QK6e7!&n~oHJ4+L{3`fpo*JY)K+n2v5Tam-x zk)`jI!RE1VedXQDr+UV*K3L6h zh|6*8Nj_iYIQD=$j=g9zj*W;b$FWJnGmeen;zU%4LgIaA@A{WeeA+oDJ>(~!&9qRdg&j7G-+A`LGVD*f3uy*1G(VADTkJ#&{ z>|-e0KGZXgVPMxIb?y(>Pdz>dfU`zkgYt~^z&6HCn;av+=Fw&yBWcSzM!~H^J!2gW zHcnf{ItHwsu^t58kGOo+lIxQ?$AP_f*{AmcdFmYB#{3?PHu)!j)sz2V@Brd-?#lH^ z{)ymWO@8k!^5j3Xjrsi)ZSqe7t0(_q-~q(vyp-!>z22K%#+H0Xz?~!Y^nVmMX@qQ_f z|4D7k@1$r`)5&1-XiH6}fYnpeso(+hlXaHsV@;>gmNlIYwZ$1>aOUYkusr@3w=usL zqfJehfX$;VHT@i{o|-NNTT{*_xjxo(8Ett!T@JS<_0)7FIP-J`Se}}$YU5T*JvCho zHjlQ{bPZTNHC+p~ra`pUA=gKI9c?*J*TbzzJvH43&OF@!mZzp)v@y?0X;afpVDo58 zO*ezpQ`0SAYs&c~*Teg!sWO;_Y;M;#xt~T;kIyq; zbC*89MN?1iXTip)AKiRjd=6}%+LG&eu=8!5p4W2yV*eeu9_tHe>hbwKxUT0#H1(sJ z&$Tas)l%Odz~)g;o|nP(*k3_YkI$>%dR|{cQ%~;K!N#c{-HiQ@VEfdTTz>*P_FPMH z{harincqKyJ%_{Jfa`BR+TR4L>F3;OQ%nB0!I|5vc)HTDEs_3+&c3g3Y0xIhYA-oVJWXEq%-aF6UrYxbu`gXM^jbZoKzkHRI=?ui3%AcO>5&aD7ha z9M(Q3SReJo%mwy6%o_EX8?L6WeP~ll&UwJ*44)V5`(DO4A6!j8P@1@71`#O{( z&le{(*!Ppe3jOc~PozDfz(*GNr~*%Juz8Ly@RR}{Q{ZC@d>q*K-V4R?K=i}G{m&)}Z->e`(5GGnRX49|ROVe0ZeI1|T?%a8^8I9KH1*_N25hW) zuG3|~_NOg#pyqeCGCteP$#P)NW#?oCurKFCeR&Qw=R};^Rs`2$UI|UzK0W8k@vQ=P zeCo-uD!3lsYG~>i-|Apv)ib^|!1kpr<5P2dsns^)TNCW{>G;+L`*M8hYjLPKKC$_{ zKFZqHgc($Ps}}!$U~}es;5&f1Y^jdt zrG;OAu(9#mvGCKD&s94$`m)5EOF#Sdo~bSU?gCcR=KWKi_v&528xyadShd9N4ld_x z5BL=PGiQ6k^-(w8`>|T$_X4Yh?+u=xBWt)1SReJQ{k~w|56gSnerW1Pa}K$d1Hfv= zCvG4(an>T&FMSUJTTAA3FgWwCEq+75&R1$23f5QMyu)bKGKc$vH=ri<%%NKH4F{Vq z{D6ih|ABBd{SvPh{}JFaek8n{!%=X3)Dtrr?B|Hg*BH2(zV@L_E&gM{zW;}hgO}IF zc(^|5#veqh7XO36<+X7L+|M65PbR|kQ8#`9ty zu8kub-D|^K`q}THwAzwyGFVO9QMB@`>(OAZ4fVvTCH5F_IcLYh%P}7Z*GJv>DYR;d zKOU?Wege2$*AwCTsOKC%3G6vu-WN|sQ_otT0#-9Vai@Y4XDxF5()Vd#YstKx4$k~* zi{BYw=PT##nP7d@&3hKDTITR)_|}s*@XhBJC*+_V>D{7WlLRpI+cI zz`i%0MVn_`m(XuMfBNj}=8E4@@y9FvY{g$L_|h!sn*}%jTLsttZo##`S8(kg6kPj< z1=s#j!L@%}aP6NIT>Ga5*ZxJpwSQS~?OzvM`!@yG{$0Vfe_wFzKNMX1j|JC0O_$E| z?;em@5AEFwZhZHGYoDRu+T8=PjCT*na9j6)EV+9?hMUhlAj56l1G41q0U56UQUzCc z56IYU-2*aQfA@e4w{;K5lDh|F$=w68+jx?iP!Glk>UQ#?%t6lckjrOyLV*C z#}-`Oy(43{b??ZMyLV*C-8-`6?j0F!eUl5W?%t85{n&zA|M3-f@5uOTckjrOpHy&l z_l}I+*1aQ3?%t6lckjrOpIvb4ckjs3?%t8%+TA-c-1^)*vgGa^S#tM|Ecq1$SHH62 zS6BSnio17Y^4Z>4aO-pL$kOiKk>T3!D7f{zcVz5x_l_*NdqFIC+ABTM@qEAD=hrTx8%e^7Dvmn{9ithoD2 zmiC$WVJqD6xW8n$t@}%s-2Ej>?*5V`cYn!}_o(>d6?c!x(%(HMOTI$E{e0m*ld;SD zRNQ?gOMBmnyU%24A6jwunJn$@Gg)%?nGCmeugQ|T*JR1vZ!+9^rxaZM*n&I%$5-6_ zCKIpS{U*b0PpP>3O~!72?l)O-_nQp2b-&4Q^IuuHyLjK?l)O-_nQp2 zb-&4Q`***|lDpqz$!9OPy8BJWZtH%N;r8!-lOHyLj0 zev>74zsYd>cfZM!Z(MNoUIn**_nVBryidj5Z!&gU_nR!a`%Q-1zxz#=-2EoQZTBj; z{kh*{?DBmpKA_74zsZsxUT}5yn~dFd za>4E2{U&3VpILBq_nVB}{@rh~ehXO5XE*X&!S*qIV>8C*2V?=}x%M zIkHZ7!PPu`mFsjjns#5An|m6qoSS>m^wpNRxeu(Kb-EvHE$Y^0Ef0XpS{{U}dGN2( zIz5D--B)UPxY5d59zoMrTWWa}te$mx3~W8>&Y90@)G}vJfYnmxufaYK%A7q3SM%^y z&e?C!wEIe(Pc>Ru=hJBVYD=BZfYnpyZ^720Zk;|uQcIoBfy+9dhnIE!4zA|GzfSY{ z0)lp5sq^=ZR@V6|el%^;w)evG0R-BUatmcWKoU`&V#cKLpDY`#12O#Ht(n0j*kM{|@%N(DpH{ zJhA@(TeG_Mk7(5r`x!W~pMvFy{Tyt~>c)OTtCrZWz={15EKlq|!Pcy9>=(3ZiTxJr zIil?wT6tps1-52&?O)TXCHCLo#C{K!C-y&JYgRY*J6g5G{ui9sAHnj(;@+-V-Pj*! z)e_qU?D?Y2XN~g2b_H9rx^|yUswH+haAMofA>oOg9&F9(#&)AsOYBVG#QKap43s*z-l3&o6i3f=TS`@CEQu_a3eP9AIO#CC8j#&vkS7%(KifH{2ZR z$uSSu7;VWhFW7V5>(GAYgX^arZGN!*YqKw(zp5q2&%mBX+I+TJ*0B)WI@D9g!eC>x zCC4IQuK{!TJhsfS7~CA{$EVZFP{yoCCAcW&o6C0 z_buyK7H%Etsbe{?G1`)2d9c@pIeb=J=2#JK4)x?%32cnEiTa< zE06!SVEwiX|VE06!aVExPd`@z-q--lK% z9!UEJ^XUE6&*g*Qv(c(+A3&=n9zq*m*Jvo*?+&SJA55#}nwaYVu<_c4)5^16Bf$D= zJCIhMxKZFsntQ}Zuw1{u@YsZvNwG zGxsNevnD4N`biB=Kc~Pq;V}MWT6t`zfiu^qg5|ND0nS{X4wlDu7T9acoM+O?V><`D zePcVDR-Sb@7wkGrrghAY%`rI^$6)Q&Xf5_>ZgZI1bvO@fto=H#^7MN?*qqv&S9#(t z1ZN#C0Lv41F*xgR5m=tMpM$dwmw@HwzmzuXa9M#b2WL&LX!P`R6+Cl&C0HKYHQ-G- z%zZVjJhtn=Ij^q;%VWC%oVmUpERXFMVDC5PypdL(b+`%aIye`O*|9k$$Kn{Q-5RaM zKFw_obGr^VgN-$}^D0ljw}8#5&3Tn4?ly4N;a0FbalZs-9c~B96Zb1{*5MAY-28Xa zW*zP-@ZANz2ke@ppZmZ$ukQuRV|xIcxxODPkL@9F=K4XfJhn%`{W+}ZVOn`?kAe4W zY>(2)vks4gT?gmFF*`QL7vb7iiUz?^ST}y#kge-)rFB+iY!8Q6R| z*FOiV>1%#%YRUfvcyMF;k~TkUcn*FAcW%_pr}j^Prs0YI7OtjW z;?*)A-+?n9$?-j!dVK!iK@{++bt0CC5Bq&!N;bFI+$M%=dg?``2b)e!rpYYXP{mscZN94*vHsGWLZUyL$Rv z7+m(d2wXq)^t&k7+}g4i)M8-k(Pm$MH$zRlIBj`NEdjSKb?r;hF3FL)mTv6oscRW< zS=X{~{nS&}a$s|7Gj9)Cwd7b4tftNHYRJ>)N?^|eb?tt)LoMgxDqx=zl|Lu0il!c) z)xbUnDt%T*Q;*LYV4qo)K5L?>XK&KAz~(kqTVmG+`+TST*>fE<_4up{_W4ce(-TcS zKI?&f)>QhekEWg)HUOL3SZ#^j5bQjZ_m7Rx)Z?=;*m)>@HbGO5PcN|ZQ2O-N&XF26 z1)JMgZLUinTIWZ+8Lf4S`_k&~zH^zq5Ut6N7$7d_B^H}<9 zjiw%-ZNScB>9Z}Gdd9FF*xbfy^ZsOg+k?GtE!EVjpW|4WR$Kbm0bHMV{n6Cpvm>}Z z?{-2{kI&BF`n=l(O+B^l3O2W~+7i1PxIXW8M^lf_9^m@C+Y?PaK6`=d^KNf6_0+Hr z*xbfyOYFYj`n=l@O+7vX!1Z}I5KTQkgTVE9HyBMlH4Fip+gNR`%TQY9M?8$yI>r0b z>Ywv&ICwSIRDb=nr|$#6^?7$7ntFUjfa~*aB$|4BMuF?|ZZw*D#xMqKZez9OygLZ& zdFT70wd&_M`q63=kE6}AD9_dL@VRN#wU4D$6CX?)Uq2fi0{7mou6+Wnn)hHon;i=F z=e+!`dJ=phhkC|$82HWVx;h+9Jw8W({ry3C?H!4x9-pJY{;s3+nT)2Mc{&^L366k2)SM~(wKE^Wus%EiaiI-ZR6M6jPokp*0Jvv%J?|fTkXw3&E~u>2nd9dVDShyPl=bC1~nd z&!2^I-ZR6YH-%`DzH4ZYr$F1Yryj4zaE_R zybdgn?M86c^9HcozJEdMShAisfn85)w?=ET2KzFnvBsF!^}HFZZ`MS^4J~)XFVSQ%ai|caMtrNuspV3 zgR`DbfaUi6B&}n~dj1CNdRn_RTAMZ4mpP3!#=NfQQ(%3wp7QkhG}s(jPq{YN^S9t~ zJ)ea?!=avacn(~@w?2=i9-rTV>-W|d(A4AedvN{U`XZWo*7GH>xsBD9@x2Vrdj0_{ z&-h*iXFXp5%N^@$w2miZ{UbQ*`8rr0+n>Q%&p(0X$^Rxe>-h#)9^2dCtmj)`xqZJw z>sYd$?}A-VYqv&gvj+Pzr?JME*Y$i4tZ&v+o<9EqHb>S|uFduQ09>x;hw%3~)UytM z1=pV$|AwX>pO3)xXU30>D}4SAu0J#W15G{a`3czE#%jy>J_BbxKLyJ(zAwO8&(Fbf z$ND9$)$eCJ zo{V)iaMsiBYRhAr1Dy5rd)xBlp9`Gz^gG=0*yaIeJ^enn+`j!Tw`0kA&Ifiqt=$@} z%^K{>oW>esUe|Meu)bMOdHP%cY>up_T$}5;Ah^6f7lQlW{#4I8EDWxH4p;z@M_LsO4W4{-f+z~X4?S$xOYp7AXM&U*Ttce!Kr`|ge>V_hDc z_4K>%^4L}cXFXQ{%aea)aMsiB#LHt_6`b|-`|)!7_Pg0vYv8nuIJj|ay{39uf?IBbyyeddX}F{d!nhwXFag%S^BJx zrXHUSz^-TMvmu&#)^j7UxsBD9@ofUmdTtDsXMDZES9xdm7r+t%Q$=T>03eQ#sSk@egb?0Qn(*K-GOxt{&u{W#RK4m*Nf&+-}FPH5`!*%|D5mOi_nsmEtmu8b`-0`M4FqRB2Y}_t zKNy_#90ZofHWZxo90Hcx_b^(=lJ(pl?0Qp22kuIEVjfgI{thf!eHvwYthjiw%-F<{rT^f?GkJw9W>u4m~p4oyAlIUa0o zW3^>`2ZOVo6TtF}Zz4GBc?ekUSP!LjJQ?d@;H>8)uspUSz**13!Sduk3Y_&k5-gAH zXmHkZGFWckQ)nGa*7F#!>uK%QXl>SDU*}Y5&lAAq zdY%YBov%HO^TAoq^T6`hE(B*iF96Gv|6*{~^CGZ3wx5Huo|k~- z_I)X>W664626jEI-5RaU8tluQ#u{T@*Yk3)zFALs`n&>cj;yC#o9lTMxLnVx;a75~ zXC1BqyPoBBbuF5De69n#o~6(AXzKB~0qlB~J~yJNXFY!bHn*|bGQOL^SE;%IgK^Oysqc{V12Wm^7Q!t*c@3;dEy=f_iKKBG0#Krl?(nb+<8o{ zN8y>fN5Jy@&h4=_?)W=1`+6MSp>^!(_ypYN&g%Xi)c$@AwhwK2ru`&1&$O*apV)rW z*p?#J+V!=sli}LRePVomeJ+QuvpL)+#=o;Whr{ncoJX6zUY;U0e+%b%{xp1Xig4WW zXW+(Jhy1s2Yh0B>U+wXI7QAE;`y9NCeIA}zeYNNL`R~By*2i&Kms+lg7r<)eGq&Hu z)#LMG<5Pa_df(64|d)zq|Lm&N}ThSJt_ZE z@oyWR+Fpa3r>y^VH1+uWvGFPE{}Y;een0(ZuyN|nlXa+>V{Rt*P4JwwZ_wu4eG9A} zpLZIcw+o+l!Rq;b(tBXz)cqdO2eiKYK908cIsVS!yoi(gLvY>yU(wVv&cA`JyPS)U z;OhAA<(WhYhNWRa&$!EM=pN#Vh zu;cukHaWfotHF_0|Ba>| zpZ_#IWsN_esUOYx;(g*ruyN|fc~1Wq?AXimP-pe5&oppj)uT-dR`s)S(?^G z9A5YCrFAifHUFG8*ZrKt<+@)HeW{ACUvP75P;mF;+OXmqSG-rnH!ZmL@U0r|m=}W2 z1h>vyOACT$MpO6inQ2Rk|Ak8vS1b^jjn9&#=W zR!htx;ChUUqNyilF|e9r>_NLYhk4Wok=HuZmf-OH)jF*=eJl;G>skg)J@dFMSk0K_ zXqV?$oa;901#PX+=(&zogd3+mk~&V34fb<``Y8N8m$u-&x)P&3OH_d%yL3bqw~aJ)Z~v4>v$_kpKVy literal 0 HcmV?d00001 diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.glsl b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.glsl new file mode 100644 index 000000000..785bc0c83 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.glsl @@ -0,0 +1,3904 @@ +// Sharpening +#version 430 core +layout (local_size_x = 64) in; +layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; +layout( binding = 3 ) uniform outvResolution +{ + vec2 outvResolution_data; +}; +layout( binding = 1, set = 2) uniform sampler2D source; +layout( binding = 4 ) uniform sharpening +{ + float sharpening_data; +}; + +#define A_GPU 1 +#define A_GLSL 1 +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). +// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} + A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. + // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);} + AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);} + AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);} + AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). + AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 + #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));} + AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));} + AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));} + AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));} +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define A_INFP_H AH1_AW1((uint16_t)0x7c00u) + #define A_INFN_H AH1_AW1((uint16_t)0xfc00u) + #else + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) + #endif + +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));} + AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));} + AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));} + AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [FIS] FLOAT INTEGER SORTABLE +//------------------------------------------------------------------------------------------------------------------------------ +// Float to integer sortable. +// - If sign bit=0, flip the sign bit (positives). +// - If sign bit=1, flip all bits (negatives). +// Integer sortable to float. +// - If sign bit=1, flip the sign bit (positives). +// - If sign bit=0, flip all bits (negatives). +// Has nice side effects. +// - Larger integers are more positive values. +// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +// Burns 3 ops for conversion {shift,or,xor}. +//============================================================================================================================== + AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} + AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value). + AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} + AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));} + AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));} +//------------------------------------------------------------------------------------------------------------------------------ + AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [PERM] V_PERM_B32 +//------------------------------------------------------------------------------------------------------------------------------ +// Support for V_PERM_B32 started in the 3rd generation of GCN. +//------------------------------------------------------------------------------------------------------------------------------ +// yyyyxxxx - The 'i' input. +// 76543210 +// ======== +// HGFEDCBA - Naming on permutation. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure compiler optimizes this. +//============================================================================================================================== + #ifdef A_HALF + AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);} + AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);} + AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);} + AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);} + AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);} + AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);} + AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);} + AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);} + AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BUC] BYTE UNSIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation. +// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// OPCODE NOTES +// ============ +// GCN does not do UNORM or SNORM for bytes in opcodes. +// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float. +// - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer). +// V_PERM_B32 does byte packing with ability to zero fill bytes as well. +// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. +// ==== ===== +// 0 : 0 +// 1 : 1 +// ... +// 255 : 255 +// : 256 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : 0 +// 1 : 1/512 +// 2 : 1/256 +// ... +// 64 : 1/8 +// 128 : 1/4 +// 255 : 255/512 +// : 1/2 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES +// ============================================ +// r=ABuc0FromU1(i) +// V_CVT_F32_UBYTE0 r,i +// -------------------------------------------- +// r=ABuc0ToU1(d,i) +// V_CVT_PKACCUM_U8_F32 r,i,0,d +// -------------------------------------------- +// d=ABuc0FromU2(i) +// Where 'k0' is an SGPR with 0x0E0A +// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits +// V_PERM_B32 d,i.x,i.y,k0 +// V_PK_FMA_F16 d,d,k1.x,0 +// -------------------------------------------- +// r=ABuc0ToU2(d,i) +// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +// Where 'k1' is an SGPR with 0x???? +// Where 'k2' is an SGPR with 0x???? +// V_PK_FMA_F16 i,i,k0.x,0 +// V_PERM_B32 r.x,i,i,k1 +// V_PERM_B32 r.y,i,i,k2 +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BUC_32 (255.0) + #define A_BUC_16 (255.0/512.0) +//============================================================================================================================== + #if 1 + // Designed to be one V_CVT_PKACCUM_U8_F32. + // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32. + AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));} + AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));} + AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));} + AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed to be one V_CVT_F32_UBYTE*. + AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);} + AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);} + AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);} + AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 3 ops to do SOA to AOS and conversion. + AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Designed for 2 ops to do both AOS to SOA, and conversion. + AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);} + AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);} + AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);} + AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [BSC] BYTE SIGNED CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Similar to [BUC]. +// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// ENCODING (without zero-based encoding) +// ======== +// 0 = unused (can be used to mean something else) +// 1 = lowest value +// 128 = exact zero center (zero based encoding +// 255 = highest value +//------------------------------------------------------------------------------------------------------------------------------ +// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero). +// This is useful if there is a desire for cleared values to decode as zero. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : -127/512 (unused) +// 1 : -126/512 +// 2 : -125/512 +// ... +// 128 : 0 +// ... +// 255 : 127/512 +// : 1/4 (just outside the encoding range) +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BSC_32 (127.0) + #define A_BSC_16 (127.0/512.0) +//============================================================================================================================== + #if 1 + AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));} + AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));} + AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));} + AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;} + AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;} + AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;} + AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;} + AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;} + AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;} + AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;} + AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;} + AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;} + AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;} + #endif +//============================================================================================================================== + #ifdef A_HALF + // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}. + AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0); + return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));} + AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));} + AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));} + AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u; + return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);} +//------------------------------------------------------------------------------------------------------------------------------ + AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). +//============================================================================================================================== + #ifdef A_HALF + // Minimize squared error across full positive range, 2 ops. + // The 0x1de2 based approximation maps {0 to 1} input maps to < 1 output. + AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));} + AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));} + AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));} + AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));} +//------------------------------------------------------------------------------------------------------------------------------ + // Lower precision estimation, 1 op. + // Minimize squared error across {smallest normal to 16384.0}. + AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));} + AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));} + AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));} + AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));} +//------------------------------------------------------------------------------------------------------------------------------ + // Medium precision estimation, one Newton Raphson iteration, 3 ops. + AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));} + AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));} + AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));} + AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // Minimize squared error across {smallest normal to 16384.0}, 2 ops. + AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));} + AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));} + AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));} + AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// FLOAT APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN", +// - Idea dates back to SGI, then to Quake 3, etc. +// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +// - sqrt(x)=rsqrt(x)*x +// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x +// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +//------------------------------------------------------------------------------------------------------------------------------ +// These below are from perhaps less complete searching for optimal. +// Used FP16 normal range for testing with +4096 32-bit step size for sampling error. +// So these match up well with the half approximations. +//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));} + AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));} + AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));} + AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));} + AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));} + AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));} + AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));} + AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));} + AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));} + AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PQ APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// PQ is very close to x^(1/8). The functions below Use the fast float approximation method to do +// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%. +//============================================================================================================================== +// Helpers + AF1 Quart(AF1 a) { a = a * a; return a * a;} + AF1 Oct(AF1 a) { a = a * a; a = a * a; return a * a; } + AF2 Quart(AF2 a) { a = a * a; return a * a; } + AF2 Oct(AF2 a) { a = a * a; a = a * a; return a * a; } + AF3 Quart(AF3 a) { a = a * a; return a * a; } + AF3 Oct(AF3 a) { a = a * a; a = a * a; return a * a; } + AF4 Quart(AF4 a) { a = a * a; return a * a; } + AF4 Oct(AF4 a) { a = a * a; a = a * a; return a * a; } + //------------------------------------------------------------------------------------------------------------------------------ + AF1 APrxPQToGamma2(AF1 a) { return Quart(a); } + AF1 APrxPQToLinear(AF1 a) { return Oct(a); } + AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); } + AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); } + AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); } + AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF2 APrxPQToGamma2(AF2 a) { return Quart(a); } + AF2 APrxPQToLinear(AF2 a) { return Oct(a); } + AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); } + AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); } + AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); } + AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF3 APrxPQToGamma2(AF3 a) { return Quart(a); } + AF3 APrxPQToLinear(AF3 a) { return Oct(a); } + AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); } + AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); } + AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); } + AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); } + //------------------------------------------------------------------------------------------------------------------------------ + AF4 APrxPQToGamma2(AF4 a) { return Quart(a); } + AF4 APrxPQToLinear(AF4 a) { return Oct(a); } + AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); } + AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } + AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); } + AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); } + AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } + AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +//============================================================================================================================== + #if 1 + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to 1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. + AF2 APSinF2(AF2 x){return x*abs(x)-x;} + AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT + AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);} + AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // For a packed {sin,cos} pair, + // - Native takes 16 clocks and 4 issue slots (no packed transcendentals). + // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed). + AH1 APSinH1(AH1 x){return x*abs(x)-x;} + AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA + AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} + AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND + AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// [ZOL] ZERO ONE LOGIC +//------------------------------------------------------------------------------------------------------------------------------ +// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit. +//------------------------------------------------------------------------------------------------------------------------------ +// 0 := false +// 1 := true +//------------------------------------------------------------------------------------------------------------------------------ +// AndNot(x,y) -> !(x&y) .... One op. +// AndOr(x,y,z) -> (x&y)|z ... One op. +// GtZero(x) -> x>0.0 ..... One op. +// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss. +// Signed(x) -> x<0.0 ..... One op. +// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer. +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMIZATION NOTES +// ================== +// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'. +// For example 'a.xy*k.xx+k.yy'. +//============================================================================================================================== + #if 1 + AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);} + AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);} + AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);} + AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolNotU1(AU1 x){return x^AU1_(1);} + AU2 AZolNotU2(AU2 x){return x^AU2_(1);} + AU3 AZolNotU3(AU3 x){return x^AU3_(1);} + AU4 AZolNotU4(AU4 x){return x^AU4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);} + AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);} + AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);} + AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);} +//============================================================================================================================== + AU1 AZolF1ToU1(AF1 x){return AU1(x);} + AU2 AZolF2ToU2(AF2 x){return AU2(x);} + AU3 AZolF3ToU3(AF3 x){return AU3(x);} + AU4 AZolF4ToU4(AF4 x){return AU4(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled). + AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} + AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);} + AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);} + AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolU1ToF1(AU1 x){return AF1(x);} + AF2 AZolU2ToF2(AU2 x){return AF2(x);} + AF3 AZolU3ToF3(AU3 x){return AF3(x);} + AF4 AZolU4ToF4(AU4 x){return AF4(x);} +//============================================================================================================================== + AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} + AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);} + AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);} + AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} + AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} + AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} + AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} + AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);} + AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);} + AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} + AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));} + AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));} + AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} + AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;} + AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;} + AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} + AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);} + AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);} + AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} + AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;} + AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;} + AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} + AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));} + AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));} + AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} + AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} + AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} + AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);} + AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);} + AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);} + AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolNotW1(AW1 x){return x^AW1_(1);} + AW2 AZolNotW2(AW2 x){return x^AW2_(1);} + AW3 AZolNotW3(AW3 x){return x^AW3_(1);} + AW4 AZolNotW4(AW4 x){return x^AW4_(1);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);} + AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);} + AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);} + AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);} +//============================================================================================================================== + // Uses denormal trick. + AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} + AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));} + AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));} + AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));} +//------------------------------------------------------------------------------------------------------------------------------ + // AMD arch lacks a packed conversion opcode. + AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} + AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));} + AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} + AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} +//============================================================================================================================== + AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} + AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} + AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} + AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} + AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} + AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} + AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} + AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} + AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} + AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} + AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} + AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} + AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} + AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} + AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;} + AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} + AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} + AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} + AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} + AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;} + AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;} + AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} + AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} + AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} + AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COLOR CONVERSIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These are all linear to/from some other space (where 'linear' has been shortened out of the function name). +// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'. +// These are branch free implementations. +// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. +//------------------------------------------------------------------------------------------------------------------------------ +// TRANSFER FUNCTIONS +// ================== +// 709 ..... Rec709 used for some HDTVs +// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native +// Pq ...... PQ native for HDR10 +// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type +// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) +// Three ... Gamma 3.0, less fast, but good for HDR. +//------------------------------------------------------------------------------------------------------------------------------ +// KEEPING TO SPEC +// =============== +// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +// Also there is a slight step in the transition regions. +// Precision of the coefficients in the spec being the likely cause. +// Main usage case of the sRGB code is to do the linear->sRGB converstion in a compute shader before store. +// This is to work around lack of hardware (typically only ROP does the conversion for free). +// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free). +// So this header keeps with the spec. +// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear. +// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear. +//------------------------------------------------------------------------------------------------------------------------------ +// FOR PQ +// ====== +// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2. +// All constants are only specified to FP32 precision. +// External PQ source reference, +// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl +//------------------------------------------------------------------------------------------------------------------------------ +// PACKED VERSIONS +// =============== +// These are the A*H2() functions. +// There is no PQ functions as FP16 seemed to not have enough precision for the conversion. +// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors. +// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least). +//------------------------------------------------------------------------------------------------------------------------------ +// NOTES +// ===== +// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case. +//============================================================================================================================== + #if 1 + AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma(). + AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} + AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} + AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); + return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));} + AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302)); + return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} + AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302)); + return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToTwoF1(AF1 c){return sqrt(c);} + AF2 AToTwoF2(AF2 c){return sqrt(c);} + AF3 AToTwoF3(AF3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} + AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));} + AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));} + #endif +//============================================================================================================================== + #if 1 + // Unfortunately median won't work here. + AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} + AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} + AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); + return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));} + AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); + return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} + AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); + return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} +//------------------------------------------------------------------------------------------------------------------------------ + // Unfortunately median won't work here. + AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); + return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromTwoF1(AF1 c){return c*c;} + AF2 AFromTwoF2(AF2 c){return c*c;} + AF3 AFromTwoF3(AF3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFromThreeF1(AF1 c){return c*c*c;} + AF2 AFromThreeF2(AF2 c){return c*c*c;} + AF3 AFromThreeF3(AF3 c){return c*c*c;} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} + AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} + AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );} + AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );} + AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055); + return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToTwoH1(AH1 c){return sqrt(c);} + AH2 AToTwoH2(AH2 c){return sqrt(c);} + AH3 AToTwoH3(AH3 c){return sqrt(c);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));} + AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));} + AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));} + #endif +//============================================================================================================================== + #ifdef A_HALF + AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} + AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} + AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));} + AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} + AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); + return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromTwoH1(AH1 c){return c*c;} + AH2 AFromTwoH2(AH2 c){return c*c;} + AH3 AFromTwoH3(AH3 c){return c*c;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFromThreeH1(AH1 c){return c*c*c;} + AH2 AFromThreeH2(AH2 c){return c*c*c;} + AH3 AFromThreeH3(AH3 c){return c*c*c;} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +//============================================================================================================================== + #ifdef A_HALF + AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} + AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} + #endif +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. +//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + #define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//============================================================================================================================== + #define AAbsD1(a) abs(AD1(a)) + #define AAbsF1(a) abs(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ACosD1(a) cos(AD1(a)) + #define ACosF1(a) cos(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ADotD2(a,b) dot(AD2(a),AD2(b)) + #define ADotD3(a,b) dot(AD3(a),AD3(b)) + #define ADotD4(a,b) dot(AD4(a),AD4(b)) + #define ADotF2(a,b) dot(AF2(a),AF2(b)) + #define ADotF3(a,b) dot(AF3(a),AF3(b)) + #define ADotF4(a,b) dot(AF4(a),AF4(b)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AExp2D1(a) exp2(AD1(a)) + #define AExp2F1(a) exp2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AFloorD1(a) floor(AD1(a)) + #define AFloorF1(a) floor(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ALog2D1(a) log2(AD1(a)) + #define ALog2F1(a) log2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMaxD1(a,b) max(a,b) + #define AMaxF1(a,b) max(a,b) + #define AMaxL1(a,b) max(a,b) + #define AMaxU1(a,b) max(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMinD1(a,b) min(a,b) + #define AMinF1(a,b) min(a,b) + #define AMinL1(a,b) min(a,b) + #define AMinU1(a,b) min(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASinD1(a) sin(AD1(a)) + #define ASinF1(a) sin(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASqrtD1(a) sqrt(AD1(a)) + #define ASqrtF1(a) sqrt(AF1(a)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + #define APowD1(a,b) pow(AD1(a),AF1(b)) + #define APowF1(a,b) pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif + + +#define FSR_RCAS_F 1 +AU4 con0; + +AF4 FsrRcasLoadF(ASU2 p) { return AF4(texelFetch(source, p, 0)); } +void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {} + +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropiate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scalar uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanzcos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channel[red, green, blue](e.g.XYZ not supported) +// Each channel needs to be in the range[0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar(e.g.Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI. +//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. +// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. + con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +//If the an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. + v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lancos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compiler time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. + AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). + AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation)in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon need to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. +// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. + sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix.x = left 8x8 tile + // pix.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by RcasSetup(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. + AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). +//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). + void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. +//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this compddation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif + + +void CurrFilter(AU2 pos) +{ + AF3 c; + FsrRcasF(c.r, c.g, c.b, pos, con0); + imageStore(imgOutput, ASU2(pos), AF4(c, 1)); +} + +void main() { + FsrRcasCon(con0, sharpening_data); + + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/FsrSharpening.spv new file mode 100644 index 0000000000000000000000000000000000000000..b2e30e1fe82d1a17c9b837a184d99a88f1c112bb GIT binary patch literal 20472 zcmZvj37nNx`Nvj$;`DZGxsIRT++%V%U%E9@4fFcoH?KWjpOG>a3|Fm$l6sIeO3Cbm&!79rf9?s4wd1YiUaw zo0{p0PrAZ!=}o-$&1mCj`rB&wA1=jBkUBil*g7md(8f<1HGAdo1*7%_*WterePadJ zkr(|a_2nJqObad1@o1w|Ef9qU|-fcw~}uQaLa-PmGQR(&uMB~MhEX!Y)emk z)3hXkHEfMN;`@P{nwM4Lw*ha6uP+*mpVqi?k4dAPLG>Rta}1zgS>l28t4chG{>v3y zUksxkKdq_x|BD}j7V+DIBYr5j8edoJX#C8%ORK|I#~t1gS67TspViow>N|Jn^~J90 zGg?( zH+6+iuX3jXuPJ8GH!(r+5#Z|h-HM~Z6PLHPPHI}*HouiRSrL7X!oIYndOe*aZftE_ zLhsmTWc;j7yrwu8UR{UU;`~lpZE;z~ugdt<8NVju*Jk{>j9;Je8!~=l#&6E}ZSZ9+ z%UkEpug>)W_@Y^EP<8E|P%rVb;F#xn@P;+2DPDxnqq$zMWZG+R(pR+CGwludf>L`Y z)82(IDzy(Y?GyMMt_Jt6OBXO#g1A=9RUobh&nta;h)bVM#HCMf@ch!JkGS;dD=vNd zfftlM{l%rvKym3a7`&+T87eM)hKnmcwZ+ch4Qp0g?2_?aGrniW_s;m(j31cs@fn|( z@hKTUEaTHMJ}cu#W&D^0;!av^v9uGfDOSMe zm%b}IX|=`bPQ0c#4ZfiCUE4{kEza)5Yl`#Xi%Q=MI%&1V#hrLfaT&asH+--6D>`Yl zMSI4t$@ujd|5?Uw&G>B@zdhr3Wc<#Iugmy78NWB<_h76s#{Zo0r!xL5 zd`a_Y?w;y;K9}h)WPE+bU#iA)$5rEB&h%F^{#wT0sK#@bR^#8y^tUtqPR8Fie&vSx zKgjftGX6=%Yr0nAJL<26r|%QpGTuGon^fZ`ME$)oy?4ep&-hmI`Hd$|Ea$%sd{I*y zPe3y(*Z088e@MoMb@)%N`0teIBQrj#!+%1>f6q)GoALch|3*Ikz27Eg`k@)0*5O~R z=g3SyHsi;2_*d(hpXrM;-U46TGRKdNN9*55@W1keaH18Ixx(5O-mL|U*5)pW@XH};w@~IXTtf-P0b4nz5&!%Y}T;BK6_qcTO;dNQ~U>iU#|5# zo$}Wf|Lw$UiVxunSFXahd*yr4!o{Dzui8-|U>a1;8yOE_nnR)o^H<-q>i{K4l-kvJly;3(sU9Rry5Gr!t zd=05~_|#Nvn)P>|26t#xZhUR2`#p7I@U|s)z2)1Ke0a(IuIG5MKIwScVm#L;9Z&9f z>c+>~r19D!-nB{N<;K$$eelw+0VWss1$CnzRB5$bwMC_AYhae$>BZW&g!iN$MDsNW zz6rgWcKv&mSbGDJnf(gXz_@H?R+r=#2}zK8L~evEBw& z&Ns6>w?4*L%aNt7ZD+Xi@VkO9xpl9uur&t;Ut8f--TKa>efqt(P7eLjsy~=jf<17LU!b{W+Qarz$+bt^TVUrEK5v7y=@Yj1 zORn8GYnfeLjE?ocyu#NU5c*?P{qEpfJDFn5I&(VjesK3td-x2f`iu*HN`;TR%Q-&* zc3iJB*I4f7nKt+A>uHK8JDfe?Y<$eyQ+|Ob8c&c!3-le%;UdQ`W zgZuPeX+MC9tw%l?o+9O9DHupWv(G9NsUgFHz9jwjkA>uXy8`q8=WA}h-kGPG&+Kih~*3c7d zKD9NaZ4lbxwRqAS95AvvEYp``_k9_^W=4;0uYqSkE?NP_JU~Op~{oxVkSaSWM zjsc~v=5;8KItGHRLwnRQ2psc@IvTKPKdoGk!C-A^9Yf#|=U8(6qK=`ZuIBYGk2;2d ztwa0AysM0%dEa^;cn09h_maJd^}gAN<~_VIz4`U~OlkiFyEt;Vf4ynGdeO|`zHdr1 z$7b{~&tprwImBs>zHnbY3q_7T3Em>Xo+FrhEBeTN99aM0v%$u!rvAO?kEdx1y%B7l z(C2`)J!z@eE}Uu)K$*1RL!S9&kG2G+I{eSH3%1ol4jUKRW7 z2-0yfXtrOz6$e%h@=p4Kq}?klZhXSj9jLLYUU0k)3dXO%q0JsobG zb?Up87ImHpwodKp#-{T)yVSM!q!QS2~B0aOdEU7E$xL z;FyE&JLe}{`(-7!Mz!;3&cXZV>oniHw8xw;06UKNoIV%Qw8eY#MPTE!>*Kt>LDTLS z)+3L67lXA0|7OWgDPu2zYYY8bV8=hB-0N?H_0evA=W{77_Th4{`=I^QGUf`facYt0 zJ78;3bG_xU-lM>yX})%+#rxMDVAp$3de__ay^5Bu_thD{Cgay;{JM-^pYas?)bBL`u&G#d? z`SjOMT|eid7P)T%M{egO*Uve%)1tl~gZ*qVUO)ACX8Q@)`t@-i<*|=r!TZsC?L%`P z$I$Ofb07Dox0YM!e?hygk*Cd*G4yK=6;(Gc+XvQ8&?|pDT{R1>L*Rvb_Z)p0(eSL3f>xruVTbgUJ z9Zk)B*I#?&co3{D_(LUk@9zUYOw&iZ@z$U%{C@{FKHj^257rjXo__#4uSaOc%k?qd z{C}jG)0%tIKT5M^uO+p}^Cz%*+G*yI>l^x?!Rn9E)a8yD`r}~rD{1O-^?YAuV)HeX z7W+Fb!G|Y!dWjFCpHX7>@d%oGH;dlA^L}`OR$uaG=$}ov`uc>cznF0KR}!xNTEf*| zPq_LU30Hq7;p*=uT>Zm@TjwVk@4}`;{r0sP_dGn+dt}^m@l^M0JmsE`r`$90lzUE| z^8Ok3ygbzhXMAYJJv&eRJwH#m=jSQ+{5<8JpQqgO^OSpjo^sF6Q$9Z9o}Z_>=jSQ+ z{5<8JpQqgO^OSpjo^sF6Q||eB%8$?Zyo{faanI4yc+b&O?m2qOJxfpd>WrV3@wFNE zJUxx~JU!)}si)jC^^|+2p7JX)-k$MmGJbu=Jy%cjd9I#v&(%}zxq8YyS5LX;>M38B z@q03UZ^k`WPvbpTPx(U`_gp>IJy%b;=jtigzN9QpP=3PyIbt zPx)&Z_gp>IJy%b;=jtiY>{Y%(&<3sXi>@o~x(2=jtgRmGM0@J~rc?tEcgvtEb#^ z^^|+Ap7J9zer(2%%ed$2X}ss^Dfe7GXmGP4^z9!?ItEc%qS5Ntw8TVX0)i2Dr z=jy3`NyaZNxzA0b*xx5pEczAsy#>3=5LPjzm&EY==EP;+Wda! zbN63qKGUzKP3PP2)AWC%X}9^}cHjx88t=1=dW`pXa5~=0aBVNq9Pbr+Z86@fV8_!Q ze=yxA{uP^EpjD#(NK(j`smv+xs-f^EpjhjQ0`P@wCTyAA_~q ze5K>P3s?8KxSsw~n)~Oour+j}H$c1N`5Y(@|E^&D+iBrngH5~sJ}1hG|0{ywM5 z!+$fd*F;*+=Ge6B?{lm?{I>w>-%gA9Zi!91{yyi*!+$HV{%Jj1W7Dp`&%yHW-v;b8 zlh(5>HtqWRoGcIj?ZEoC)1saM*tF~4pI#pRgTVTy^)z79uD{RO^6(!5_L@rT*&dsA z{Rh*_!+#i9|8`o`vjaBm`VXa-hyRXX{nL7O!lqsS;q>yzH3IB4tL8JkJhYKuuTM3f z@8zNG2KHK1^I2c+ZzI0PjRH@gH=l9p5w|;7yK#;y4{Z;y{%VdZ4{bEqJZg?B4{a~7 z*SXrB^zzWgfW4;G_NJGIwh!3rSIzGq@|eTEVCP^y^zzWAg1vXt4x^Wcb~xDkL~R>SKz zoO;B~0BbkSapj>M0oGs5apj@S1e-_Aapj>M2_9T(v*_ia9S!z=GwvvQd1#*jdvB>7 zLobgxd=~5+%x9c>#C;B|-8jdUhjuJje>KOIhxU1}dDI+N9@=r>xHpdn&!+kL=Vw(T zxCg!V_}zI9*ci3QF&7-~3-iEfj`?tNXpbBVz{aRWj)mYZv{TEm7lHND9=0a11FEmVv#W zqwY4ae%iyf9PIdNj&%aPw#e}%aNMga!D$_<;MSo%>i9C)7`4dp6>x8w`B&4YIZlF` zLwn>n8ElMN+F>EMB-ww6Av;|#cUXpcJ11RJ9k zInDxm-72JTOby3Ymcr#)=vfgNAXu}-7c7CF8S9$sn} z(5H1=2)7RHQO8AKW7HzYH^AP%=D(Oe&GAjRIkZQPOTfmcMUHQQhti_%Z-e#I9=1!t zj<4oe=hJHwUqOF4E$;R2fE`zRjPYHt_gK{NJ+OY-&CyPu=C~4W4(*ZSDzMjf%it#Gw=2EY2F{f&8t1~-T-!; zBJYi0{j^8ko51E)i@2M?UiYV$?_WO#>!;nZFQeBM{yzb0i}Qq^g89d{grZOjzgxiW zfjNFgFAweKVAoddR(g48zW`g0+HLf5KVy7f|0TGA-h9TX>vuc7TIBc@*fmzWgI*rm zU0~Nq?M`}m5Tjk)>qMLQ&-9Pc!uN5o*Rb}O%M)N@V!S88`e`@EQ}nBe zi5ySE&7nQ6@n^ussYMOiVvOg(@tOY|SRUG6!1XlaUZ9tU_E&JfQd>_ik9GbV*mX9a zaq1EGB6u(@{)Xf4VE$G9zQZ_m$9tAuE#h7QcPX`(>E)sQ1Kgw3UZt0Z_ByzKsl7%o zk2?PewodaIrygel%Zy;{V*1+Fc%H|gb}{Rg~Bsl81v5ADC; zfu;5iy*%ptAJ{t0XPkP(y$c>ri#p!}^RH5;aq8Cj2EAIueF%(-#-D%qfWmFHZZpNj8l)eE@X`@*A6{vO_s7CE*Bd!GgG z5AO|+YiT>UIfv5pRX2~nn-8GH`_3S+?>pKfR|D9+jQ6g=aQ(EK!{6GqMUL&kJCZ|t z#0&*ngPJk^ey=UY*a7VQ7koIpPg4Jm@Tgy3b@TW;ep>&|aO>9|xkiB9X^-{# z9N1Xxj(se>wpg#vgIzD}G1eEr)~sgCG~HDivW*A}%c1#d%(zjgW|SX=0=V6Vw|e_sZUwb57Iye;%<5w{#X zj23xUfVGAGC9r!Jc~^oXufFQ$ZKGFvgsX{5-IKM-}hCJ;CQ=(;j~V`*pCk zxP~qOn^U{>olLJyd=Y(m{eJ^)t=eM^E(RMD`MwF(PrErTp-;b)d<$+4?Q!jY8*H4K zF_+S7i#56o>~}5g>KD>$iyAHmTSM?Gz&^`HP2Yj*qdk1S3r=Ic2lp8_V%p*QXpcIt z1e;4O@>~T@^L!sZILUK0Tp#U`=NhoN)FRIhz-gXq;XYSKJwJr&qdoFm2R4^lN~Av`5TOz-i1+;Xa#3 z%+KKZXpdTN0h>!L^4tne^ZXp{^LXUB4X%&&$ny)Zxzr-hFTrV^+u?p6h&*?|_0b-A zeg!s{TI9JC?B{Rz+y&Q9d-&W9PSH0nf*H63aJDv5_mj9jV(b#<*N%QYi z{TtMyX#U;lG4%f3sekABII%v@%I%-Z_|qAGCgJ9MHsJ@upD(%f`*$eM!5!CH^_8m~ zgzW|JlQcEs= edgeVert; + FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; +/*--------------------------------------------------------------------------*/ + if(!horzSpan) lumaN = lumaW; + if(!horzSpan) lumaS = lumaE; + if(horzSpan) lengthSign = fxaaQualityRcpFrame.y; + FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM; +/*--------------------------------------------------------------------------*/ + FxaaFloat gradientN = lumaN - lumaM; + FxaaFloat gradientS = lumaS - lumaM; + FxaaFloat lumaNN = lumaN + lumaM; + FxaaFloat lumaSS = lumaS + lumaM; + FxaaBool pairN = abs(gradientN) >= abs(gradientS); + FxaaFloat gradient = max(abs(gradientN), abs(gradientS)); + if(pairN) lengthSign = -lengthSign; + FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posB; + posB.x = posM.x; + posB.y = posM.y; + FxaaFloat2 offNP; + offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x; + offNP.y = ( horzSpan) ? 0.0 : fxaaQualityRcpFrame.y; + if(!horzSpan) posB.x += lengthSign * 0.5; + if( horzSpan) posB.y += lengthSign * 0.5; +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posN; + posN.x = posB.x - offNP.x * FXAA_QUALITY_P0; + posN.y = posB.y - offNP.y * FXAA_QUALITY_P0; + FxaaFloat2 posP; + posP.x = posB.x + offNP.x * FXAA_QUALITY_P0; + posP.y = posB.y + offNP.y * FXAA_QUALITY_P0; + FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0; + FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN)); + FxaaFloat subpixE = subpixC * subpixC; + FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP)); +/*--------------------------------------------------------------------------*/ + if(!pairN) lumaNN = lumaSS; + FxaaFloat gradientScaled = gradient * 1.0/4.0; + FxaaFloat lumaMM = lumaM - lumaNN * 0.5; + FxaaFloat subpixF = subpixD * subpixE; + FxaaBool lumaMLTZero = lumaMM < 0.0; +/*--------------------------------------------------------------------------*/ + lumaEndN -= lumaNN * 0.5; + lumaEndP -= lumaNN * 0.5; + FxaaBool doneN = abs(lumaEndN) >= gradientScaled; + FxaaBool doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P1; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P1; + FxaaBool doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P1; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P1; +/*--------------------------------------------------------------------------*/ + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P2; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P2; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P2; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P2; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 3) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P3; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P3; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P3; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P3; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 4) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P4; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P4; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P4; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P4; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 5) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P5; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P5; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P5; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P5; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 6) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P6; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P6; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P6; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P6; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 7) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P7; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P7; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P7; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P7; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 8) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P8; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P8; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P8; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P8; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 9) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P9; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P9; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P9; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P9; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 10) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P10; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P10; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P10; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P10; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 11) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P11; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P11; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P11; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P11; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 12) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P12; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P12; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P12; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P12; +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } +/*--------------------------------------------------------------------------*/ + FxaaFloat dstN = posM.x - posN.x; + FxaaFloat dstP = posP.x - posM.x; + if(!horzSpan) dstN = posM.y - posN.y; + if(!horzSpan) dstP = posP.y - posM.y; +/*--------------------------------------------------------------------------*/ + FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; + FxaaFloat spanLength = (dstP + dstN); + FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; + FxaaFloat spanLengthRcp = 1.0/spanLength; +/*--------------------------------------------------------------------------*/ + FxaaBool directionN = dstN < dstP; + FxaaFloat dst = min(dstN, dstP); + FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP; + FxaaFloat subpixG = subpixF * subpixF; + FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5; + FxaaFloat subpixH = subpixG * fxaaQualitySubpix; +/*--------------------------------------------------------------------------*/ + FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0; + FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH); + if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; + if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; + #if (FXAA_DISCARD == 1) + return FxaaTexTop(tex, posM); + #else + return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM); + #endif +} +/*==========================================================================*/ +#endif + +vec4 mainImage(vec2 fragCoord) +{ + vec2 rcpFrame = 1./invResolution_data.xy; + vec2 uv2 = fragCoord.xy / invResolution_data.xy; + + float fxaaQualitySubpix = 0.75; // [0..1], default 0.75 + float fxaaQualityEdgeThreshold = 0.166; // [0.125..0.33], default 0.166 + float fxaaQualityEdgeThresholdMin = 0.02;//0.0625; // ? + vec4 dummy4 = vec4(0.0,0.0,0.0,0.0); + float dummy1 = 0.0; + + vec4 col = FxaaPixelShader(uv2, dummy4, + inputImage, inputImage, inputImage, + rcpFrame, dummy4, dummy4, dummy4, + fxaaQualitySubpix, fxaaQualityEdgeThreshold, + fxaaQualityEdgeThresholdMin, + dummy1, dummy1, dummy1, dummy4); + + vec4 fragColor = vec4( col.xyz, 1. ); + + return fragColor; +} + +void main() +{ + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec4 outColor = mainImage(texelCoord + vec2(0.5)); + imageStore(imgOutput, texelCoord, outColor); + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/Fxaa.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/Fxaa.spv new file mode 100644 index 0000000000000000000000000000000000000000..b466bcb659d56d910d30aac5d781d27b8ff1313f GIT binary patch literal 25012 zcmaK!34m5r{r+Es0mKdWeL-&>gWNJTx6DjaE6d6@Tilo5=kvbz84tJrzuTP7^E~H!zRS7iz31gJIB=XhDXx9M-u)NRwZssFTsw2{{;_?+F{i~k`+ zXf&qde@yqR-tMlBiQT>YG-D5JtVTblV;*+H8mlj9x806gPwtpEcHZp0I@^0Ebj&z# zR)_fxYpn6V?Is`Jt=*8unw8x_bKAQ*=gdEN>g;`b+Gln!Hng$Ul0Gta;Osfh9oks? z|2w`d<4ZTzS#q3Q3m4nA;;*}8{7G|1i_jL3g-Q6{fp{C9H z|HI}0wugQ;sBEgW)em>DXZEa)UYEFOGvdE({wvmo|HGygYiMJmC2gqLsB`x2>@m*Z z#%7GWZ^zASpV86WLF#RdO`X@%K4VOGcaO!n2i^Saj|1qMd}n(0F=RR(_gg5(VG6ISY^xZsY;tP<^34XqQ&3#mEDV#roU(}7k1lKcCQq6 z(OxU;fX)2gF6>$w&+-9T!wFL(z_A{U*(BZ$8uA zAItX7AC1MJ#>jrYS98u5@EJYr(>gn5&1vn?w#GJv#;tA6+X=0?P8jKD_li%d?)@0} z?Do!{_!w9CXw@>PF&VA3AI3C&9Oj(vUOs6ppY8DO>C-3p88gONt#Ms8`2{WRaFbsK zZ~5CB-qyGZZjNJTP3xa?t#gV!eSQC&vCTO*zCWkm@2yMB8QfS5&l;7Vz5A-U2UMT^ zhv2Pz_G$WlG~?qZAJ);+jW($94BE8rSskr)8q|29;;H%Vj3a2jAqQ`n{}wgApvKRt z@#kv%r5fLkbAMWI^VT=x?b71aTC_C|g3svgp5~s)y*&i27e2nZ|ID+kaadv3s%3EF zh$ZZJ1~l(mTjMD7X`MYCQ|EMc>(q06x?1m?rmtfPU%A)a`f5H)_HEkq!a3zSx(=K^ zy|=@8JPZ46_Ji{*T+(J=%0M5|=vz{s^!|C-*0_wZo*dxKdYvQg10rEg}1&NbanGJqPYeyp>=gn1rKe0 z|DDlw#J*kKN40m2n{{0G)OPRmxVdzdsNABm^dpSOmaqaznyE0z&I|6I%#jAk^({Dgq9s42l za;BjTVpSVWO??D>X?n-iwO_8{`qF-RdTrI$#CipK?FZ1xD|~I@4`)C-=dV~9smWKO zPfjDKVC9M%#~idzi>7XK?*FSfW-Xi0>)Uy&!aqR20nKZRif;in*Kj_1%}Z`w{yE9Z z`J?DJpv7K3y5g?YK43N1%K74bY434u$=-2%ze-b2Ouy7PXBGaxkKT1Th2GrdnR^b{ zc+QP>e#O<3^JU;hW5J%`*ED%!QQ|FzyFW7id~1AE_$C7yjVl)}+&%mRI9I>Fg-t%H z-~OH^U(nBAZt_L__CpNT7&Y28x(MvLDy_%=``K|XS@RWO*FkO9iu-v|GnY~HSHiut zzp5H@esaC3IXK3&xG)r$28u=d_V z>-Zzw{iKPx{0U&$>gF!@v#jR$%N2Hxx!C^=EuX_z!TKLnjoZIljeD=HaaD}H7dAh8 zYr2cYBoO~I}$%Worg)|b(A)GeY!S2 zr_En(Ek27&ZbISO`%DgRewGXF=hbJi<9;J{?fyb@J^g&{jx*Qqua$lpHuAs2)m+yT zu^EN-53qXtyaLvbb;Zv=;p*}8FR)ta=ihMkiNtab{0FRW^*vY->sJ$7yZiJF*64M5 z*UJ7?a(a{gEt;3(Z&%~?<}(WY9k4#GBUT^$J+PYl;Z1DhZ-{C778{qL{^LoPhk(@*SH2WYU-}liuHNsZ!LC8>&nR3CLsO6cWx&?8fH{sY z3)eeU7i>Ja^Vb7AXDVD<$JM+y z-Y3`BUi(qluMY+`Qu792wb97A_iE11z4t!J-EaD{kG~O&mG@#pxLW){w}uS_;r0o(!BgWpkJQ} zt5nz+zDHE|yMgZwgI6!`8Wnb~-;a`?@B4z+F0kLZ)XmrTb-})$3-&!+@CF4QQQ!?L zY@Qp@T#JqAeKwlQ)--FBYvcQ_lKZ}^3B#S>wmn zxbL}2|GwWUx$n2aPXV7=aP#qrOW+=4rPUcnvromT7}_nlVBFRt-RYkXmiUtQzA z!z%sVR^y9m{Jt9ZomDyi@f!DCRXP4_jlWpqFW2}hHST+<(%;)PK48F--l^oia|-wKaafJF7u@;NYJ6tF zo$ouT(x307!kzCssgnCnD%}0%`>1g3eIHeF-$#XOe`~?DUtHt9gNnW5zJm&PeS8O1 za^F9d{Dm6#{Zqz`@B62c`~Ioq3u@f=PvyAppGxlgr*QM#j^}>KcPaQOj2~2R?GGuq zh2~i%_dCdkY1$u3 zw&uGVO&j&4X#SZ+Kb{|s#m^q#($Ai7{TxYU`q_)7jr!;&9?0C59v^$dOCP?pZ}}M7 z^f3;ujk-SEhx^gYXAHgLsrw^f{g>-I9`m#jyo*YPH>o>o5Ip1;P zc^*0zUk8Ef@eW2)kIf`-J>FzA^~C!q*m$1Z&Uf5+6X_j`uS3Dco6_uC`{!femfqrr z!Ob(iKMqbU+B&Z9L+Blg?@xg1zCVf9(p$cdfSXf%9|=z0+B&Z9!|9Kt#rIKQ?^T)i zRJeLX{TE@>DaPraCas3=i?^xy? z2iAA^@!<7nr&r&l=Yh3Rcm7;@wb-8kF6Vz5Zj4i_`6q(4QFs1)dbP~|40!#5pA0re z>R15IdT8soeomraK#QN#!1@k99jwjG+B^PPn%ZlOokj2EpKI06 zq^bFxS?ru4^q-^cO!LnQV#l>}d^r6VXscCxIr=Zswx%7*lgasCqG_W(jJ7N79Gdx^ zOYgXUwsYTnnYIi~-SM+4EwR1=FJpZbUdH+wTpRVoIv=dB3+NqBtgnNOrSAB7^lJG$ zd;?tn%zP6~JvJACcc*RB>|1;PTVVCRs-|y)ou_V`3+dJ3<2zvEls>)-SC7plVB<__ z_N_SI1FI*__rcCnH_pZMYObAYdpXU$b7qy-72wM%UHePHYO%i(Y_8!~fiI>d-Veaq zs5^fly;^d<7VN&+f)*dwfz|W5yB>TEO+7Y01RFPTegxJ|-8fg%t0m5j;Ch^!(9{#> zX0UP86XzE2#k9n^6|9}Qac-bji~a3j_haJT(HvW{zB|#>5HMa~(?#e+E0>adUW)UM*|+7qIth8(Q-ED_A}8{|0tnsK@5-VDD4n`~$3= zx^Z5nS4*6Kg5Bq3oPWX96X)MxtJJNUETm^T^x(u zn_%xp{JsU&R$afZ(W}M&9k8+D_g!%OIu^V4z^-xpYNV~Ye&427i~Uky*D-t`{5E*z z4}xo>?tI@_sEy`aSP8qq;5F&Xbsd7Hu6^^J1~~qg23uSBFt|D0iN8MjW#HPVJKuL8 zYO!Aqyb!zGv*qDx<=$KYZl0-OI9NM%efl0nO`m=ES`loV%PYPTSk3P}tI&J-y+?gz znwsBh#94<`!S#K-8k%}+R{5v4>m9J7(wr49_ky=)XYPi7#o7!CtJ{x z$3|%C#&n;^jhXrOvE3N#ean5{6s)!hEipC&?_7(qIhuODdw&4zTy^7)q*sfNEy3Q$ za=ku?rXHKE!1XvELQ_wit-;PyH;(V-)Z$}XaQ%6-9h!P|K!QNokGS9)WV1C$ryypPKZXd8~5xafC{50(x%kySGus$3&-WYnd z_}L#^#`_4IpQazjVmBUK#yg;}b1Z*$n^5VS;X_~A8Fw7LWBTi3>_D)&_!&Eh-pkLJ z`b3(VpD}TAI2hbUI~hCg*(9)<{?+AvzB1Q7b3Y0mNy~a20_LY#r(?030=^VIpTk4p z+Nv+hpW%%AF)%;P&${Egf-j?)$7HzU**6~tZ_QYFejJXb9-B{q52NMzB-bY6p9Jfp ze6}2crXHIk!TQj~JmlJBy^jJ9rR5Bm3RZLdr_+16{^}jH88p{ltiNgW*6I3c@0d1; ze>Av$PIjWHXTH3gub-^@F<@iF->1Osw5*9-o7B?Z)M&|XzG2{v+pxtHRosE$>7Yhez|tmpIq!y&jN7n z$0^|Cuii&fGkQ(_d6awq47hso|14O|`N{uGaORo6Ts!kmF7~ni zTxDN=PCt*P9-Fhl*^{Z^3ux-O2VVrM`8>!Ob`D%$j>XrPz-Q62zvbFx{9JJIPpmJa zsmJC#aQ0YizJjKne7*`+EAu%Yt}n;p>uX^1$$ph&vnD`Zo9iTGmIdUB)j4*VpGeXzH=~F1Wrvm!PR9pYMUy%6z^L z*Oz1Qbt%|xFU~SZ${}8=e>>mT` zGw;eD2diaIJqlKf{S)9a=byvX%sDnsg57JW;TK>vZOy?kwb(xeE`9zIu2%Yd8t%S` z&tHMnwAH6$YVr9j*f{x~_iHde>=NEb0AlwX*n5}n$O;uSlQ!$0IRvj|48rU9#?;frsf_OXV3i!+)gW> z&3}fgXFvP}toAZ3^Zp7xi&oD28(cl>{dcgM^E2-s;LNioxpsc8lAnEQd<9(A_)oZ+ zHU67EHLCxMre=-e#QYD~8h!p|tzU(!r^eU7YGsYD!_Bpv_Xb=&HNFW}bAIBz1qDaR?GaQz~%gba5dMk zjo!(R%D z`nohVeTenpcZ>DG<(a(!-2I(1djwn?b$PSfL){TZQ06R-5*Uo&!dlk)qKaF zdE?7k=6)1SJvANzRx4|q0?!$n8V^NNPmLb~t2saM4g+VNHOjTK#^h(88b1y$YdjpT zmhn%3%g@gz;c7XXj{rMI-FQB~)iVD`a5=vnuI3s}rOz6wA4OAh4aKQx8o0hsI?&X! zhSR}na_=Zwu79*w5nSDnqBU^V9_-Z9|JvnIKAu3_@CPmP}fmo;|5)vR$AeQH#n zNmH{%abk9Z>-%IjntEzH7OYm**aOcQn;Lu3)KlXeu$uD|Z!S3VtWmC=H6}m%)OZ{? zXY)L8YEnO*re;mz%$W}^_tpt;HP_`t`uI@)G)+w(Vtsfvp9C(?x6i=c-#On-hHInF zPyhM00ODBAw?44vo4WP-tW~q##I;ZEr+~|QdMaGa+)t<}bm^Gw)ouT6yMu8SXsCvc_uWnHcuT^E`05#$SP}ndkZRUgoL(RhpW4 zij(Kpz~$L|0lduf>u|L)&kNzsb1dtrW}b;*pBUc&muK@g;oJf;cM*6o138<& z1=m*HINlSr`29AxJex0u$FF0t`wqDD`(3!U>iTsJ)wJ&;r%S-){r?_Z&2_z$-ph4W z|2|F4brt9Na2dG%%)J~nz-sppKl2vCbH*nBE78<*Ze0adbAIOi0GxT&B-hS+ zlKkvb@8Pa_a{mKd8+CsAbAJiqSaSa(*xc34 z-RHTQxhJlDa{m*!T=zf2)y(}b^j_w!{xVI?+{OCvZ2l{_Je&UpFVDQc!_~?&?;mjI zIhHk6Gtb1ZPoA%U%QgNdT+KZHP48u%>i?ptnWs2;{s&y1&9B1CJYR#Wm3h7ncb;Qe zPc`#Q4Ew}*16-cXZ^F3+WX7Dr?k(_Q268sP4cAuPINlSr_5UXY)E>=cpUc=eJtsuL~~cuLoCi4L6|A8mg~PQ*#Z)sc8hbzE3tpQ_mW1 z1Xe58a3nluY}RmNH1)pfY~BQ{=KRFl6r6e1B-hS0On&yMaWimPZ56Du8}x- zi~*a+7PRELH=25E#)9+tjmdAFqu=CWjCTiY`#I;ZE`+>{c$HCRi{Uh{VKC9LD zr>U8{IC+f+XTKz`1JKlCGXY$$??5#5)C*j(u>+>*rwfH;|Y@GZKLOa;sNl@4R2zs^HPX(KE_%yIJC8rLsHtNnl zie4@DGr;Biqv6ia?|62CDY5zQQMGkkKhx2YAS_&Xl#9QEvh zd0=(F(;1@|tQMaqfQ=jeY4Ez#7oR7>wNZEee0nwQU5C$rtvmBi1{=e5^1JZ@ur})X zeT|dA>X~y2SS|ciuyJw^PXm*F^WQLQ>$th}(L0u0P6wCIhcn=6J|E7c_wxCm{#lxu z&j)d0p9R)$_~*cF1kZYW93>>(AvcpsD9Rd=aeX{LDKCoO#wG*Uo21a-h@2tmmt6ZPb(3`QZBV?`vr4spkT)n)5U7>)_0@ z9=UeblicjhZ8*ta2rmE3_YJu7^Jl(q!nIN7NN(<*1Npt?i{Rg)Id1cEe`u3jzKvf0 z-KLAt)DNnjkKY07$1B=*D@}jVE~zyAPi(aQy8_=sbH4VD>)U^4t6u&$dHmgSuT5w> zu_nvYZ%Xq%ZBC#6zRhLK8&dG=3huw{xv|D?uJKz7J{0@g3-0_o3-0(`1vjU~1$X@J zf;)b1!5zQ9#viEh2Mccehim+i8h@%rd7#Q7n-9_L4B>hXC4*m>&u^j@op zz0WsQ*n50Sg}uMGRoHubM}@twIj_z4$8i1W%X=y}SNGOEVCOpK{gtPV`@q)bnD<(q zc|QSXp7&jzv7Z)W?g4qm9xBG%4|2ym7k&n|R^z!hC=4uJnAoTe+^d4^X@sYn$0WEyF8 tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else +#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. + */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, predicationTex); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float3 Cleftleft = SMAASamplePoint(colorTex, offset[2].xy).rgb; + t = abs(C - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(C - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. + */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. + */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. + #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[1].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? + SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout(binding = 3, set = 2) uniform sampler2D samplerArea; +layout(binding = 4, set = 2) uniform sampler2D samplerSearch; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() { + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) / invResolution_data; + vec2 pixCoord; + vec4 offset[3]; + + SMAABlendingWeightCalculationVS( coord, pixCoord, offset); + + vec4 oColor = SMAABlendingWeightCalculationPS(coord, pixCoord, offset, inputImg, samplerArea, samplerSearch, ivec4(0)); + + imageStore(imgOutput, texelCoord, oColor); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaBlend.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaBlend.spv new file mode 100644 index 0000000000000000000000000000000000000000..8efa011f77f3c49ed762e7465ae685ebd01b8348 GIT binary patch literal 33728 zcmai-2b@*a`L(Yw17h#JjlCDFSOGymQBdq9IKqG^G6(}2D=4N|5-TP#i7kn-L}N78 zSYnDPc1_b5jnNca>>6YFp6AY4%$^hfzi(%D_FC(G-`)2<=UztW+h?8rjlK&t7HKTp zIH#iNXCr4Ra!+k426y*p=5*l_ky19zG|sXL?Vzc}TD&S?`V z;H~tf(U{#nV@mr`Q#-n~7W$+%b(Vr2t5RC$(5W5MCQg|)`Jj#|laKD&y?yG0SyS7) zrgTm_aNLGzX7;3Q4%_;Ootky8L|j+Li4!_IXDn!MWwhy2O0A`_3R>r+Ni#dTY;V!V znpJzlIwo{Z>=-tsee%#LQ>RSpXrD1;!i0{QGn|KqmVsft^t zh}*o18%GNhj<&t6Hk>(ds|8c%*l=l%X=CFiPVShwf5(Z@HUab=XWz!AXcMQ*X!db4 zW5|we8%pQ0r>%kiXIpNTdRtqe)!Q0~mbSJwhTO`I+?mq0hRo<_cT~L^nzU6lrk2JI z__gEXklZ7K;1j3JWHq|yct^_aI$IhB^{mwrjbU)x$ZlvZSN8^)*=(=(L{{_P+StuH zd(C4@V=!7zKUy0@;PtT%HICfc=-=3#a^|d~X0{(UeQL)jwkDecsX6vNDSPI5bjP&G zT}Lk%|AMi%Hbz)SbKM<%_jpHAX1t@oJ>y-Zu`j%*-h)SXOzP@2Osn=>?QI>s#@E`| z2d#Hot&K5g^)Ze$mLuwZIS5>|x1T5Pe;UNcHpdxPYjc0JdYcEJ)!RJK*k+rHG>!oG zw0Xz@(?6p@+dIUVT6>40)!RD^t=`_@##Zf302l2I>pWrF|Fmd(_&E zdq)}DY;Qp0Sg`GRE}9K_z*)HNALp{AF$tfZz1Y&23@^{b(bmUd^bVRL<`5cp47g{% zEZLX=x2<6vT^$p;Mt07aa#H8CuJ)?` zlz9lcU*3T`x7s%1IbPa!fXCJjp58vA{kW!YYvWjLy`HBnjkD2u)}wFZ0(ke__ibDR z?%wk)jf=rOb+t6+*Z8G1enXA>zEQ^CRO9!-o8#aHc8pKSt1-UP%eQai4Q&`5|3|%S zEsc+B{L>om)2DYoT5G(2jW1E-OV#+&HNIAjuU+Ho)c9`ja@_mLo8#tQ5Le^us>RK! z@e^wNq#8f9#!s*DFT-bYvh&n-9_IOkILDqjOgCy%wptC$B`C)IOoBlgryX_Sw&fd*L(UUh5Ut(pbNx_u1SK zp1BwZF6V8V8sDzQhr@gJSxaM&ntjh+yl-Pa_{3@-9tiG!R<$$^@8#FhII_lP!e<^o zqig)c?sGgFo@d)BHQ&=}d`^v@QRC;;`1v({QH@^(FUNd!&3;WU-nVfRd}1}`TfiCf zoxS{88u!%rWAJj!Pa0c2&;L~OeWu2rt?}n-{N);dt;XNz#rrlsf=}(7z^+@k`8ja% z)bS&xb{^F}b=0)kofDd$wnh!B#<*B(qxtUPeY-r^cMtdL>R|SM2QL}TPhQ@$8=y7k zvgy;Wv5B$MX7ATAvvcY!Kf|H-qfK>FFWdNu?OpBN`(d-nZaV|PJb$ZJw6`?2Ml0`@ zZS-023~Om@SL55)_)f;}-Z?dAZ)fwMFA&3BsdWA__AZutJ=M~^v( zKJ;(wSJU?%K5Wzhd=St!u4Ws_`$6}3`ZxBk*$x=J|9(R-j$l50*WeKd>gK*<$O&oM z{jOk9@M`3ds9C>t;HDAm4K=RtroZdAcGJ?s+u*S~-`Y#A-8Suyb6?u!+PV6gcGsb_ z%eANf&ZqYH%e9-V-?_CdliWF#o8``Q5LSNJg~|6Ig8RU zyPsk|54=#t9m9NbqcLZ!@@n=jm8&;xlg_gT8o@``Q>Dj^&9`4CYy6_ z_;;JUv4DTC$*Lyu=dhFzPF(}e`)t_FzJEtpLFw~ zsp}rNKI$ps`*73uioxNJH~I7?_pH1`F57y&X`geSXX`Dn^WfQW4c?}74b)Q~U(J%9 zUr_%5_@qgbc8h)WroG3%t(!AveuKtc1y?gCX7i@coL$4GHu)7p5_23}%^aWO!4E(5 z&`#kq;8!m3@{Wm_3D-wGKC{3NKQwwl%xw6TOB}Nx<^;Gt>WMiOyxyo~c1^wKz#XGG z+J#`}(ft_hdNjFu>be_^G}yWRBiMbCwGjUa?NgFE1KMFoO|u3XrB~&?SU#z zJ#mj#aiz9sOLuJcpY^X)Y29sh-;EofIleM`r~DV)JH%3ojWl=}{6@;T->e1|iI`wmxf-{DH` zJDe%=a{$~w$J&&qsq^hSotoKa>UKIL`N@8_l1<$l%)Uje>#!L|E2Cw94?bHerab53|0+|N1T`ujPhr?6TaFDma)%uc8sn)6$te$%Mfvv}RPCfn6)KkyGV70QI z0dV!PwB=b@1Z>^vyRt&Ahnm>6@I3y9HCr6)+F8CwpO&Otifq!pbY-`+Kklz(z}9gy zZTY^n99YeB^C3R+CCJug-D2By&C~7*mCe$2SE5{*Y_i=|D!XOcT@`E{X?HcSTH2Mb zNVYE96+5r)zct9tLDt7{tXb))e=WG$f3TtrPBjvJqHqE`5`43|3nkU;F0Wvjy4v$T_wi{kE+1 zFI90{!Hre7UC)_X{I>m53rBwHPd8QIKsM3t{tje|dxoJnRzS=yu z^27}Wr#;W0Jhq`=>oU&sC{LZkz}BU0cS^ZTjI)fxgy1f^tvtsEWHDdx2dK?bbUQY|I3>zS`B?yY6H6pQZlpk-fp?p4bPj<}=Xt zwW%4Ov*A9LdmgRNGWCrCmwRb7xV0fvo>LvVHSD_##>Ted_x%rOA6i{Yzvu>k(Vec9dt6tv~hn z96YDe<8v!$2-^ko#n{#Q zTvzD>DQ)8#vT?3q8|C$6=h*wxvB{0e*nR-6kL|`vE5~*dn!eiVWBVbRcC)GMXU6s; z^m1&sz}3vAGA3iYwbC7%ZQM*YE@S&K*snPbPh4E#Q_diLM_m8NcT4<`Pn;EW^j51^?h{%2q{vu(S7_pQ#y&%yVT zP0@Y}0^v_s+19mL8k=<`{ zW6C>X5%`*9)9U2h5o>_G3)Z5vzU?S~OI9!M(1$8LK978c&!cGSp78;ckC7iHtDjkE zPgMHjl#6l8>-Qwt@wJe(C;ln0?P^cEzXPXlj#aK-_WbWFyLCI}Kaj24tnCxVs+MP$ z`(|CTX>GFe(nh%s*?C!yGW+A{O3(gy2Cil{^&!Xi&tU6_@3UYvvq|5tFemo$FJ$e` zmG53`w+ zYD-_8i*CJ7<0G(hXX$6BPbfbnn|$A}GVPYx4-0`EulvFI^!)ZgQ@?|K+7I3WR`WaC z_nDu5lqTPgw6&7etXrJA`-81}IdbY=7)|}Is_sR=YJUG~A5)LzyR5qHFABCD`>nri z*tYt|$d1#t#kTVnJo9vOr;hVxUze~W?OwpFCH080hY&}as^ws8^Tc2lOX}Dt> z!#jk2%fQvlCJSb|dFN|i7C~*G8o+vu$y_Z5w!ZQ#FOO!N$$hDhT4GiJ8*?5xV^|R_ zZ5gAFT4GkJV#@Qsaus8LjnPNVv2K923fQxov9Aj69((tD;c9R-v&rIM*3Nla17f^B z*6;UtITLGwwVO>Aw!ayrwyfP+V6}_cGw#oI;A(4=Gp=>P_Q}0w9diBLQ=S3$howB{ z#C?-7tyjgTZ|lR=%=Wu4*?UgCyyvz-H0anZ1Jqm2{{?s;-tY+Kd)V(Lzy34zJFSvT{ z?$Kbi0c87_dMtBy?+vyc`>nri*tYr*vg5REvFq+#xewU6$vz(g_UvSS$HLW2(e|x0 z<;?kh5Ph|kclS8BW6RyWKUmFdvMBHF0}#~ggMG7JV=`9mw?1mlop<-)VDBUOGL%QayT{&rcYhA9W;R*m z?j8>@UZ48ieI%N8v&q6fd#`EB+8qT})8_ppAH;d~9dQD9O-l8|swH+J*x20r9dNa^ z$(fT$V8@etU#_3~(evZpvMkToP8r+!hPPp~?`O$fs4(F%2ACCj*ua9$RUD^}d z1y)P!iC})3u`>bMPar2&d*=4@U}MWW`6Rgd-0Ds~8LZYuPTVQr#96OgKl|jpej3<$ z$n)TI_^D*|)G-Ha9iETG&4sI{?K8w=;}dr#IB~Wu*UvgU5A(pTQ|9Na%0GMl3vhkZ z>wEr-Xxhyt3*QOU%e^tIX?0F}KKtxe;K3Dk?S~Y2XocG-cPG0~hEq2Gtc>z(+R6KY zJ{J_+x-P2mi)(y-jbB>hH`MrzHGWgU+i35;8h@(BUn#i#d!yhB!#^sx_K$1)(;D}8 z%rk!dTMO>l>R)i}{*HO+@9&tG+}|-T`C2vZ@0gc%f5$w$jr#o^^OE~J=Ha&Q@0o`? zUVqm-T)V$(UUGleyyX6_dCC1<^Ki%O@0!cq=kASj$T>Un;D1+2C@ z+0S+7Q<{7}Yx^qsdt~>aIA`V(@DTiL*F7c|yT2|Z5325|uYvs@z`d`ZcK2NB`Fh2T zb??g4|8IcZ-{mv)GPre`jL}CeG2a9mQ$8QR1viGDW{f^+iTO6zw#$3wa=3ct=?bvg z0P?kbF7yt&60Gifh4YvCt^%juj#J;O(bVTwcjkA%YSxjs?}8KO*yQ^8+;Ux91J@${ z*MQ@nccAa1xd%+vp^sW(t_9cU=Q=cF_-V%IqgJ+S`&qLefQMAR-WPJ)@~*g^oVEWU zIBTz;cJGhWb92Rwb^hh)=a0blvwSzY1#X=tWAsr=%&lN!d=JQc{1|QwKg}3@)Dm+W z*qHK++zwZd&mEOdxj*hiQ=eO{=Urgq)U#)PA|~HW_H5ltY4U7oyN9gix{A}@{os0^ zeu}1kdbOV(0IQ|_pMljjC#RmDgX{JD0!=+WzpQ-9u|0^U9-m)*hR?6j)YG5efYtJy z@NdCt_9ve?9|GG?`(PXTil(o&p3i9Af6nt5ns&3PDw<{X z+~bHtu;tH&o`9>FO;yn>^IiRui0Z!kI|kb^&OPY!PrLPRfcf2f+P{bElh2WV z0IQiz76(^<2K5ZYczt}w^GWB6&t=%bePdKGMp^}RwV_p^#?`5Jg#O7-}^4%R>W z>EB>AeOvKuA^UvxJ51};SD(bc1@4aT{w(%3T+M894&(by&G%innwg*GT<0DCy-IV> zz6rKo{q4gWl-g78```=6S%(k6YUZqA{tV+kXalOe7$b8XKZNts{EVsH_}dx1C#g}?FoX-^#sVN=W4=sEk7dBy;OF_(PrL8G zsb|@W8+#G;%QNQXz|LcNzbubtohD=SQA^AUU}MTNyCRw~$jumi)a--z=Stw*FV4lv z@FA4BBUgdzqpp2*D)J7=^L|xyeN1@{uLf7w*XQsWmEGsCw$;f4$mngPo)D4&DIGI!(stqn4Nr!N!#L+D2%` zAU9+5QA^CmVAr&K&ToRIo_X37tmgU7JMLz1_4+&R=4k1+tWcyc*VsbhPv z?YiGHz8%oi<1?u8Dd%)YH1)aF9P9)(PCfIyGg$p@>UTfx3O2bPwe3Pyb1uYbZ!oyt zry*$Sc@G<^znS*^e7HNf9y1J0Ju$<=MZq&pF`#3^ye_Jn%PwL=Lj_8^+|sY2d6)u1M6c-f5yY@k7vOC z$n{Hqjs%zeX_u4JpQFHPW`3xlXMQFi7_U$I<6WQFiRk*69fv9X>Oiw!<+EoJ+!&L6 z)kiHclflN6_r}q1W31m8ebgME>wOH^nDpsbu$tLq5#Onh()T#Hn%Vjtd*0usRa)NP ztXF^gFojZk>g@!3Uekx^a5XbO&3g0xc05A;{cQ%C?fYj0+Ktcq+f1=GX8C;I-6u6q%lyx}?t}S(*22Q_D z1tnV*ru1VTTKR5w7F^93 z+tQ}yb7)se?|jR|e*vtI-!Y5x9kYGjv)K~&GUfKv##nxjO8M*_0Fx{b1w#~ zUC8*-rds?j1zSh_zXn#DPfq=6@&6{+`qQ6pfz>V}r+&5ge;aK5@xL6b7JqGO`g_-3 z0d{^9e3pR`tj3zU(Mb7Jp}D0_kz6K z3;TAPn*Z%>_QKw1!TVI$eKDrO?u)Tx_r-pc*$3BBclL!oj}+Ya#|p0fiHbYN#$HEu zKVMI2-Io4&_75m;B%8F~RM{(VEdi-saxRssQW(kV@i|n zQ`&ANtJxQE+PV#Ff6Mo`+u`b|`%bXh9c25QdMxwj+joI&$8qRy8}{QTl<9{)cazoZ zhdBMX2W&seG2aVUzm>D%cL(=@)%*-&KT?ln`f)$lcI=z}w&5J8-#~W$ZCmVIx~G2% zcAsR=J^*+AvnPKB*GHY7X8XDKevY8sY;tdB4`|D~-mk#k6Xkb%zpk|MUGFz&`fBTW z*X#a{;kRho%_iGQpC3ZA&pBrg!*k9a1RJZ5ebZ07bLd`lJ}l+gug+8IejM!D80)^5 zi{10Ch2KxOC!d6SAA8R9({6vX{SKV|JPOwLDYEr=wzOxieh;qC)gLOYoU1>g>8mYs zmAm3kXxdGV!#+F>x2^2CXTbGy_GdKp^!r(G`fZ)op^x{Ew(|ZtsM#9t>6E#1#ua$~ z3VVMXP+{+m1Iezz!IZgwo~Q2e{#l3@qj2N<6x=#mD(-w5oBQXlVC%N@{`ni_i)54Q z{ZeJO%)I;^Z2!u$_zyJo%;i79_B+2{^DnqQ>Yn>oC{3O{Z7-A6?29;Uy$ZI!<^A&- zTs`;C>tMCqKdHwu=lbZa30juTy zNj;Y7$Gc$Lv2Xg@hI63)0@?YuZLxFd{qr8!eUkI`KHT}w{qq4_A9a3u-9P_9&~7%l zSF;DS}h^|6eKWw~FSr_{Y2*tJPNwg;=3 zZNp@}o(XMY&qMjHGzjipF1U3ZQ*qbD*xXHffUVooyJ;llD6+}DuxDkr z%)IOcwtwY09gU{$esM1M2HWqvEA0c;Y{# zQ%8d9XR5u@$}@Eon!eg{rgA4tK+|qE*;e{I5zRj5Zt8$*_dAjCU}N>MZ~AF>|G5{P z4@-IWtMin)j|RIo>BkhXn%Oo?*6W$jCiXm(?@`CXy_=3C>!&?ud@4A7^={HPca!H% zd*<;taD5)9Ra!ZZooM=M%RJ_8nvSO3HuD~JJY3x|d%k9XZ9DsOCfIo^_h%QHdd4ve zoN-v6b?D<=r7d^WBGhv%+0;SK{WPh<&HJUo-YrKLcuIkf0lRinDRXa~NITwJ8OJGb zB)4}Q|lk?wB&H+2N=A41EH|ByTqi5fpQTe&=P68XJk9)$t zX;18#V70`~184qb0<@n%nON-^$5~*+>aW>OU1=9r~n>FN0Hu zZOVRJ^oKM~Eh44#i{K|@FJQu*NC;Y;S=iFZeS2NpxQ+`+F;!4{R&3g28 z4()@s)OjgbEqm>2V72*V{q3V#*5vD8{mbXmH{j~=xvcUj*Z!Mm>T~&?&-45(uyN|? z%eTR{m$Ps=SUnUG`^1E=? zF?0MqxSHAIJm=3fzmI0TKK9M$;5FdfYuAGHG3EFFuY=nMZRzLrVEbP7^9Hzje$Vg+ zU^U|tcOy7)j$f`{>^FfOf4L4ngsW%#H-pRZ{|H`={}#BK+4DfLVEjLZ7_U#pe=9iS zzYVO9DdWE#ZXdLzpLc*W4*TW)xD!o1C*b<}--D)}@!tzB z$A2HZ9RK}rHM6N)|DT~5uTRGR0663SIanW4#{UbrebAPE{u1ms%5{1WuAcG#3an;) z;(iTIoa2}47yEC(_4WTPntI0n5V#!w!|-zakHFQ;_4R)Y&3Jt>{zt(X|KnhN%JDw| zw-4IV&nLkdN7nx-H1&-CcVIQ+6Zd;?;vBzRKkKkRf2i!aPyPt^jGCkU3C;HxbDn!o zqq)wTRCO4qU+R4Z?7g2qPx~`m%{cp^O|ASpES$R&$)@sI(gpXq;Mx%qFVy&J6;Gf40=J&<=PI6i?s>SH*|D4QEct7tZHZ<*`Z`Ygpe=R& z4Xl=D$%|mM`DFd=qgvMQC9wYGz4v#xdVKy-`IKklpJ?iHt7pl-z{aVkFE4{_FYEjY zSU+=~C9i_@)0Y0JiJialS@JsE`pUlk8?J7fK1<#JyPk5NC2zuA$IS6ta5b~ZdCs%s zZ8YQc$ys_4hn%H%(DgAUcgeeO`=BlJ^&Z&1m;HPnuAZ~>0a(rW#Qg`HIL9y7FZK_? zj=x-o|H9QX{*S=r_&MIOAtZ8~T*vUkJ@UXiGo)fHMyJ z<+G$OntI0H0#-9VajoFQIexi*vG)Vl*S|lSdd9ynxE%igcsc$>;A&=5x&Dix8Lv;q z@827m@h=Y6ryTzhaQmPw{ah04ILdWe3QaxZUmC1teBzb?C(iN9^^1L3aDDxkLsQTA zmj{>QUjbf@e?_>OxxW4@qZzMH#_!*en(?m!)~6i*s&M(7$4(dOf8&a-44H2dtc#5#=AFZHeq_TJC4WIec= zarQ%-TKO#TZaIT&I*FX;$H@ggrNS#vo?77zDNif#=>?us;Wo;-WS=)@Qu@46Tc19Z z&l~?8k&^pwiIm)bOC;R*?F#-je0afq_U%#d?cn}fB8k`TzaMp!HxId6^Xy&J-^`EFDkg>zpCKcuP(UlUsG`H{<|Wn zU;8Zu*Y3YBQrhn+xc0{iZv2x4*Y3YDQpP`1Ra=teMtCjcS=5Y0V zuet?T&HCS`pId=VeqX6=OR}10L!AC@4Yt3I&2u1+ZCkMIYV%CUwRwiO2hXkIJRkBw zJh8lQb^s5fbY48C)@!_e8Pg!Jb2$%t?#UhD#+qzXAGO5n1UANbNX*V~V={O8s3m3> zaN4o&K2vu^Q;*MXl}|apgVEIUo!$_zTE0IX3br0~=X)5X$@$i{JNYuQ^DVZmHtZw7 z-f5SUGnXUL)Uz)}fz`~WjBO9Nwjunsk>_A9uyx%=o=493Mx$$U4!w)y{+^^`-y7`j zp}975$kwS(=3*bP_uD*d#>}qeud(RbQqR6%>(R%0S=pCSk0XG1?%#z*OvKg2WJh-`8*0;n=waH%Cm+Ozz5Nftf6)4 zlX;s6b{_n_YUAa`#NJWZGxw9w)Z;T5?A&MlxiiSSUrF~LQzq-I`iJ1kqo;AtZjG0NPZjAf(1aP@;PlT8I_VaKxvvXp7 z`K)ykn(_KLR`2`rv)0Mz`j~9ncLlZd_f)X`(RK=@T$^{p>0tM|w$mu(vCRcrm$o^S z^6Zr}<;0i!cOII${%26i^L#i9e5~z}?SpMur+%sP3*hz1^YF>J{36^~llAJOmY6Sr zjj*L(`Ec^=CcC$CX3RW|l()M|9 zZN_
!S;{C6)Gg4N6>lRyg8}{#j^Q^OO*ZR9? z>RIdWfz`~WjQeW1w%qgI2j`ymUQS=HMc3y1UPCF*J%1he2z+zTTcf8s3ejp0FglxGd^2T#B^YiOPNWZr%Xb{_JqmKziM z1BE^9{R~Y#K0gOL_t^(>eezuX1-SmK{$-_=&*}%!^wm~>R{si3yV?0M+aK?tU!&Q6 z_;0}Fv--DiHM8qv&a?UOm@E1jaDk85F{)YGrW z!D@+l0&G1#tF;;P7^S)~?%Sup<-YwLyxh0HhpU-Q?w>rX|A1z^K6zGO!^CCp{t;at z`(WEog4NRBr@{6|+n*@q+Pvfb40f+;dxlaT+h4%erR`ZtdG^Zl;JL)lBgg-*XzKbu zM=8&<`UP;F)%L-*tW&?#`8V+TWcP5!_9EO^llAJOmYA2o#(0hr^LMy0nGb!`67vtR z&+2*P^!=Y`>hbv(*!7uLwJX<0fA8;Cz_y#S|0-C`T%P@x;o6LQi@4Xp)|+Sbzrku| zlYKT>hvR(>T_59J`!~VvJ#BAL%JZy#3+$h-xX*0YSbY-vHdsAld tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else +#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. + */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, predicationTex); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float3 Cleftleft = SMAASamplePoint(colorTex, offset[2].xy).rgb; + t = abs(C - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(C - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. + */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. + */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. + #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[1].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? + SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() +{ + vec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) / invResolution_data; + vec4 offset[3]; + SMAAEdgeDetectionVS(coord, offset); + vec2 oColor = SMAAColorEdgeDetectionPS(coord, offset, inputImg); + if (oColor != float2(-2.0, -2.0)) + { + imageStore(imgOutput, texelCoord, vec4(oColor, 0.0, 1.0)); + } + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaEdge.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaEdge.spv new file mode 100644 index 0000000000000000000000000000000000000000..1062a9e3abd0624aec557d24c06e1a161dec6874 GIT binary patch literal 8464 zcmZ9Q33OG}6^1X6m!RNOkwH<3D54-DqKJqPl8{7$K@tSC7N6k-Vv{^d9+Xt&aRhtmfQ31e^2L}eeS(4Oc+1ADVs1Z zJ20D=Z5o&5Z`oRDtJEs}wZZDhMZI%(4J=x=Yhdvu zr(e2^xDs`Mb;o`Q*Y1~abkS+c#&VRh zX@L5?)A!3;%4VSVo3pR7dn~RoI~K5}y>DGt>*ftzZGD}=H$TmHws&=O_MyA}Rm`<* zJ1V2wszYv|#JO}Zx8}GZ>lP1H2G~k&>t!Cp4cSKUmda4AoZA|*0p`|PbvqNj4cV2< z70z!o>HC;lcMNXbmY+*QcC-5B)mp7OjDIP+4Sl%0yDg>6xjXf-{{Pp|klias9ZMPa zAo{@INO>qlx`%(l4ORQGPR#GY)}hTEL)FX6LtP`gs{LiV+0}-RzKz_0!GsS1UyTRu{8;KfRQ78Z*4LXJ?JL zJf<=0h4Y?nUc0Wnx4mz3ch3fLHD!Hu{F?T*uJvoI)0C~RvvqQ}#@5i3T~udV-`%&a z1!D*6^PXY`jp@xv_ZimivpWfyiF8sa&T}@pqk}V)^BLCP^E@uMJgu8E?4FVKqOKh} zkM}&Z`$jRJx^{=UXTyK_Q1?vK6?M-+9Qob5dgR-P;(mN)564TtW~|!9Q$dZ)vyf@z zYhqT{HVb_svz(&+0A}r;-x2)(K;~J9V_wp&Xa1S!lbMZ?Gv7hX=8|vbd=F-}mUXnv z!257!Ym7sRBlM%lJ_Un*<~$-f$u$!>mDw7RQ@d-mr?U|AxK{O}5r?`pkIA(?D;6c) z`;ui3w}sxE>o0B#{}Xb3&)uQV$@MIo&kEf4iHJV(ir6*JV|I=D#9B@PyS9j31a>_U zD-Pc!V9zb;o&#pw6S2!tja z$uGmM?sF{X`W{Qzm{`xh!QNr>jmL2&*!AgOLU%uM`a2&4r@!`T$!?r>_k9wWu_E?x zGP;~{3@i1WVNT;u0q1AI?0VIGk3{`b6V_k5_t*D`!}Yd+Jum%zURI;KhU5Yo8`@*d z9m%M#In{mFIK12L(RYm_bl){a-FHn<_g&+NGu?phZ;<({#$NdDkyE@6XL8?SO{jVQ z9)l@?U6aqN-_O{$eefGu)csC|ZXf(shHkvy$Bu)yWhCbwfl_=UAy17qVD%B zbp8FFskg9d@73docg^QvIeYMqJ%PwyhmZPy66c+}DxLL{iH|%_p_|8Ziah@X%SRp? zELY4k4o-fpXUoh_+05MXRcg7`hQy!CiF>~4V$b$y^vCHz6WCrjKSJCA%m*S4?URz- z+4Xr}CWFm!JGH%!2ZQB&R-V8|eIjBmbBnEO52Nm($>wa`XD}a*IIKG**`1^A5nyvf z-Kk)?sH=V$VlL~7J!jujGr*of^vmbwNI3b(e-zk#d4Aewg5|Czhx*ZAW8_EDSsVkF zk2=SKtrPt;ZWf$;)M*C0kEnATST5?!1{)(Eb&dziN54-1TU*|<^?8u9#(3rv5xKYv za}#F`@5VfIIp0^V{Z!_Ki0>nNWFGxaOZ>hRw+P)>dF%Sz$%X$JU^#6*bLvYuJ)i$2 zVE@jM_xUr%-vehQ-58%+^~iS)IP&>Ssz=VP1vE?Irdm*600hP5h(8w}TztbNMzz&if`d?`q~Qq%-OE<2=NEMBVej z*1eFr-jQywoFlk3iIZN-uKdk#0gArb=Aio>?Wd;otfAfKC)VVw?z80@on3>^aSzyg zxQ8{UuLaBbd&INSCTG0QsqsEr>b@h?og?3c;6^0!tpm$NK5cTw$9u$>?|a{ z6W$SrzY&+P4&TrI-7EH8?6)Rg3$yn!f5yO`g}>W^J1%j}%yE9R!Ltx;`fBUpWIfyC z5$%eDrRBpp`ZgErS|a8ou$6vA46p^?O^IxZ>W}UfK}*GKIMl zanI4GF0f~_F}?531M4F{9+`vm@Lc&f$@z$O#bFQiv5se?J?g9hTPJ#V0oZeoZ=KI+ z4skk=U2zoGWv{fmF89!fxYp3ugY~(VcZ2qe5Pjq) z^QM#AfS9uZ(e8fjxxD-H%r8OYHX-J2WPTvJg*KAXYvlQ{unE<>y}7cpLY z#9t0JzKleCKUhBEw}2!5GO+fDuYkRquF*ZH>u)aa##W@*vu)`0JsX6RkDgrtmQ(D@ z1SER)Y_K`?sqfhkoOZ?Gn)PvA1I*f8m*+o>6ni#;u21x=ntbG=XWPN%v}f9*XYz3# z&jHu>YzLft_>3l>V$W)D@^R)n!ExrsYmfL{;QF3D7fwFncY`C|T-qc4d0=~HefOZQ zzh~3L{Cs3p(tW0`LiCURydZJ%4(nLg*cT>!A+ve(wTAB}ZBgSzV7VL9xBqIe+#ci( zxNDfNMdaro{%*UT+2QXtZPy`kzH7x#G4D;feB8m8fQxtVrRdXJAidwhA~ z@DD8&)$k&-?O*D$w$v_0m~_#dGWpRc64*zg4Fly9dOze z$2cTqi62{o70|YkDkfLdAt`~-?R6@$%oJTlTWc{ zAApmOGyfpiTK3F%?GgVWaDC4{3@0D)w}K%l;rlUg(f1B?ImNTLcdn_4`Qu2e$(;IH|F&d{J98I$@y>h#eLAuPiMpQz%f}i& z1(s7B)`{o*(_nMzh>U7xr!cPAhDxHI>F z&FP)d9(U#*aP;HzVEefVv6j#I7ZCaI`65`K;+^>tx_tEf%V2AHXN=b#@m~QOUq+(F zSHbcT|21&Ln@fAde;sU}jdu^~`kQMC^EZ$gi2n1K??dW)c7NidXWvAZQ#^C;O7!eo zU~}rDt=O|~qia_ju2~<~buY8_=-GF`#h!f^U7zUL_rUtdN6)?wHm5z)9zBzf^Y{VS z^DOr4hv@R*^P}Wb?Aedu$8kzry$i<%i1TOCB&**ZB!<;v8XWi3Z5bX~k{@b41 uUlDUEvG>1$ja!IlH!jZf@8Bzu>r?#0U^!!4uQs{x{|8vk-#lXf?d5;9EWxh; literal 0 HcmV?d00001 diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.glsl b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.glsl new file mode 100644 index 000000000..df30d727b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.glsl @@ -0,0 +1,1403 @@ +#version 430 core +#define SMAA_GLSL_4 1 + +layout (constant_id = 0) const int SMAA_PRESET_LOW = 0; +layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0; +layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0; +layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0; +layout (constant_id = 4) const float METRIC_WIDTH = 1920.0; +layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0; + +#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT) + +layout (local_size_x = 16, local_size_y = 16) in; +/** + * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com) + * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com) + * Copyright (C) 2013 Belen Masia (bmasia@unizar.es) + * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com) + * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to + * do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. As clarification, there + * is no requirement that the copyright notice and permission be included in + * binary distributions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * _______ ___ ___ ___ ___ + * / || \/ | / \ / \ + * | (---- | \ / | / ^ \ / ^ \ + * \ \ | |\/| | / /_\ \ / /_\ \ + * ----) | | | | | / _____ \ / _____ \ + * |_______/ |__| |__| /__/ \__\ /__/ \__\ + * + * E N H A N C E D + * S U B P I X E L M O R P H O L O G I C A L A N T I A L I A S I N G + * + * http://www.iryoku.com/smaa/ + * + * Hi, welcome aboard! + * + * Here you'll find instructions to get the shader up and running as fast as + * possible. + * + * IMPORTANTE NOTICE: when updating, remember to update both this file and the + * precomputed textures! They may change from version to version. + * + * The shader has three passes, chained together as follows: + * + * |input|------------------ + * v | + * [ SMAA*EdgeDetection ] | + * v | + * |edgesTex| | + * v | + * [ SMAABlendingWeightCalculation ] | + * v | + * |blendTex| | + * v | + * [ SMAANeighborhoodBlending ] <------ + * v + * |output| + * + * Note that each [pass] has its own vertex and pixel shader. Remember to use + * oversized triangles instead of quads to avoid overshading along the + * diagonal. + * + * You've three edge detection methods to choose from: luma, color or depth. + * They represent different quality/performance and anti-aliasing/sharpness + * tradeoffs, so our recommendation is for you to choose the one that best + * suits your particular scenario: + * + * - Depth edge detection is usually the fastest but it may miss some edges. + * + * - Luma edge detection is usually more expensive than depth edge detection, + * but catches visible edges that depth edge detection can miss. + * + * - Color edge detection is usually the most expensive one but catches + * chroma-only edges. + * + * For quickstarters: just use luma edge detection. + * + * The general advice is to not rush the integration process and ensure each + * step is done correctly (don't try to integrate SMAA T2x with predicated edge + * detection from the start!). Ok then, let's go! + * + * 1. The first step is to create two RGBA temporal render targets for holding + * |edgesTex| and |blendTex|. + * + * In DX10 or DX11, you can use a RG render target for the edges texture. + * In the case of NVIDIA GPUs, using RG render targets seems to actually be + * slower. + * + * On the Xbox 360, you can use the same render target for resolving both + * |edgesTex| and |blendTex|, as they aren't needed simultaneously. + * + * 2. Both temporal render targets |edgesTex| and |blendTex| must be cleared + * each frame. Do not forget to clear the alpha channel! + * + * 3. The next step is loading the two supporting precalculated textures, + * 'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as + * C++ headers, and also as regular DDS files. They'll be needed for the + * 'SMAABlendingWeightCalculation' pass. + * + * If you use the C++ headers, be sure to load them in the format specified + * inside of them. + * + * You can also compress 'areaTex' and 'searchTex' using BC5 and BC4 + * respectively, if you have that option in your content processor pipeline. + * When compressing then, you get a non-perceptible quality decrease, and a + * marginal performance increase. + * + * 4. All samplers must be set to linear filtering and clamp. + * + * After you get the technique working, remember that 64-bit inputs have + * half-rate linear filtering on GCN. + * + * If SMAA is applied to 64-bit color buffers, switching to point filtering + * when accesing them will increase the performance. Search for + * 'SMAASamplePoint' to see which textures may benefit from point + * filtering, and where (which is basically the color input in the edge + * detection and resolve passes). + * + * 5. All texture reads and buffer writes must be non-sRGB, with the exception + * of the input read and the output write in + * 'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in + * this last pass are not possible, the technique will work anyway, but + * will perform antialiasing in gamma space. + * + * IMPORTANT: for best results the input read for the color/luma edge + * detection should *NOT* be sRGB. + * + * 6. Before including SMAA.h you'll have to setup the render target metrics, + * the target and any optional configuration defines. Optionally you can + * use a preset. + * + * You have the following targets available: + * SMAA_HLSL_3 + * SMAA_HLSL_4 + * SMAA_HLSL_4_1 + * SMAA_GLSL_3 * + * SMAA_GLSL_4 * + * + * * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below). + * + * And four presets: + * SMAA_PRESET_LOW (%60 of the quality) + * SMAA_PRESET_MEDIUM (%80 of the quality) + * SMAA_PRESET_HIGH (%95 of the quality) + * SMAA_PRESET_ULTRA (%99 of the quality) + * + * For example: + * #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0) + * #define SMAA_HLSL_4 + * #define SMAA_PRESET_HIGH + * #include "SMAA.h" + * + * Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a + * uniform variable. The code is designed to minimize the impact of not + * using a constant value, but it is still better to hardcode it. + * + * Depending on how you encoded 'areaTex' and 'searchTex', you may have to + * add (and customize) the following defines before including SMAA.h: + * #define SMAA_AREATEX_SELECT(sample) sample.rg + * #define SMAA_SEARCHTEX_SELECT(sample) sample.r + * + * If your engine is already using porting macros, you can define + * SMAA_CUSTOM_SL, and define the porting functions by yourself. + * + * 7. Then, you'll have to setup the passes as indicated in the scheme above. + * You can take a look into SMAA.fx, to see how we did it for our demo. + * Checkout the function wrappers, you may want to copy-paste them! + * + * 8. It's recommended to validate the produced |edgesTex| and |blendTex|. + * You can use a screenshot from your engine to compare the |edgesTex| + * and |blendTex| produced inside of the engine with the results obtained + * with the reference demo. + * + * 9. After you get the last pass to work, it's time to optimize. You'll have + * to initialize a stencil buffer in the first pass (discard is already in + * the code), then mask execution by using it the second pass. The last + * pass should be executed in all pixels. + * + * + * After this point you can choose to enable predicated thresholding, + * temporal supersampling and motion blur integration: + * + * a) If you want to use predicated thresholding, take a look into + * SMAA_PREDICATION; you'll need to pass an extra texture in the edge + * detection pass. + * + * b) If you want to enable temporal supersampling (SMAA T2x): + * + * 1. The first step is to render using subpixel jitters. I won't go into + * detail, but it's as simple as moving each vertex position in the + * vertex shader, you can check how we do it in our DX10 demo. + * + * 2. Then, you must setup the temporal resolve. You may want to take a look + * into SMAAResolve for resolving 2x modes. After you get it working, you'll + * probably see ghosting everywhere. But fear not, you can enable the + * CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro. + * Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded. + * + * 3. The next step is to apply SMAA to each subpixel jittered frame, just as + * done for 1x. + * + * 4. At this point you should already have something usable, but for best + * results the proper area textures must be set depending on current jitter. + * For this, the parameter 'subsampleIndices' of + * 'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x + * mode: + * + * @SUBSAMPLE_INDICES + * + * | S# | Camera Jitter | subsampleIndices | + * +----+------------------+---------------------+ + * | 0 | ( 0.25, -0.25) | float4(1, 1, 1, 0) | + * | 1 | (-0.25, 0.25) | float4(2, 2, 2, 0) | + * + * These jitter positions assume a bottom-to-top y axis. S# stands for the + * sample number. + * + * More information about temporal supersampling here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * c) If you want to enable spatial multisampling (SMAA S2x): + * + * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be + * created with: + * - DX10: see below (*) + * - DX10.1: D3D10_STANDARD_MULTISAMPLE_PATTERN or + * - DX11: D3D11_STANDARD_MULTISAMPLE_PATTERN + * + * This allows to ensure that the subsample order matches the table in + * @SUBSAMPLE_INDICES. + * + * (*) In the case of DX10, we refer the reader to: + * - SMAA::detectMSAAOrder and + * - SMAA::msaaReorder + * + * These functions allow to match the standard multisample patterns by + * detecting the subsample order for a specific GPU, and reordering + * them appropriately. + * + * 2. A shader must be run to output each subsample into a separate buffer + * (DX10 is required). You can use SMAASeparate for this purpose, or just do + * it in an existing pass (for example, in the tone mapping pass, which has + * the advantage of feeding tone mapped subsamples to SMAA, which will yield + * better results). + * + * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing + * the results in the final buffer. The second run should alpha blend with + * the existing final buffer using a blending factor of 0.5. + * 'subsampleIndices' must be adjusted as in the SMAA T2x case (see point + * b). + * + * d) If you want to enable temporal supersampling on top of SMAA S2x + * (which actually is SMAA 4x): + * + * 1. SMAA 4x consists on temporally jittering SMAA S2x, so the first step is + * to calculate SMAA S2x for current frame. In this case, 'subsampleIndices' + * must be set as follows: + * + * | F# | S# | Camera Jitter | Net Jitter | subsampleIndices | + * +----+----+--------------------+-------------------+----------------------+ + * | 0 | 0 | ( 0.125, 0.125) | ( 0.375, -0.125) | float4(5, 3, 1, 3) | + * | 0 | 1 | ( 0.125, 0.125) | (-0.125, 0.375) | float4(4, 6, 2, 3) | + * +----+----+--------------------+-------------------+----------------------+ + * | 1 | 2 | (-0.125, -0.125) | ( 0.125, -0.375) | float4(3, 5, 1, 4) | + * | 1 | 3 | (-0.125, -0.125) | (-0.375, 0.125) | float4(6, 4, 2, 4) | + * + * These jitter positions assume a bottom-to-top y axis. F# stands for the + * frame number. S# stands for the sample number. + * + * 2. After calculating SMAA S2x for current frame (with the new subsample + * indices), previous frame must be reprojected as in SMAA T2x mode (see + * point b). + * + * e) If motion blur is used, you may want to do the edge detection pass + * together with motion blur. This has two advantages: + * + * 1. Pixels under heavy motion can be omitted from the edge detection process. + * For these pixels we can just store "no edge", as motion blur will take + * care of them. + * 2. The center pixel tap is reused. + * + * Note that in this case depth testing should be used instead of stenciling, + * as we have to write all the pixels in the motion blur pass. + * + * That's it! + */ + +//----------------------------------------------------------------------------- +// SMAA Presets + +/** + * Note that if you use one of these presets, the following configuration + * macros will be ignored if set in the "Configurable Defines" section. + */ + +#if defined(SMAA_PRESET_LOW) +#define SMAA_THRESHOLD 0.15 +#define SMAA_MAX_SEARCH_STEPS 4 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_MEDIUM) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 8 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_HIGH) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 16 +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#define SMAA_CORNER_ROUNDING 25 +#elif defined(SMAA_PRESET_ULTRA) +#define SMAA_THRESHOLD 0.05 +#define SMAA_MAX_SEARCH_STEPS 32 +#define SMAA_MAX_SEARCH_STEPS_DIAG 16 +#define SMAA_CORNER_ROUNDING 25 +#endif + +//----------------------------------------------------------------------------- +// Configurable Defines + +/** + * SMAA_THRESHOLD specifies the threshold or sensitivity to edges. + * Lowering this value you will be able to detect more edges at the expense of + * performance. + * + * Range: [0, 0.5] + * 0.1 is a reasonable value, and allows to catch most visible edges. + * 0.05 is a rather overkill value, that allows to catch 'em all. + * + * If temporal supersampling is used, 0.2 could be a reasonable value, as low + * contrast edges are properly filtered by just 2x. + */ +#ifndef SMAA_THRESHOLD +#define SMAA_THRESHOLD 0.1 +#endif + +/** + * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection. + * + * Range: depends on the depth range of the scene. + */ +#ifndef SMAA_DEPTH_THRESHOLD +#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD) +#endif + +/** + * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the + * horizontal/vertical pattern searches, at each side of the pixel. + * + * In number of pixels, it's actually the double. So the maximum line length + * perfectly handled by, for example 16, is 64 (by perfectly, we meant that + * longer lines won't look as good, but still antialiased). + * + * Range: [0, 112] + */ +#ifndef SMAA_MAX_SEARCH_STEPS +#define SMAA_MAX_SEARCH_STEPS 16 +#endif + +/** + * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the + * diagonal pattern searches, at each side of the pixel. In this case we jump + * one pixel at time, instead of two. + * + * Range: [0, 20] + * + * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 + * steps), but it can have a significant impact on older machines. + * + * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing. + */ +#ifndef SMAA_MAX_SEARCH_STEPS_DIAG +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#endif + +/** + * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded. + * + * Range: [0, 100] + * + * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing. + */ +#ifndef SMAA_CORNER_ROUNDING +#define SMAA_CORNER_ROUNDING 25 +#endif + +/** + * If there is an neighbor edge that has SMAA_LOCAL_CONTRAST_FACTOR times + * bigger contrast than current edge, current edge will be discarded. + * + * This allows to eliminate spurious crossing edges, and is based on the fact + * that, if there is too much contrast in a direction, that will hide + * perceptually contrast in the other neighbors. + */ +#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR +#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0 +#endif + +/** + * Predicated thresholding allows to better preserve texture details and to + * improve performance, by decreasing the number of detected edges using an + * additional buffer like the light accumulation buffer, object ids or even the + * depth buffer (the depth buffer usage may be limited to indoor or short range + * scenes). + * + * It locally decreases the luma or color threshold if an edge is found in an + * additional buffer (so the global threshold can be higher). + * + * This method was developed by Playstation EDGE MLAA team, and used in + * Killzone 3, by using the light accumulation buffer. More information here: + * http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx + */ +#ifndef SMAA_PREDICATION +#define SMAA_PREDICATION 0 +#endif + +/** + * Threshold to be used in the additional predication buffer. + * + * Range: depends on the input, so you'll have to find the magic number that + * works for you. + */ +#ifndef SMAA_PREDICATION_THRESHOLD +#define SMAA_PREDICATION_THRESHOLD 0.01 +#endif + +/** + * How much to scale the global threshold used for luma or color edge + * detection when using predication. + * + * Range: [1, 5] + */ +#ifndef SMAA_PREDICATION_SCALE +#define SMAA_PREDICATION_SCALE 2.0 +#endif + +/** + * How much to locally decrease the threshold. + * + * Range: [0, 1] + */ +#ifndef SMAA_PREDICATION_STRENGTH +#define SMAA_PREDICATION_STRENGTH 0.4 +#endif + +/** + * Temporal reprojection allows to remove ghosting artifacts when using + * temporal supersampling. We use the CryEngine 3 method which also introduces + * velocity weighting. This feature is of extreme importance for totally + * removing ghosting. More information here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * Note that you'll need to setup a velocity buffer for enabling reprojection. + * For static geometry, saving the previous depth buffer is a viable + * alternative. + */ +#ifndef SMAA_REPROJECTION +#define SMAA_REPROJECTION 0 +#endif + +/** + * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to + * remove ghosting trails behind the moving object, which are not removed by + * just using reprojection. Using low values will exhibit ghosting, while using + * high values will disable temporal supersampling under motion. + * + * Behind the scenes, velocity weighting removes temporal supersampling when + * the velocity of the subsamples differs (meaning they are different objects). + * + * Range: [0, 80] + */ +#ifndef SMAA_REPROJECTION_WEIGHT_SCALE +#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0 +#endif + +/** + * On some compilers, discard cannot be used in vertex shaders. Thus, they need + * to be compiled separately. + */ +#ifndef SMAA_INCLUDE_VS +#define SMAA_INCLUDE_VS 1 +#endif +#ifndef SMAA_INCLUDE_PS +#define SMAA_INCLUDE_PS 1 +#endif + +//----------------------------------------------------------------------------- +// Texture Access Defines + +#ifndef SMAA_AREATEX_SELECT +#if defined(SMAA_HLSL_3) +#define SMAA_AREATEX_SELECT(sample) sample.ra +#else +#define SMAA_AREATEX_SELECT(sample) sample.rg +#endif +#endif + +#ifndef SMAA_SEARCHTEX_SELECT +#define SMAA_SEARCHTEX_SELECT(sample) sample.r +#endif + +#ifndef SMAA_DECODE_VELOCITY +#define SMAA_DECODE_VELOCITY(sample) sample.rg +#endif + +//----------------------------------------------------------------------------- +// Non-Configurable Defines + +#define SMAA_AREATEX_MAX_DISTANCE 16 +#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20 +#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0)) +#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0) +#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0) +#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0) +#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0) + +//----------------------------------------------------------------------------- +// Porting Functions + +#if defined(SMAA_HLSL_3) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) +#define SMAASample(tex, coord) tex2D(tex, coord) +#define SMAASamplePoint(tex, coord) tex2D(tex, coord) +#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * SMAA_RT_METRICS.xy) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#endif +#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) +SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +#define SMAATexture2D(tex) Texture2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0) +#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset) +#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord) +#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord) +#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#define SMAATexture2DMS2(tex) Texture2DMS tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else +#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. + */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, predicationTex); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float3 Cleftleft = SMAASamplePoint(colorTex, offset[2].xy).rgb; + t = abs(C - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(C - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. + */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. + */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! + return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. + #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[1].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? + SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout(binding = 3, set = 2) uniform sampler2D samplerBlend; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() { + vec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) / invResolution_data; + vec2 pixCoord; + vec4 offset; + + SMAANeighborhoodBlendingVS(coord, offset); + + vec4 oColor = SMAANeighborhoodBlendingPS(coord, offset, inputImg, samplerBlend); + + imageStore(imgOutput, texelCoord, oColor); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.spv b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.spv new file mode 100644 index 0000000000000000000000000000000000000000..fa0208f25069dbd07bff6133f52792e1e769f681 GIT binary patch literal 8328 zcmaKw37D2u6~|wgjWv~3P+3Jl22?~4QD9`onK2NAQ7FstV;Ep;n2&~Gva&>Tt0!%< z#Wvel)T}hJGRth=7u(DB=dmo?+)C~Dd*>c`c^{wd)A9b#|19_1bMJk>p_<{d8?u^V z*{EzpcJ{C=A4g=vGvVm0KA&6HZd$u&&%m}tOHMgahi7Fqxt~6V;Zw(4i|nfO^l5ky zIUHW@h#aeDd9E5_hl6h+xDJnETpcy&w`pBd)4J+l*Sx{4i&qS89};-|$1-s|i=3mg>FDM> zr@Lpz&aKt{oz?2LGkd%Hw)OPwIDgZ;p{kdx$ZI?X-+}H+x~kRwRCjGQ4X(O<`=0Ir zYaE-+|3CRQ9L%?8@kuL`Jnx)m$a~x&dF!&d=v~#`YJW%fCBQsx9y9Ig+x9e9o2`V~ zUFomv%6)3Hrjp)V($|&r4JCckA$on*iN3C-V`KZ8&du%39c`%8QlC{0@@*~ct!*7S zUz1(Ttj#mOnmOM?vR%i#GspGWjm-I8===Jo`&pmeRNyyzMtPrWv;FYJecoKsZ!PJ! zm-IVI`hi3A`s^O`sAo+&aQAQ@d~pvCp!Zh0*x!i!z3k}iZ0)UXt@O6{4OY7<13lHg zc6(mOdwGJnC*cO(jlJ>i9}ebTb@!%)TxS|wTr~T8EP7Ah;KuGf)!w~kCAOY5LD{i>vLdlC6!qH&jDI)>^Wix3*(r6UJ70;XTF-8qzCC_Zz7FLFUoO(MTIBMW1J* zJ6btUIlqDW+vi!i<)A(jJ?!>Qdr{X8oyXh9qFr6PUug5&FYQHLyF3w`b~# zx_uJI`u0XW^0lM5U%!vz@zQT5R_)>m{I{NYGPxR<#qu-IM=;ANlZY6Z^FzL)!0O@a z8v4o4B+Hm&Glw?CHO4W!hUeoNUp*b1ej0w|h1B z;-p(o_%8(yE9hIm?jh_KfITO5<1R$9Y~QNIN%yX2*+b2tS91ND=J3ydaN!!M#C>c> z%ptFc!?rK!av_oD3b_6E&Ry0>pFQ8Z3O=6MpV94ywYt~8FuO1L6-oEq$!Y&~6neDBd@y<^oj?n7}jyrT1Sby`xo@San%T~pnW}(X| z)7XjM0Qc&8bHJ`G=YG`vE<`SBBAy?=}|c#7GxaDJG$VeE4_;?T}N zA#%I3^|^;yusLpHZNF3X=yJZD$MI43f|<+QV%PNyW8D$S=Ipwom`5WH*Bz7W&av)T zusLGgXMyEnUG%%k7@#9y1@jz>3E-gSLad1?LCnV%T7h&`|CH_F$P&i9sZ3q zHetVc{+nA~?7y|G!++;{KNd1CLiCR}UGee9XKKJGDcI!A5Gz}6P;&e~2zLrLG1bnn?X@G8W5*0YN1tVZM%2UjZJuXU_O;(eS6wszzEW~@Qv6i38a zueOM50lUwLI}0qQI3mt{Xo(m?i_Hjck9vR6l?ZxT-WkUv_%ij1s8j_5nWCx#(8$y zBCZ2m?A>|ja!N7IGu9TpJ0EQCtiznzqj#GV7kzjRy0vXVq7Tmn>*sJk`p8Ah^T5Vj zfJDsm!Nx@I^pPvSkLhrZqmlRy9h0!{V;bU{n8EDtk!N`kav{=*1h+MD73K}xnF-8Y z$VG@YeYLG8gSKsmcEw?x4(I624zRUE%ucYJ;wZ+9NxwloiT9i58ru=$?3;VhUhILj z%tsu@BGH363EP9?5}wUGw}9t??alGbelPUB7-=ZzR~Ph!;Oh#y{x=qMzZW+ZbiY;m z3%dR{m-Jf;`Uvpt1>Jmil=K4y-F)|y^!t+T8O#E|0CAu3271BvBfcNIz;cSiGue{P zzYlD@KE97`B<^Pwti1+_d%PQL{~8g`OI^RH<%Qs4E&b@lTK1sJDXwSksAT|bygubx z_QGkm_NZkLY%NO>Yf;xPYIzYjY8eH-1o6(qdM^g+BOmKU&t3wjU2$lSbzcg0-Qrum z6x|qywdf-kF)srfvjvI%z8q|fdoo5Jxj4tmz{cE=-t^^QxqZk1xO(PSAo5or{+?gO z?C|$o+m*^sQ)%8eavLdlE0P>$qO*`+Bf@ z@V*!`lKC}=eE7T;tdG43pVxuq$8b-Lc|BsEJWu1bNBkSW#(NJU{sypo_`DIUk7pJ> zZvxB58gB;28pdnCGxhl`U~ACFKKPB0i~Mf|muq+%oP7AaJ^2)Ccn6$(e2?D=_6-{A ztljne25GbQ3C!<8TrckHyAvnx9(-5E`K{7socHBDh_#3Q-je>llK%drTi-bF2M}`@ zr?2+d-v`0gR6Oesq05KQhm%k7tUm%LA3gXe*f{sE&Dp)|XLkMQ!7X6>qU~m8b@8pt z=7`_+kAbHm#_Oj&a@>|UlTXIyaK z1sm%K?t5T4{bJwW2buMgAy`gvn8)E9e_Q=1>5G{?3-cT2S>3~| zJ>KGv!8aj|h-;2y{s|(lK8yLM$lRp+-T4_}o;aJICr;jBE%E#M3$XF}c%Oa8*K@bb z^Gig#>si}G$2k*Mw0VEL%+esHn2-=G(3`z^Yh;`%j6)b=~@{fIv0+I|nG q-Su3{djEh} _quality; + set + { + _quality = value; + + _recreatePipelines = true; + } + } + + public void Dispose() + { + DeletePipelines(); + _samplerLinear?.Dispose(); + _outputTexture?.Dispose(); + _edgeOutputTexture?.Dispose(); + _blendOutputTexture?.Dispose(); + _areaTexture?.Dispose(); + _searchTexture?.Dispose(); + } + + private void RecreateShaders(int width, int height) + { + _recreatePipelines = false; + + DeletePipelines(); + _pipeline = new PipelineHelperShader(_renderer, _device); + + _pipeline.Initialize(); + + var edgeShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaEdge.spv"); + var blendShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaBlend.spv"); + var neighbourShader = EmbeddedResources.Read("Ryujinx.Graphics.Rdna3Vulkan/Effects/Shaders/SmaaNeighbour.spv"); + + var edgeResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + var blendResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 3) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 4) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + var neighbourResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 2) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 1) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 3) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + _samplerLinear = _renderer.CreateSampler(SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _specConstants = new SmaaConstants + { + Width = width, + Height = height, + QualityLow = Quality == 0 ? 1 : 0, + QualityMedium = Quality == 1 ? 1 : 0, + QualityHigh = Quality == 2 ? 1 : 0, + QualityUltra = Quality == 3 ? 1 : 0, + }; + + var specInfo = new SpecDescription( + (0, SpecConstType.Int32), + (1, SpecConstType.Int32), + (2, SpecConstType.Int32), + (3, SpecConstType.Int32), + (4, SpecConstType.Float32), + (5, SpecConstType.Float32)); + + _edgeProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(edgeShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, edgeResourceLayout, new[] { specInfo }); + + _blendProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(blendShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, blendResourceLayout, new[] { specInfo }); + + _neighbourProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(neighbourShader, ShaderStage.Compute, TargetLanguage.Spirv), + }, neighbourResourceLayout, new[] { specInfo }); + } + + public void DeletePipelines() + { + _pipeline?.Dispose(); + _edgeProgram?.Dispose(); + _blendProgram?.Dispose(); + _neighbourProgram?.Dispose(); + } + + private void Initialize() + { + var areaInfo = new TextureCreateInfo(AreaWidth, + AreaHeight, + 1, + 1, + 1, + 1, + 1, + 1, + Format.R8G8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + var searchInfo = new TextureCreateInfo(SearchWidth, + SearchHeight, + 1, + 1, + 1, + 1, + 1, + 1, + Format.R8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + var areaTexture = EmbeddedResources.ReadFileToRentedMemory("Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaAreaTexture.bin"); + var searchTexture = EmbeddedResources.ReadFileToRentedMemory("Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaSearchTexture.bin"); + + _areaTexture = _renderer.CreateTexture(areaInfo) as TextureView; + _searchTexture = _renderer.CreateTexture(searchInfo) as TextureView; + + _areaTexture.SetData(areaTexture); + _searchTexture.SetData(searchTexture); + } + + public TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height) + { + if (_recreatePipelines || _outputTexture == null || _outputTexture.Info.Width != view.Width || _outputTexture.Info.Height != view.Height) + { + RecreateShaders(view.Width, view.Height); + _outputTexture?.Dispose(); + _edgeOutputTexture?.Dispose(); + _blendOutputTexture?.Dispose(); + + _outputTexture = _renderer.CreateTexture(view.Info) as TextureView; + _edgeOutputTexture = _renderer.CreateTexture(view.Info) as TextureView; + _blendOutputTexture = _renderer.CreateTexture(view.Info) as TextureView; + } + + _pipeline.SetCommandBuffer(cbs); + + Clear(_edgeOutputTexture); + Clear(_blendOutputTexture); + + _renderer.Pipeline.TextureBarrier(); + + var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize); + var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize); + + // Edge pass + _pipeline.SetProgram(_edgeProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + _pipeline.Specialize(_specConstants); + + ReadOnlySpan resolutionBuffer = stackalloc float[] { view.Width, view.Height }; + int rangeSize = resolutionBuffer.Length * sizeof(float); + using var buffer = _renderer.BufferManager.ReserveOrCreate(_renderer, cbs, rangeSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, resolutionBuffer); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, buffer.Range) }); + _pipeline.SetImage(ShaderStage.Compute, 0, _edgeOutputTexture.GetView(FormatTable.ConvertRgba8SrgbToUnorm(view.Info.Format))); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + // Blend pass + _pipeline.SetProgram(_blendProgram); + _pipeline.Specialize(_specConstants); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _edgeOutputTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _areaTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 4, _searchTexture, _samplerLinear); + _pipeline.SetImage(ShaderStage.Compute, 0, _blendOutputTexture.GetView(FormatTable.ConvertRgba8SrgbToUnorm(view.Info.Format))); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + // Neighbour pass + _pipeline.SetProgram(_neighbourProgram); + _pipeline.Specialize(_specConstants); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _blendOutputTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + _pipeline.SetImage(ShaderStage.Compute, 0, _outputTexture.GetView(FormatTable.ConvertRgba8SrgbToUnorm(view.Info.Format))); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + + return _outputTexture; + } + + private void Clear(TextureView texture) + { + Span colorMasks = stackalloc uint[1]; + + colorMasks[0] = 0xf; + + Span> scissors = stackalloc Rectangle[1]; + + scissors[0] = new Rectangle(0, 0, texture.Width, texture.Height); + + _pipeline.SetRenderTarget(texture, (uint)texture.Width, (uint)texture.Height); + _pipeline.SetRenderTargetColorMasks(colorMasks); + _pipeline.SetScissors(scissors); + _pipeline.ClearRenderTargetColor(0, 0, 1, new ColorF(0f, 0f, 0f, 1f)); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaAreaTexture.bin b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaAreaTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..f4a7a1b417766c12bbac4e4bdc56796f18538bd6 GIT binary patch literal 179200 zcmdSChkqN_mHs{GL?MVqfW3FHfnW!V2!g%$UL;B)B~rcj*s?5HvMgIJaxaPFIB^oE z$4Q**W;dH`Nwy@L&2IMd`(NJY+?hck0nA9^T7EGfTOP1u^T`PkOdDsFqO$-AyZ^Es}aGe~|W&IcazaIQlpzkNAFWxdq_ zZ=fzv-`U`A>}>Kk`J1u%XzTF#TlkGzGW9t-GduG36t$POm$z58RkeBAYTCTmYTN3t z)weZZlY8AbP3LzV1VNSLr(%BKF%Or`?}%~6Ig;6**IU$6+U*Wh1gZkn0Z*W&)7x3w zS?jOstoPUZ8~hF2hITS>z|oT$$n4DN$nPlHQ@p3N-3^iz?Un6S?bYqojOVWLJIO6_ z6`Ud_Ac-B1c{s=0F49>gq;@57+A)$kl-Zxtm)}#^UEE#TRn}Dwl9hp~&T69B z37Xsy^@4uPJZSHAb*F)0wm;8b&{5b?yr*PO>7KG6Nx9dRXg6bs|#_uY3i95|5;ufGlALm9KNt9w1zW*@~m(LG@VN<hCOjw|L;Ia4^H_&;Ce9_w(~{4R5+lv}EUP~ftD(Ku_G5eYg*V8??@g~Sm2 zjwv@_!OKj6Qy@4RMv$TjRG8YmtNd_H?f`l*gIpVII3JPN7@t~bFY$Yf!ykg*Y2{JX zhGswcK*PLoHZiC{c)}sFW5y8S>^LIGe)X!BS0KD#h#)+npu%0`cLmox!EGRhjB{OF z6IagVaLID-im!5yb+~MPR}ff_DG#X+X!ns1G|rpC2-=}RiY8!4ZBZOnkp(Y7fwOv2 z;JA4-X_#3t#S%mlXYa1^yTF~|Ho1M=3>3JBtK~|$Ec9jcA(h4ciTFLn;Sa&@tm3%x zuxeeiMhXPMInzwylwidph$UbM72=OMq1b{4+^1P#3Y;+z!7(CeB|pe}!v0<5cM*N) zqp;uwZj9?j6shKl5JzlWLTqm09^&^Hhd&g*6Yzj!!F$OE!iGW641(5i>zHlKK1!Z2 z8T?Kvj*th01uy9qpulP4q$!M`jS-a1FB?7MgXp20M>8*DyMpa%*mi}R<&MLJT|gX; zwqFDlRq#6new(o11F+x~ z){kK}j7U5Q4e|uR?=*Ta6hYQBt5Dzq`hgThreMPqi3NgkuPZcLff1@^j1o*E%I-&R z`3QO}ClEExfa5yoCF1Tk`u-T)#J7dSLpnj3!F!co~ zjtB%9zvGHcdl5r;1yU3dy&xkf_qt*`@ozffLObpXN)o?sX&D>;$I<+~*G9USkeQ+y{!6V9)bx{7->?^UnPd zC)oK*#Q0y7ve$Bf2kZM!41tUGCJgzu_?Iak15-jq}jI*LV-Dm4c z95PNM%;*+0%bLCFHT412x^hE#NV%!pQXEknRUDH?kGmxIy4zZ8$K6FWdME+(^v2QO zTR_YWAOG9Af7=PR{~CEXdX3x8FSXy(nK|GXGfx{A5|(v)wQHIK>UH&o>X7O%NFKq) z(>%V{rUAox>E=3qe+vd8N+h`n`=gv7{H-YCe-Xz^?XTNY9LVf-4cbPNCK9KOv-VF?TUwL$BzBs*OudZaux>;yKe^Y9-fE0cInnDY zg$kn&;Dz?bIDz- zIvgWl?EF&uo7yWoiaK+;(|cV5jNq7gB5}${mOQ6lNLbJDz@I43_pr#Q1L zts2=iE8X3d3qz>8qWL|e?i^Ja0V~i!<03Pj^c*Q#$BVabE{mpHjJ*EEqKsFSZ z(w{tNCxW9%|oUY=UzgeBXFp)biNm!I70 zN;E%#@rylxCpm&ed}1HYB}naWZmVf`cNBK!c4c;_^}2}QFi#M6Oe!Qxh7Qjt_N%9L z14e&Rv#r)ym0Ffr1P$gEWtOJ9Q!Amun&dhg7|Q1-@46EFIL7ZW2Jnbayu+pT``WyS zAjST?Kz3IKY&eV{c|sz{Ecq-Ri_dC@3|)zBmIgkZiagab(aYSTXTiQ_pKAV8Pv{j--~PMv5Y2!)b^k z0>SJe`TXQvSF!<5_yfeR1`(tTeHgOfZ1RCUf(=s~8R7|^=Z-24s1`Mox?z2)$K?E#pD-TD%MTi7v@5DLJjB>!4=_%E<|%sM!CHKlJxHQQNl(k<$L(>s%1u69 zmu5&kp_)-HDwh>|6>Exvigm>X9z1S>+7XcL$Qd%uYL_wOPgmYj98-i}nW)mksPmG4 z|C9bePx46Ow0=&zqS?nD2C+v$N<0Ws&?BVd3ZCX^L35(cV(X8aGL@&N7u#wQn~fa_ zJ-R{7h-N}Pt)5lQE0>hZ%2g%N+^^^;>v4`GPU;ufmG|Q@RQQ$Y+WeKDG#{z!cV+fF zM$MDPISkulz!?ud@JLOyp*&2F)Ue_45hxy4@HQ)h-)e%-R+5=nwg@{ncG~a5Z9{Vpm?qAZ_@SH~9xwxUSUS|D^W^@_JK;f{%}g zA^-S@Q4DfCqLj_=pr$La##xk=RhZ&-c&znF&89Y^-_WJ+(e>*FwIjq)GpX+I(v=78 zW9-4zT-bvvy7Ic}knjyYX+Bcd?=0%c>~{^>&<^(aXo0n4Rf~s5f+C;+fheX?sE;ZD?^1C?52l*{FT{* zWwL|ESlK4k^4;0rS=yD;mo|_*BD7=HFdu9QJ%Zbh2f_zb2UW899Z`)ZbeL-$?z93B zOfPX&BztVN7J(s@7%+50iyifyrQJEbX@l&_?9pG?l~;pT-knzd-<_q?-Id>)*_Sfp z071)ydCD|noI^_%NeyJpyb?9C`5jZv=z2`eR*$nZy&xwiKcgho?W}Ut*y=2eNzLXK zQ@g1H6gwK|%Fuq)D=+Kt0F_-?VtO8zogIIApcFZ&>4Q?|@2m(Ec4zmd54eULqd|hR zMyL=B;Sc$iXl3)mgC@;@p*^YIUYSywkq?5I#i?bk3TL&=%NY8=u+7xbNLTL4?qgRT zv5Mave`T5s!CzTw`XASBFvOc4D8)~5`k=)5{Z)bD?mXxqbir;i}){3HPo-Z&SCZ>9yBEcOwAT=az$!MCL>ssR+dr@6~dBjbxDo5cj;)N zD|hAfvMb}ei@5R}=63N{ruN5~p2uNsYP(7mdwQT0KgsEX66g0<2THs0yGaKrgUN`* z!FIro$*T)3;Yo?y#DAmm+p8Ye^%&cd8f+e?JGEFKm|hAyu5?!0Vac|7;)g2-@_U3U z4@X>ihSxtB^6if~J&!|oAM1Y=idfSFrT9rsACx%1za~)DRoI;i9i$Do29rbWfG316 zoF}gihO+sc#)GCQZNI^v*lekFRJqF1iZb(abMrEb(%}iA!s=wQCb;PY~s2$?1a<=l6Tz$4d|ib4Y>xDIn+w5u8k^#9xJ061HA$riH9fSq9n@NFq(1amE3=n3C z1mOu8zpBo%F5$}XzF}9!dI*7FWNYcSi0OG8n?4XR|8Lv=ohOKRXXpIBZO8BUK;(X8 z&ZtKAg5s_MQebZe)&PMzvuV|O_~v1ufcDkVg6bN*|0mc zD6=p~Fa!K>3@>v_t%48 zIXqw?d2*(})WH!*yY?vZ(kRbTs%7`HB z%IL+A_Jdc(y^wI_u%?Fn7&g5x>IA*8{b5H%`!VX_q3M4#KP1Zf;?Vh#wZ(%7QVKgl z3<>IhdBG8Ts2!2q;^+?#niTsmRbgB=sP8s*Fa=T+sdTwn9N`IOceI3G8F#{>0@;;G z{XD^FZKn5v)B9pf&CO~KXTsGfL|FhXykY2hIGJem?E*z4pC{59L0CT z^hXbx_A3|EliFd#U_U)*N~%K?sdnN)6N@8-+3e-&f>uZ-z`;1_%SublJC zT?e!!H-FO?n*R#Z`{GW}lbrq+|8QCWD{+3g{AA~E`l9n+Nlnm;Ct&Q8JY3=fa`RBh z&foL}=f9E~pOe6c@k5FA#S$lwt3TQKo4)A$R}$lM63qv^`0IbAj+e_%cK)U>I{%f# z_?%SxrPlXK9WR%k?EFn%bp9)e@j0pXORevfI$kb6+4-Bk==@g_<8xB&4;~(CeX+y| z5I>Q6?c3te(UM}Ez03s_yA!1GsxH9S!7U@mA~nW z&VLm(f#2hf|AhtscZ;9w{7t)?|0@2UPXdZDfcLoLf5;m}>x(5`^7Z!+`I~k(|5f}J z(D=yz(fA)fz!?AXa`{E(Z$dvhcK)VQirvnC6|V`zFXjT?#~uHZ&VVQP{_UaiH|>7@ ztJnl3#^+)m$nSZ~!;$}w6v(SE^zY96#aHnqpGF47EhL0N7F5bYDP#c@ag>*M>Iw7JH_;#=3?BgPr}IeVx6XJ^mhlH#VvS zBGdwzxmGz_JJitE)YIJM3-|&powNnZ6VTViwl}T~&GrrT^mX-gbp-;Qfll`F^REtn zaEtM0hu+TSG@)M}{-+i0@xB1L`un=`bpF25xr&ME(VF4f!MgtXzJ}h$-o~D$ZrYl= z*cR~RR~6QkHB>fvn!UbSUtLQBwnm|j3SUWuucdOXc*QeaH{LkXG}t`g>+|)t^tSY1 z>u&9C?P6P?l{QHtUgUFnj9L0`a4V1X1<2+13&l(N^VUt*zKq43+5D-(@shE!5%+NU zP~~9NKy|;T&(l}a>+SL873P-}RhD_my_L1q^`3^B25)0+V_j2yQ+;y-w#Md0UlX>A zX&VJAW%HF&o(b<*?MU5F{b0jD<3MA7Qy;e8<{oU_zHXl+zb38{<0g;%`Zjk5YoU~n ze7xM}AHU_kro64YXgX%yNZyySoW77bn=_p^nLklDRy0~NTsl-XeI?6c?41xJxVCRpr$co~oK^Z?%_E zv#PCk*ok+W8?QoL9?S3W}wt0z2THKR2nBFDM`c!?Oz4!zyVH6jyMChl1u z>Gf0YTigvK!Yg^CqdD2}_usgma$n`{t8Qp7>(3aEnGahJ+Sig-T+69MaVB#*dn%ha z;#H7eR9I3}3WDYCit?(8>WbEBSevLlx$WTawmGYLM{b&T#xYjIrly8RjhOD<#He4X!-p9$o&%E)LV)dRL^KG zC!95$Fl{9rvL3XpIaZy^uBFt4w7K+|jOi@!%g=`b3rmUxg6@h6o?$i5(BrM~dc9Zd zXHt%49LheBx36Fo1Q$x@%4T?mL=g-}JtN>K*{C*-EZB`G;*ggyJMxUb;K+iXLKJD` zG9q`$`c1C={)ziL?)y;SYs%+U*ELsk=k%wH#}W@GZSV|}SDec!i>dQzj9)=PVPSDm zDI@51S3rf8#IUlus=8WG;uZT@*YWhttc{!ld3!-{33dz>y2BWfJ$ob@LHruHDkv}= zUQqgmW7`Qo=YGO{#Jzw?z~lHDQn&4u{#EY%+kbNZfGd5>z0KWM-c~&gg6H+8NrmP^ zV0gf`W?yxHp$jQ>@?pV+1x1C$#k>aHZrCwY7-9&DS8V5;CsL0@48e~qm{(wkAXHdZQ69z+{E|nU6qt#ebBUZ*@|ds>{F3___g(H??m0FMy#px) zAFc?HTz>z>{S!Rk54dk|?{KdwUQpg7g4zoSXJE(2OJ780;VtAvfb|kWAVZ_Ls=z_ZRM8xWDCo46T2adlNQ%7X+Uog1WP0$D~4H zXc76rg5d+14TlkAcFY)FwOvR)<2sRcB>iwEq6oa;zWmkv)k65e;>8j{h0tNxpBnu~ zo?injxE$FOvth$lL}J%f_#H}9le;7+!EIZFPMjiNOE8D`TZUdgtYz<)c!#6 zqVkUNhMEXo)SXQ@tv_izX4*0xHV65U1ydA(4HuQ*g&rX^$VU?p0sj zhX?`-<`qabOmQTfAXK>H58nO;>5n1^S#SkzBl6Mo6nMczc?5~yPjPScby)Con2m54 ziOu^E&`Z%Bl52U6Zcm$FqpcvtZ|#_NCT_EMe-K+Q^14Uc zzn%BC-+e}%3!bwc-vC`<$2EF%Nl8$(7ak(&C*<<`AMQU8fq#u?^#ktP__p3f6nPmo z{5%L==Lu@h!xIMiQ6FZf0)qsvS})npA%ZXqW(s6cM2sW(tA)5T#)jr5e{7wr8tH3zg{_ViF{r(;bVdQaoBhHZzMpaN6U4uB$ zRa(Bz5kd55^em?~eWC`js$Td+v)uc)|KW(=KfnY26nEU8a3A8X@h$WOUxF9Bt$a2} zP~=AuB&a|>ibN2+#`8l2IhhK6=tq+cQxriQ;Uh76geyhC=33#`3K1d1zh7+QAhcJ4 z);VMB51t_U51%4#%;MI?oS-zrZ{{E~JYB=XID*m-;Hi#)U?v50qMVJtq7J&BQTVS8}VXF{Li8CcQenDx(5hd8RwFEUPT56k7>4@j$76SM1%crZg{|CDYB0 zFUc-uTTymVPEk&wutl^v{BJL3Dyf0`TG3K|{4d)6h!gDmDOXN+CvSl80tlAkYr+795_dRu_L)4cu%V=)uq3}Izc9Z*peT?enqjZc zxfc|t)T;?o#=*p%B)_%I=Ce0B8j|aswN9_ACdHFdomz!WikrCqn040LZ|_LE-F=8Sad-jIzv9Y$e#XwOEh4ixkkxW36*TiIL-fJNIup!S-J%9=?r7 z$cqRHIh<2%PSBZP$;2mfq_@P+j{imOk8*D6K27w|tx;!c2Z4Lg+zIiyKG}u_Q@xSo> z!Gogz!1H+rwsi_N+>M}`!=#98Em5d;6QyD{^Ae0!U#XlnQeg}QN7rAHPO;2Ki!2uNQS#_xXs1-_1Pit9KNO%hs1M%fi&sF_0XnNeg$9W;o1r2f5V zJ!w4*1uj`HGINf1G-@jr3@jsqj#;K#2% zMvT3QoRRyOg;s)1nlf2q(V6u8_`H-r@i*TFVB!Q@NJf8ockKEeIyJt*)9Yse%!7 zs;%14_`(}#_f^>N8dTbk%hn+p=W{I$jo#X7Pi0kkMOk@BgSg=*9kLhG;h(vGgpz5T z;w|nz;`!4ccoq|dwxGiOs#Wz8uS1QqgJ8VzKiV(x;`w|Rqr$Ipμw<5_SoZr#d3FkR_X{|a|ppP>)nXr zKe~z%eto!bJvg^EHrCd8m;%cy+!*2%tQdA2vSiruf8p-!R~V`N7L@!tyx=VmL`_i? zt3AS0xCU=X45he<`!8G1!UN(dvUM+Pcn*CT(jX(qROp8{Y=pANQSnxU&xME#IZ+Z&ZK8PSZA)*PXki4M;KaTNxtnokM^8-YPcLjYNh7FG(j?}}$ zWkWL*VSdEg{T7~*JPY4{Tqrf%i;K55HPzR9z121311n&|Zg;60k9_dr2?`AV0)Btb zp0#`o1>(LJ-_}h=5Op!hj(HU-rMQXviQh?B@FD8~roef`5vqebf*#=j*)gxehk2AE ziTeNI13bT4C=idgEAWVso>7Pdal0?P3je~!??1;k!H@nuz<2R{kl-0+$D7DSEb@{1 z_pm?iXNX2S@QC+z$G2S4(FWWi8i6|><8g6LJ37W3oxf5&>`H2(Zm zj9NT^$9(~JJ~x;KFQPwliWBuE_3src@gono$rQMcDR7>}kqPDpQEVOr<9+vEhJCHg zzD9~5wG=}{1@c}nq(KlQel&hhPy4=yUd-1KMc#xBzl1pQ9D3$t#TOBcrMQXviQif9 zI|d8JJT543FM5K!7o5U99Yqo%_+?-e?{5Gx)1cH|yg5O2(I|UC&7L3_5R#G6@FnPfc!FYfBmtkM4&xZ(tDWS}Q z5kt_AVK3ebx-oTy`ZD16C-h)`3lB)Wp-%+`z6Be88D5YGUS}E<^(FQ173)RxU{1pW z@)jIYAlWbx9A_H*GBAqwHxR$3`Uc*DDTeSqQ0&PB`_shl4|oFnYxcbPd+5b{13r+R zV!jH3FM!}pMo@~IxIe^?x8QZ_ekgDiHoSoQIX;dM!FYfBmto)Qu*ZLZW&Rud+n>1q z8k z-@#bTk6_ck07q)KNK!7pJX@D~75$i-=v}`9hTlT0`5`F&9Qx+Z_xJ3(xu$~ z>li1yf&07Hai8*aoR9xjegbv>Lg0AKa990?;;Y#AefW-_gYB=GMt{dNN(v<(@~>ZZ zd|Y3~jCH@^xb}kTY2|J7h9BU*;sdn#n`rZ=j3TxA=d$_bx*Cmx>Se_idehIKZ~ZF1 zllO7Hj}fsyh1z}uTA$&3*NnF{uP8~Op9p?~^!e+c-bG9QvIFeNnM__WAJU)HTm->e zu;W)?$?t;Uhpf%tMeE3-W!rB`U1E=RO0{2c90aeUUvnQSd`DpT36%XkM)8{IIqiMb z8;bX#cOv*BX5-{Dc!uOHNsC|hS9kts%7S&jaVz1p<`M|rgdM*G)x0S%{03S_%8|=2 zH?1bgpD?OkRBm84=Oqxl4Lg2K_)bWNWX+#)*Aj2(?x|l>z6~4x2CncucuC&ADSi<{ zKH^kiWy>4b&-+la{?FkrY5Nu1e8yw&5udWiM?N-k@2@2BzlmoacVQzR;aeqdwfzh| zx@)i=ii7;#@N3gw&{a5`K4V`sZy1j0&N70}D(@&>M4$Kp&PqD`8uN&9`Q@coTiXl+ zniG$P>H^Rg((eh9wh2@Y6pJ3w{6v^8S|hkNn@rM;T&BFAY|h9zi^M6%@(ZDZ2d- ziu(!L!<8#-d3dIH5#Q08r_6uMWdk`ADf9NdN$bWV`coiynGw9DxXV=d8Z7x8{M~Z- zhEc66xHGqR^L_x2DGK$*Ts%?Sz;*86J9-CuKW3E4(@MSG!e2IyNl8ty&hOk|ys@;I}HDxXI*xzF1C zmTZ2x85NEObBCc{H=$lot|<4hbqz%$tApk<0lq+2{{#OyzFLJpqzEMgWA?Y+;uE&rW}$H)5jBb@0t z=qY2JzY&j-^qC;IfU{r2H$=TWzDMy1Jb7n%U;b#uv}?h(H|d~h({L=|Oc=pCiWd|w z!MnUFn_pf=d2)SHo3U3ns+m=q7#MMaD8Kvw z%IDYrDllCw%K2$^Y^nycC)@Gdd;=|c8Vp~?H^hH`ot3?X!&#H5bB-13n)#4%3tsSS zkl?fA31P?gW%J9;bUSM;t)?#hpmq`p+zT5%f;OFJ8hjS&xC=XenfMuRCEV4%q<)1d z@IBaYC_0P&?Xzty&cIXuUaT-UD8%;XI5e3QlA<#?G&z`oHV73ShS$A{Co^~PcT2s( z*Z(RoD`?00X?9#bXH3HE$#y*TU4vRLp=V3|GK#SNs@|fZobj|-=OV8_<^|7cFKDj7 zimxl5#oXMxihBy#{x7r4>9zVyov`3BC~#T1UvUUt@U%ei1_<7PIG}^ScL*NOk1$w)4Fp( zf;W`6;0fjO%grit)!3TMdknq05zQ28U9Q50H}S2V!BaYlB+nq0-Qlh!T{qs;-_gCm zEcgxOTZ(rPhd*R-g#5~<=uJ>8+4*up3C(!=KZSV|hdIn&^C7qtb1CXXbAk&5X-e-X zRJe|7O4o1Qx&C+C`Dt}*6&|D;6QIsY5FCalJjl-d6=>Y==_wt^8_k?bolRbb0uLk} zh7BK2I1@qercyq?(iD%aF{zy_ST_y@E-BYw!+fpZ5JB)W-PGUKy}&G3P#|oWyx=zx zJw<{$`B9S1LKsyiW(mv@KXLx4O=q&`QdBvZ;n)s4p218E@RfK)TmQT5{62gog_tX+ zOoGhm+(YOuQT5`}uw=idyR5%pBzuA^*iH&uH*La(PwIGrSD6*xcqo3=_J*WZQven` z3I#5pA44{L1on0o7DpPqPWm%F3x0Q*1;3_zL-{uPF%*Zt9zigA^KF>CTMg6gE(u0I_{@S*r6*I9ju9mZ~0@EB9z3T$|TdBIcg>zANGfgd8sJq^X+2ciO*4S&S$ zlfMlsj{Qo)Nca!ERqIl-YxQ)c6hMtDN0SM2q_!ue@_a3jQCh}q;O z1!kOP;H#>|W&5D-1#T(ri1TFf8OJ&MdHV(XMcYN&CAM9*UABg{E7o5sKh!?Zzht;; zylr|e@mAuqiO-sEV0#AJ^(20KI_c>oFP=gk!*jKBn4fVG+a+w`d9GMP+cm4aEnfSn zk`o$2D;VlvvPA&uP?`0{q=aNf&<){8z2dF^70%D+S15(PZJbsM8D=9O*D-=)+-Si{ z_QCYc)T1fKU8kIyi^NRMq?q0%O{qy=e zhTDecjJL4eG~F~k%eH)bH=drY;hFdmJlj2qC%k9yocA2#c^=QqX(O)k=v|X|{)QE% z;q!NMp%IHF(P*&foa%IFPLGbC9WW2;Jy1`L7IEc5-SxP1FV|l- zQ?Qb=KWii7aQYFVm~zr}5+qLxB#9>R{11$=WA>!ttIBs&Z>V1*irN=-_jGp??k0ew z{j>JsOl()xXtK|*2HnM``C zmM18!KZ*6fA$=FZbgi6D1shI99I4<5R!)@67p&y1WgpBs1cqB!4S6*6c*+T^OA9CY zXN+g^tIt1FypLGx2x~Gd5KyY8qf$Vjxg5((z#bZ20M)JS#)a-9@NBVvA z%|23mRq-BU_`2#f^(!Jp?LF;1U0!-svdL((G6pZpyZ_;bV&n!WKc_jRc7eV(E64HX!w zUWFF(GTp9PhtJlDxqHKy!!nK87)!8Zo*`NDA=vZguGud#%l% zAS!|*8qY$7i%{Vz<_M83AKX2D6614H?U!2LD|LKlL$7zJdW;lUI#;|HPB3K2P$ED7 z>L2k`>{qbipI~g{Q`{YV3!d=nOoe30#4tCjB&`w_ObYD8N`wJ{;0&S(sgNw0819<> zB*y2Y+Ap=fSL*oA#@^aN&q(D&`7~^Jo(SecgS;mUv*iE5T$F#s{EB~o0_mPRMDXLF z3f}|6yzJt%3YRyz(cWV3KpgHxUuFdJ&?XT}<`7L55l@KWuJPl?Ph*bHNwr^UeXrE< zosB(p17yJ!6YiNZ*l_V;;WmQIlK%>R|BgP)@9<=n?s+~#91%6h2=Xda`nmvSThcyC}n*S_KPkaQL7=vf&=Yks-l~r{4~z^ zoK*Xz*7r&s?{Dg=?;{UbJ;p1LdNNFdB0<(CWc>aJ4@eeF3gm70C%7jL5hOd#1HZD= zN>>dmm{(vYY?$H*PjCX!nCy5L^%r-1POAO<_(AOTy;8^fo4Xo%$$~ww;7am=WWx~z zSuFWq%m)1v`p{&-{~#!kZ1~692MqkmvXB;71nx_ksd{CJ_9dV8!3!a&rnZOTn+2 zS#UFaV7tvPc)@UjyT*?nKdr$~YSi&LsrE~)@0B_p{1Abe2P6xI0@07*domFOi62Ga zKcEluJ3)cuJ3d2ChT;fM@Y@2v@)USLX2DP(`Y~j~A+$3?($TE+cD{GD>^ ze-v-jjMeuyq9_8@_28>|w1mp6P#Q9}b(o#S`JS@6%0^Fy_HNq&!6z~eE?hsftwQdm(^?XIb;t*Z0XhgJ0u z>w5V3x*iC~ca$e}2TY6BndFI-(e&ZWp`3xd{`|he-lE=;p3VG;26#D0eV# zprF67uc$ZqWctCJm4f-=>C%bvv5L{E;p!pJV9h{HzqhZpudWvxV=2k+F$;Lm@XceC z50S4ws!CB>>@GoN6sk#4Db)24>rsFs7ID@$AUuBusb!y0o!1`MA2RMYuUeMubICKV z>D0-z@r<#|(d?1zk=$X&nbgg!{dp?|3&pdg(`A$8;}v6-BcM3!A&&iCY_$k$lKf}@ z^f3x}Q0MzG%7@73Csw5>1HlS+h#~5Fu!&8XNqa5J zwgtzWa~2e*(k9Z!Ge+&_P`+X#dkq8^i{?sZ7{iH*@ybz99PtoE#!-@A_yj)55AXay zWct9J`=#EIo4*sO>Tl^=TW}mFcO~bV+2WsGiB56iE?5%RykTVQjHF&XISd* zM+fS$3V4L3Z#`1^5c&F}x-wK31zBKOE_Sgd1<#NuQk5PrZjyhEo4F4#N%uAgUecV^ zozNdK95Nj=6T?;8l6{esXuse@;#ex6lCu|SP8W+7ilIW>`b-f+bmWO5lqkt>=L9~< zL&85m{36o_hVPaBS#JJfr1BK2Qcym7KAXK*lOn{h1~)0c!Jxu7Ftzn1#VtngobFV@ zQNyM&#BjyFi1GoaP`=`DCKX@U3xZ2Ui^cOyg&~F$m1B}Eh&F*w@}STMqNNWE?UnxN zVfh8CQt$+M6^1JGRO2S)ceo{?2_+BU1#j{MwWkw~>9>rVrj5h{(BVGl@M7{Al&?VX zfGi{|4ibbNGZh9IO13~8K#rWiCwZ9o0si*+k;{j`e(5HH|9|M;(2a%o6B6_C8J$B_ zDL}B4a!gQt9!iS4s>_=5+B3|Kw~U8P>tMLgb}9L6 z%1Kr{U;{S1mPZd=mI`UuXCBdnEP0x2S!`#_fB6CAM=szIKKy{)7ThUUe=La4pi!>@?Yg%RNO{#ZIR&VgyV=NP@(OT5>}3R(nH3AsB&?~?e~dqYFZwZI@ACX&r4J1IV2mFh zQh(bN$f{C=2ttJz=Hd&5P|Cayp+F>gQlQKQBX~u_6Fj9qZo7>1?PtQuSFi#q6i2X$ z1ktC7@n?tL4h_IRQUMR*_ZaB|A5wqJ1MaL!5hCaYKPnXRD~u!XiP}(P05<$QmeY|2 z89{i$(^ld~_tfwm1ujbv;;y6K4VN zL+LLVgM&6KRs;wV1V1ViLKC&Vk0~;wz}I2J_Yg;(RbCeeB9?&P1;#H_zJf2lz(?ZM z{1x7c(dhUKfTb4jMx_>d)1^}ugJ8@B#HgMUcLDH2^CJu96&P8S0tBf-6cXY55&XnN zZQp=CAb3@CQG1@}hX}$7&`=RBA&Q8R7zA-w!Y|;6w*Xjj0dI&^w4cg3 zn-VjdV=f>rqUtGe768k>|78IL_ABrUQy@|-vKJ%B`28~?2tU>GTUZeD9-_!A=*gg5 z68bWvK@bGLilVcvbr1;BFoiFHwkUkn8j#4^TDEEGZ({1HUC9N!5og%(3qSqMtkl{-FvE{{npZ=D@nRFTl2<})wEKDz?C>C`AaM@n#dEIT*>sU4=rUIgLN<@-R zfF&fXvW=a@EDCMNT`t!Eta#(s1xd3?k_}}*ZJ5CV!MsWEsXE3UrAh>M- zad3KJj0M1zbFNL(MeR-1eZ^ZyjPWsgz2C?96D1$uliBd;s(zQ;m9VOmPHXi$x%><- zsozydr@#7RWNX=PkqdNpj{imOk8*S?vPBhy}#K<-Z(C_e5C$?3u{e zXFYDXq`jefLHQcSOemcVr3m=}C{h9fApw@F)i|YIS4gF``n6ns#=Dv~lp)Q>N`Lil z;cX569KJv|+W24i{@_8;e+Z2~+hD^wi;8M+g)JZsn_fsZykh~dcQj`)dBc2Ce_8X4 z>UrgT1)tc0l8EuiFodKrDUFGP+GW+2B7Rz{&*kzn-PXRMdPgCZ{_6M9Ta6xtxBxi% z_+My$j1!1&i{sB&KFpbw27}hBO2q=f-~wXWFHA3_sz*B(0M`!Z&!+6NY#GlaT-IDy z-B#XHh{>BMiP*QHL|RDy`X z3v^?R|B3r|92ofqzdTh~KFlR*^d_yH5fl~>6F;i6z?T1Vs5!E20dU=5(Ny}fecgP_ za8`Fk^R()g@&%|+Ox{E(F<1hubg#v47}YGQHWcyFT76G8zgr3SG}7s>ejC}^h~M}H zy0ORqgay12$L#oJ+wx)9aDrLmP*Qy=wtzTd`7ejEQQH;(*Y}r=PjZF(V9-HKQM+|XAxmNT2O zVn3L)Wjv)nue%BjB83?2_`X6+-UNmjRd!!ukA6(Es6L=Ps)&`=>Kk(TnOXWPHPT-x zKTzIA{Pk;4yJ6jy_P9peG1dg+SH{$aMUxfGMyGm@0eb= zZ2@p&uX`kKI&Co-OP&*t8crvi*Iv~;rM>}A_ySZ&DSqC<2*7(t%hY5J7>1$1cxkP^ zDx2R8!yVm=npdH~c@Hzai!k=9DHpk7lRLVCn#X|3LU z2!6rzSBN5fk~}`*QzAey{greAy6rr_;NP2iDu)UuvSw449c$JN^Oo_X{)|o}h{QQc zB*20SA7qw0>n&}jUj3++rL|ISDWavddQ&#PXN=DwebasQYfOP3zzar8fAxzm7{^0j zuBp3fuxKm?7VKPw0uPyv!V8|&T@*BU3!aed_&%kzvLdaOVNf>-1+uhOh$Gw6TFFi{ zCZ@mAu=H1mBGJ=d{p_I|^aUT#)LlJLGMYP;G3SB;EgR-d({b4FS=}W@@Ve?I%KO5O zk(SA8^O=KbtyKGA!#mSjy)0LMNZ)iz|2#ZkF#Q#?;qB?KwkHt#g6T+KuDRROUpkyO znK_fXl)P#?kaQ?<%XmV6I^i4$UPC0gj%a*a`8+M_wKXN}fd!A~rZn@2A_viv*`C(w z1^N7tz6t5Cm<1#K6`}~)Fr`rpC2#r>;_;XK)!gOjD;p{p&z??OaIVzCW3PS~y%;1`QzNYvY&blv)pgnYSo$lLzUjVNNPmSW!rO2t{nhuLFdoUv`vTsc z^1h-{^bIi!o+-AUUHr*FC{N6 zFCAM3+cNVq^Rj}Q#cR)ZvHcmr_p|fFEhkUho-iKC%hwgU^NWKOfTxOHxZLE0%6+liizK<*abIQ_50H z(@N5c)3Id~(Uw`rZ;}mo!XIcXt1K#`>gQBJKBD%ysJw^dheGqP=2hjCdQjVw;5W3H ze2GowhNOB+t<`I*v3cy(j;iD;%Q4HWrQh0tGV~3O+Tk7@>V zJ^Fy5!`NnOHTldpNA;2+ULYlikr9}v)6<|^W2tVzj9GEqaM=?>-u%Q`YuDK zVUMXD99t57mUEUvmSxMN1qI}-owjyc3m7&!>K%2USmX4#h$BcoVLXzT_jzk8tKAhu zaC-&$uYp=$sQD|_osOt}ULc>}wnFoWCI^(O z$_4eTdQvl{8PV|+yDb+j$1UrKB6Ex&3dlo+9X7t`G%1m%So?(WNM63BuEFE2tfuPS z0>P*iLGU|iISd6ZS!OKb&>)pt4OsoQ zJpw~gA}A(d5ZH?Zr!AP}*d}c4n2aKBMxRSZxgeX_Cw6{D3cxJRJBY0pxl*H@U)PhA zL>XfnV_c!7p`ix#&nr;>oUeCT%qrFf>rNx4U@MT%FG`_#%&$A7K%sf%vT7bWoYG8a z#w?dCrz~5RgO*j;@H7Yt1>~(LARo?E_#5$n`#4$vvjpL3fi*3q6Ota)a;RDC; z&q(oW#lzkvupPsRZt3jrcwA*ixfU6uL)-_!I72G&o3kh z)U%wObcGr`53U!k(2V-=O|^AYyOt`~Qq4=A;I``Lg@sbw#Qj*m5TnpMheGpE;Tq4- za@leg9`LZ`fMqXicn*CT(x6a49xC)hhhQiIuwn{XA(FU?f4|5GGoN0@?8n>$vsp<~ z2a4HeJd7i?QtCAb?UitPFpP2jumpjWIW`-AMJb*KSA!#cSFC1< z=|s_uYf$}eRR5*Qp-h4Bf@N+jZ^NrNSV0~=Ln&_JezyK2PN8|&F;gMqcM=wSNKoKB z;>aW;7%U)9DhzLb_}^ZRR(psKtrfe6RO)^%N8>OU6V*0YS{AIi2rVoRKRoKse)zpg zXb+i+k(oHeCV2iG2?CW!Vr*yE$OA>8{y8j|DX;=@I7CqNgixW#N9tej`&Z0;i&JPG zcD$+B0KfByAjd45Oo6LV;M@*^-PW$iM(dG4I0dcEV|3h5B4#*|g;BFO#Vx^7x&LptaTksmA=3an(+yCVtmp0HSoo48+C{}H#) zJlV133izFd2i!soAsm z21BAI30fkCQZJg{QU^=S#^?3;aFp|t1;2(kyunSN+f)fB>%cr1g%%90T(EvT#Srp= zV(n?silM<$EY=eFNd3#!f5<2_4}KRAfluHrP=9g$^z0-FX9)u{&(B?#ra)P69nprUt=SAK#C!} z0;3Qtko$@q;x0l$Bc7X zh2|N*v$zL21`9sK6o@Pdf)|{^Jsrhj5Daf?=w~f_y#;2JLj0(GUqd;ft0whE8{|l@p{U&Cy+(F@pO_W?1f)#t=Ie}&H<3Sa*lJU>4_DQ1E|q5e75kLN9zSDk@h({B{K_f*bdu@+s|C^w$j6jMq%p64_?HCT>aB#LaR|-WHo5O{Yb~~7xT7X zMLDN)C>u3{!GLCT{j!zmm{BkBBK^_)t7z+gh4cRs5$gL;_&e|sH!x#m3#EG|Q9h-e zCZ^V`6mDc6$vBaEGUc@Etn-}nT=Mzk3&|H87ae@zk4q^2fz5@o9*B~MU~R|YwN4>+ zo&`3pTuI9G(Io+*rDzUrhWgJJ@s0ozrX{1gG=I}|pcz&_g|FZK_g!&<7MNmeEJKR@L zHfjTv!IEe)p16BMp%w03#T&|3RQJ{QGz+%vm2gXc6Wg-}Y`en` zaQFm>KY)jSSJ1~{*zgGANIh=#vL*0AL=x+NB@zTSj#ti=tQ4%}t!E#~+RQkTel+cP z8mW=CQ?4`6BqN#bP42LvtOxx23K$+lpM$4(OyJ0qJdNnf3a!AN-T}kc7{wPg#1SNQ zM3PuOw_E(A#^1NW&5 zu;jf0LsDXpBFn!KEVKd@zNetV0Ed8lwjU`W<{h^NT%M?@4_ z;XXiVMxNn=5XD{NCow)J)qbh%^CcmI6iKUtstKtQmz?RIltBKZ)@qkx=_3X4X);sCp?gIFgt=_R%u$btKLCH6@;=L=;6*ng^N((D()S~AHpLB@87l+ zT7e4RQ@o>Kg;rqA4|a{8#Q2<4`=!?RN*xb=gWl0t z!$MgP5S$TINR~_t_p|lqQ46gIDijK>>>59b@j1!%WAqB;ePgcgl{~(uaiDg@Gf_EJ z&TNw9JJ>uH1sgasppR8Ara^F5hxg7Ac)vL3KtR#X8|gfDI+ zl=a{Vaxx38>>|Io<8xB&ms;N|b$n}}nfQ?hBn##hDB3XYiUmKoO8ih5N+|0gD6kVY z99Gr?BgT^gKgmKXyUH)l_?%SxrPlXK9p4)8Q3NIr$XhV4K)xqK8eE~0g!#k|Wj#SvSp0&VC=XmF^f6(9W@ALQid;H!0F17{yv~|)pc&&J-s@of=M?H!r zKh>e2Dipqsmf*(M33=gK&rqPdGtfy#`TZTSHpvE5agxs{@(3u5zzR$RnmU_0oBg!;#7*DnY|m~hY%6UmZ>wsnZmVhYw$--Pwbf&5 zz}CpNrnWiHNaH|rudk=2yQQnOt2NNtiA~Iw9Qrr@Uyr?mxP4Gcek$e%u3@MjPdOg> zC70i8n$t;ZuKCQ#+_8e;;=$5>cV9(cWp7olr^nM>1HpL%wVkz{b^1mH^pG(N@`Ah-UPH&EZ-(BIhC z)YH_>ZzvHIVkwDb4g#vjRG~+Fy~aI*P*e8EUw~ZweaPKbo!1>P9k8t07o0PxQ|S{~ z2JRo1@j$lIF|sWkH2`&H2qd zLq@TMr?^x!S2kTSQ8fmFBi^C90b*FkD>0m+WV4nqd$%7ZSbzyJkMMd64?QpAac&Fc z>3oEjTz=og6u4)V7f}*r69h3;1qD%Nozp2`m_Cs?o`te9tUQmt+tiWNZfi|$VFa7A zmF27Ikc>j*#n(CbPM8aX?K@>z;wW2VOV}=-}f?>u)<`^g%x=ns_ zyA^ARoGmG&K}Il#ESVTK6*U(#pUC*Rrz(O3Yes5=D$$RgRGWw@^it_KP%#e_D= z`9j;z_cjs)-Npj?WrU&fM{ojUy`j1b{BNP&L+JN!WBSKE#SP^p^%?E4ghNEo%*yiE zXOnq`Oo;cFbEdeB$=@?)ZHf zvkj)W0o-ZSz=mBk6#p=^ImB-U<`;kh%Q>1`OVe;=zutjey~JJTPC|kG2;_MW=|<)9<0m-X zQCx=t&+-HfAegw1ibJ7H*(?}N!7nALE13nUzX{XO*rk$oUNxvlRG-;dmZ~1=jxws%eg1C?<#}Xsl=aEQW z+yk87ec$t*d(OL<_F@Kp2N0dD_&$%Y6yHMFLJ$$k@~K$(eS}=Ri1X?x27>i?lQ-Ri zA(=PGm6mBm|F%H9-o6pLa|j0U^jdrbLZ`N{fY{(&Or= zlm}22WL^S0b{cjhNeZ*zCl+`bzuk?55)Gk%jm%{5OwNv=@cS6Kc!eOQ^GM)!8tEl2 zCYYrm%AVhM3Dx_6JN2sN4ib3X@s#tj>tfDXgJ2#6kwW-Q!cT1QNL&J=rO5<`841y2 z;ioK!jDdy(b|mbGA%*7gzcUQ{2N9f|Xz&CQ*pC86P{AUU&}m8^r$Mp6SVlDXG7>10 z@ewAIb=(4v`Q=&L-yf1*?i)zDXz#w;Wu&>L(o0X0nw(aR62Yp|m&R5_kh0zKR_=pCBkBVS=B`VbmDN zJSJ8W9|h+L>im=ifuNd4$L&ZeLAvjP;3PAYJ@i2+9*XxvXEjAA>jrghZ`$)F{Pt=k zb~Ju1Q3<=$T{@8Z2{TNIFv&5*a!2AD$afHu9NF{xV_XNs*1x5_W_d|TAUez_h$YTB z6W8JsUihtJ1~}#$^D7OGOJKA#m0(n14fq8BFGhF zZUX_4eLTTUq&;uK?+%RxEK15pF&Bpef;@G>8O6L20zWr*p`ChXKsq*dOrpW{Hb z&u`*j{B6dH_X!htow9hh83S*i!;FGri4z13elh~0!6Jb&lNKG0kAk`kQrd&@FTqcr zN9z)Z9hslY#Jg`{g{qYq@*4WM8H&=&J*pSS_`i?AzIuuCCj8#Tp?+NE%s$+Rx{>8E z?#I;pC&=o;79nWGD7SDIN703i{An8;`3~9h`wYkWud|Bqj?!R}z?-(~N{6u{SYllU z6+dP$ir;FnAxNOiV?>9O2eCkPs&U{d#sunV0qT#ckjAGcyGcQWqngK9Mk zcVZ%rPl3ZqmN#&p_=JymI+ozrYFVY0oZPM4ttk2~6nRHe ztYmG$xPQVonYYf%k-J_$;E-}$s2FZ;?5-LJK>-h#)dgz`*5t4CkNL;_`Bi zBlW7Ke~b0h4F+RHv?Hw~wJ_;bNA4uYy4-asea>fQ(o7Erk%BQs^^MAQ?1iVk{+I52 zN&}03`3m}U6>;tWjy;EfrW&No_=HJqgFM~cz;r}G$74+nnqG&6my{Or9VWX5Yled( zMWX?|ABvM4D>aDmu%%gT3s;4HT~`8Y~&0}FrsDkIYK+6it(GX$M3^jQA{VHnfaNcpfA39OSv?rK@yvGKNY z^&NVcYRCEp>xZhMp;*a?E>%c_$b)dyk2uYbv~$j#dEU{^&;O=CFRP}5Y^Qqfl4 z5$a@5v!|rDq>r89{!~g!|F%66ZR}`iXs_+8>?!LF_H!o&*%jqJ6mTE76GC&;pU8iZ z=(iUMs9`kkK#zps!u7v}=MxvE|KSf9k?v}jFb5;dn)5Z6gbo&is(~0V2+7lltZ#{# zl3MfX*;w9D#oI|~F)-9JR5w&PTox@IDIOu?#b`j+DWSMlILg0yV_(;9SdY54=T78D zi~53{<*ijsHTBe_u5GMts%)ufEpH1!GSsy+vD7oW!(;XRjqS~iZFL>hofX|>y`jER zp(ygKB)UM6Ow(%#v#&$~T6oszSF`8T^}p2fX&OlW)7NNm zS4DSuPpCI&P>gd-?rY*_x9VRlqkS9v7O(#$pHEzr{s&pVdPlp>>cK&DnBZZ+xa~GK zf>w%*MdbW@^!@65XeS@egN+SkCSAhNaOY6dP#A(0(X!zn7P5G>NHVXC1V&``p(w)f zJFHPWWj*fL>)Mh#=^G_=Uw=tYu%oQ4qPeoMx}h#oR~xRauW4ACSn8R5;Z5Pyk%5M; z##SnVN80MzYlLA}c~4nSP$)7Qh%~0}HAA^1LZE+&-{SSZ`SVLOVEp-OYEJtK8hR3A zu5SYfL2QZeDUN2s;DrLw82p++)Urczq^xBcNQ;c+Ceub~ry zO)aExL<*}rDurQpSx*RxFkJfiR}vMJ@SH2_SLxROjPnMlCH@yaA27l_$Lhgh%y|@% zu0#esc880Rut3%qK19P`V4wH|Q_Xdl6+K%5DG;c}FdXd}ZX0SG4iD7|!I&Z_QaEBT zEF39-;b+{1JKAN-N&7*^F4txj3D){Y3Wfp$q?_v!hGngl&6N#ROA|{yb0EAeye={t zSykWD(9zgRnw13BBnSp|DTEdRhBscj--0XwEidOO

mcNCv}CI)%wgy9@(bQ za#?1S@cle38tWVGM1vdHAFE|$sZwMx8XAV6NTJA~Jb{h=XXN~Ko@Fjsp0FKZ6pRzh zTkVe`h5e$*ieYy2JnIjJcQOKwM@Axp=x|qKyH2nLf+B@5tWQY#QCYNN-Z&!FKJQ-fY7&uVhD+DF$ zQ47gfTf<0UjjqY&`6nO9`k%pPj@z*d9hRz&Qc=pU)(1afgnSbTy~#ZB6n1($?})-! zQb@!`28O$%ZNtq-U?f^QRAUf~g~XN^QW*O_tEcZODZglW!a8f;@7V3y7AMF^h!hS6 z`UCyN+4I{S-WpyX9*e|~zynF#3TeOmbS;M>zG5~%AiE9@1b#fl($ ztfWx(FW*q*-Y`#$W5BA7ejUF#e5N0k(2%kNNl4;Ex&cp|bI30#E^%P80; zGRSF{<&K+)h>WC#gFhm#315~9a#%kAIieZAs0+=&LaFpEI~bsbJ2 zD0*z{U%te%&SjB6>x}(?V_FEhH$jm33?t!)tlwvK912fkfv1!PkLnVL4yO{7k?`~G z|8m=pjSgagyU^ekoH!53qQLIm|8Q>K84it;g&~7zOt@cjyFVM#!8nJ9?h= zhr)ZYz+0HZOvEK{puUfp45Q$@7Nf_@?ce8p{fOe%9qovT28#q@MU)OJORSHBjD&x} z{-wCLzsgMfI17m55LDLq z^X~t0+lSvsKNh4z*I3c4ex8b9+kT!CZ?NY0I<3CX)3+)B$^6@BS=yfU z-LSk0L%rOve4>2TXX+jrcTu=L?60lqf#lce@i$eU|B#)_ ztob!J`qwyjT93j|FEVO)kA&vewb`PJuA8=(EN^h;L;gyMP4#Ji2ub74 z$$fj+UmHuO^NzXZ>{qNeEVnE#B8k%5cNE7D>90=m)2#WqyQ_jddE>6__5;@AmN|TK zu4Aunb9aoE-^GXFLv1!V=e_ED&i119HOreY{2=MGzKI5Wn|mf&@UXwuSMCTL^d8SS z>v+=kwDqR-mgNo>_Dig|3PtJb`}FOj?D-9Zs#^ksxf8Cfj=k0y%SjksV#a*~tNj8x zeOGaOQ=2V6?SIn!wBtG39qV0-A))#mGxS}g|6zYk)^07C&Oe-cGUtr*qW!Avx?*_S zBJ!eBd|OFlw)*QHE{#NjUHQ@6N!J#~9{WM-QBsc2u|IQ_SV!vj)VSN@kyloL_c@2Gj zgIQnJ{GvroH5DyIeg07hPC0hj_t}o1$&@ODp_JMaeZB?1Q$=&WOB4-%27_X*S;tb!1;YX~bZ^J*}ZDnJ=idFjqEREP7<7g||%gV5BgXFxNO+IgS1K!Y%)BbDd z*oR2`;`X}>W*EEQQ#RFT-}r3#f4q4^&6d#aqJ8;?@{Z-6a-VUXb3W;~iWFYg8CqYk z+(DbOmcP{no#o+hd24Y`!BE~B&t%SKitSC?_alWzVR#x{zQBk$TYaMROu_lQOP;4F zc@06k5PStG{F3F3B!=QTwvE~POX$FR%F5`vEj<-lmgW-sUS#?%jD@&ZyssvaFynU| zSE^a==Zk1ry7q118rGzrWh@gLXN)Wc=dWqwt?TQi%6F7Z2lo4qet*on*0T zk-%#dY<^Y|d{OjRN#UC?6#FGFJ9~x)(f((l_$4(L`m*YaXgEGZjRDIzK2%qcqq|Ue z2b(AQug{ju_&pE32)-G!1pTLNf1Orc4dc^Tz!%l0%M8zG{S$4xeO+X8)wa-{qJ8M_ zjQ51+l>4;ntn-5NvMz-%yooMn&o95Xw5h75zO1dJCm;n!#}vVB6rI^?KVTOrJZ?E< znXNe(2k;`2Pk)JP3|ses zHr6rOu(^6m+0K&bLK6D;5M(42Jw^(% z<~QzNRn!q`tgekzAc4ib=y24##vq6_-h&iUz8ZcfOBn(4FX$484kr;*wnX$ewbMbo zE5wz150Y>SN|!afho?qxf=f-B_9d*VMay?X zu$LdVbB9jAP+Gq!{8~0d(BSeN!9B%$1Ef)76g-}LQpQ0=@B(^##gQ#P&$w@}urt_H zSyxxt7-}u)3iKBYi5Qysk1YSFJJJN#qqz3+TYL38pUa-RI^Dnx{i43`x!S+X=AYjY>#GPun#TX(GT zWaw1!nZh}2h>}3hHPK;3@D`Tj1=}60@q+HEU&W2P4NYB(U?2!G0!qgF7F;fJa~$FX zEoaA&lU?a9Px+hSADJBgyQux047+{YHr(_J`0RG(Q{%|^Zax}*?UPL#!)S2D_Rt=Y zz`%Z)$zVs~GRT~I&UH`xdP~|uja7BERSjjWNMJWQJm`;NN7i}_g2L}a=v2w+LNr)x zh%SLjhi}SkFitR~vx#5)jP*^7MkB&cMnG}uk@)FitH*86#{;dI4v!**X0P<|zbWk> z1k^@^t_`Q6Tsz*A;)FlI2W~{h$NI)Qupp7mHB*(?5F}7p5oR*8dB?pc^l=b^+4J*8 z3;IerLQR$7T1h&lB(PWMa11+Q5LEoiPU$0{vLTFto~NYby0Rk%!KM12hpv!f)PxvIpaW+mt=W1Hp2g@6rvZGvdFNKK?hUeYroA%wpOY8uB$;4sM4+ zuot1+)-v6+zwtoBA$1&X;HWo_BlR2+{dmuJqv&FAuKaZ66P2f`PFA0&IZ<=G=6J1s z9IG>ql5mVY$?fc|O|!$cKYW1WpgInPjpK0mu%vX$>d4f6sRyX|iT~tg2E0LrraH!X zufyeGtd~FpvzPAwmp=YCsr{WO>nI~cJx*B!F?f`Velrrjp?h1$uC_fbdz<$)?-!Dd zdh$t$u1Y?>X?fLk(|gtbWZ-$G737v6G$h-S`VlbL@qx9NE#E4QK8@5vKG0laovhdl(k05RY7^D;I+1 zX^Dj2((%8vZ^XCD{aJ+sstgMjAWALACT+5BbN9B+o$b5Z_K><$9SM@j{JyKbXM4%{ zocn3-72l=&iv{Nk=Zel2pDj64db;#<@QKi=(5W&xJcGg3`o@m>ei*I}kFz_z0ghAn zs%=9mb(TW&!SI9by#M3|hV)fPpo-wPc$`ilX!g?mBMtw_H1WTw_otf?q8uaaw6QK! z3c-nijlEkSxP$b&$YJZW&M=9jE|K`}THdg|=y)#YhUc31O5P>^#rz8e=L6?-j>Tt6 z;0Q@itf;H1rJ=FCz9%w(SH@^~Efgn_#*K>O7A2G09~eKg^*J+sX7Rmd*N+cw=-=G4 zwR>CFc0G2lZFed|MRH%WI6m6=?BG+DMZHgIkWzo#rK+BKQ=VEYD52~-l=XPMv+0%XzPcJh)D#+|L-g0 z5PRMFitV=Jmh-0ThDf0>^j<;|VJP}s;9lz|IH|0ox;3H*cGP#{6*LfMI0{42=ZOc$ z&t!ehtp8^5y=K>s4NnfPmux>W3d+%$AQ&GJbvgV~c4NQBOzACFm0nf^pLIPWQV7E< zafVO&F8JMJzM;b2U}r^pZA+xN2^s7_k9+ZJSrr*layX(S@qx*o$@-jG|IOli&8{Dd zPN2aX`{O#S2x3c;4>cl6{)11k^zT~Uu)b=0$u2SoL6Jhy=c*5}OnZx-KccD>>!8Vo_D!$Q!oCG!;9e~5R=H`$vZejgpa zs|Y^td`|RONukd0ve!M47b_qrDb!WbUeg**5bSK|MhbOJ9>S(PFn$s}m|=a+tp8^5 zy=K>s#jqfQ>xAFD4yTTUisFy(MfnzU%@2{lH__o&Y%d~%ae}%O%4nEN_Mwsd!J@v> z?(&YR_F4!wP|8|lQ1n=rLKzJo7(YFJddc-Uv;LdK_nKX=_zkUFg$8pdJFJg`sUu;> zkMN9={0ASiyCo9%Dx;u5Q0cKqA)_JLheq>Na3&JiQQcPC8i^BZQv~BuXrwhawIgQV zU#*{JSf4Yy-dMj{D!$k3dUE|qrcvF7B#(j%M#3NAr}syUfS)J{e9Q8sd4i7T^pVhY zov{#ps|f@_gM(e=9hK;CT}v`SU5|So96wqA%dkFY)_=43UbE}rH#{i>#fGHnaLP#d zXYl(jBj8urR}~3-10BAL4Binf7J@PoG8W>755EFJTT6S&kU-I4?1&+Q=y9jg;|Hd{ z!Y{-6oLT?P;(N`m7k*e^G+5b?B?N{3&uz7!{jHHP3bF8HnS z5zM8+tCa+b4r51}8`U`2rU-UENcofX(#!F)Te!6JcgT1AInu_Y6HR_7OkpJ;HmE`jLqB7zT4{tmO1wB-7nS^xF*tEJ<6 z&8~-^Sm0z0Hl9WdJEBHH;rGW_kZ+*DA2W*)3Dk91C-?$dEHX&;ur-G56rQ>_eu214OuffSPnK(t^_Zb%Cn@WQ} zWDI=A@+La0kAg3v#X3Rwt@VxMs}Ya{2}%O7BIvN#k$HmIu#@+geSfp{IkW4{;(N`m zho5eNl>{maB4ePwqL3tk{{>&}&+w=H7IT6@PY9&S)ViOzwpZx-)nZg;-_2S#4J!zoPQco6oudSlm+=zY48Vldl?eQ zC@7X#5hVNCSRTQwY6R><0=r}kL;@28Wj@{1^uYKnvw+7k%ZF&$*8k|aI=FXuTWoV| zaCm5VC^{UC#yF(!qsB2Z%AwtI z?DKCe-B`K4b}};2INm(oI^I6sG1fWO&CxT~%hAWty0v?K-{io=AXQeR&WcfEMX#dN;IYi|AzAZVTQXcTR8NRz`%ot-V|t>!!+pd3!vn(u(N)n^vB4N(Mkw8o z_GQbH&e^=_z|NAbWmA=#YBtoZkF0B$Y@BGGXqjl8XcL;_o#Rc@9oxDo#?rrGKow&d zn&6-u3oKPRme^QK4K1epmRZ1KndL*YZ0moF7mNg>RixReC&;j6sBNg7ki)KMcT`9U zOQG2x?PoZC-ge$~%zH3@x^QR7_RyA!sjAJ@o9i}(H%2zpuWww}G}#Qx#{F%(I=6Li z>Dk=3seeQNdML7tp(aw3$t+FzEwg~fGRuc#Eq{7hl;KLGka85mjYuLChgDe=!v7UX z!Z>@GV!vv6+CG zDGOvU)Jd)znl$CN%mN z#c?+0WbRDf!Tf!J>7qR)JA>QHww7`fr@K3Lkn(wJ_f+pDq)?}rEQwGw z<(KThXZq&FZ%9oaxcI!;U$V{Lxh`*Q0Vx7YAxK#i62+*pD8mNBmLbZb#GuGS$_LuZ zmK(Mwoo66;H1Cjqf5E;&VYn-ZB$jWfP!t;uHSbg9E4Hho*}5JZ43$3X+HCgymHWjq z3V19`A87h{3*L}5KQBcAB$;!NRDnXUMi@rPzXZcZ#zfKQmf=<=iC-bX>@Dll_KOfa zm3u7ji0@$j{=nYC>Eb;lyMsGJN)j6mHIe*T6%Wt}Dk)Sln4lHp6EuS{Rk-^WIzEDD8<3en+Uv@8a};c5skkOU zpiG19s^h%tjQdpXaqo=pkpDozzQEohVYnxVBsLtD@(c0u6-K(>AVoby3Ww_`w3T4UGS{a(n!JhxUbkP? z33^U=kLDfrAIKL?jx&Vc0k%i=@)cr7^uw?wIz9e1?Lm46KGVyLH!QP!NY?TfFN+c& z!KWfPEHWqrl@yXNfjuFP29_DV#ltb_wvfPUj!VvSB7+b-mN$bQAHbf-XgFPexba|8 z`HCch-CGjYBpZH9Iq(^PO#V-Hf9u5`=pM_5+!Mbszi6fX|m(Dlmnj$IsLTZ zUUi@4L+*i}D(jIyYDi!(suL7jq6iKN!7$4VQps4#43MH6D{|Fw(RnsO@TgC$37RbY z#DXN1uh8vCT#t=8jVXjo4t%B;C%eDhSNV|a`NhkktX2}JM#11Pso2qCouF<_YT@@C zEC_`{n8iq$1}PMB!F4uAWbh<1D0+-F5q_$GPjc}9!;a_-l{GO%klBIH^zwQ4xBDs| zau57eSr5@*T>_O3&l6N5A$m+A4M`O9J{tV0lECZe@MY)u6oRupTwLKN7KDR&wC?aQ zKME$K&=i2F17GGAmYF{AUiqoQMluE}36xRLkU`y+R4abpU`qxK)?QK)C^{^5WL^f3 zdym#15%*THARP1MD;5&mW*Pu&7QmYspH|ey?n#CK;(~5GG&2MMXFY#mVVN}A6gEPG zbqN$Zyoey_Vx&-rqzrvSyK8xYG4KXDd{x;IgCK=M;{3#dhy>0%+$7r(q>%1s4gfX_ z;5FgbY_XR>aLEATuuaS8V3<8WDeFNha?xPMK%=w~iBgkfFj@saDHQSv8zgU1DC9-G zPzXAFg;2UV*BLbqp2|HDQSzr-5Mv%4*WzS5BK$H30GkHzW(G+3ux1J1L9;{vG46#@ zKV``P;B24Y^s*?5AEn78CGTpI%PI*JOKfOyh2r;ZEQl%;!YoFYz^Cmp3SJ-(hcXvf zl2bZAV+2f*z+D|l1mnG5Isn)-fHyNg6+SGLDV7KzR_=vDaESomd*Nr4^)MumnRJ4n zl*toYO!_6gP{;?2fO@GP3WZp&D;>Us9f=c+^J5i8Ka4VPN(Oh%7lTXm-a=$Tx&U3% z0Nz9kX@7`^4-3Jx0mO0lLLrzo0J!{R$A^}$v$30;0Lw^HHJ|?KM->-r*`&4lVYZ19 zeD7JZNq?1^04rU9u4w>oT5O3w#>0m-j}?Mx0*K@8g))Yu4FGPO^9o2tW9H5)L{+Bj@cR>pa zeocfAYdLlZE(#z{xEE^V+)on#+`hZ?c-}?Vb^8m}S4bWr39$53N|KN&DFIe@V{lF0 zPS;WUIqOrFXR-L{(^_Sn=qY&7d(-)n?G5X@mQ3ldzPF&e8nH2{0l=mKyo*{$`9o6p zu!OcP3Ls8$FVqN{Srh==wYBm<;Dq;l&Qp$OY|j&LO9~d|zIp;IN!KAsPrj~wI#5;J z6By6k?mFN&Zk@ASwxmyM^-;F+S3c*z>VD4gqU|;7n@aZQQ*9-uzxr-UXZ2rF19TU! z|D~Q!(?If{Qo@I&{>*{^V&R)`FVsjhu^<4rcXRFTlEe8YbI;{mc0O&t3BwmjNJaV! zMbSur_3>~h(h?lZpUBEh%i^?FZ)eM|@^s)y@6$QYIqul*T3@%m1;hFD zEy=Q<+*#x2wn4Tjcb&hQlrscf*Il2Qq2jFvVj`eu04)}_M^5ZV0h7zn%3%W*8H9*I`6yU zx#7A6LDFAY->|$5LrKkMB&m@!LZ2?t?fC%R^y`1~=a*=}`16ABVeE+A4Z%bJ@dEck zu_Gw~zys?Ww^r^hIaqKMf@gCsx~@2`Id0fUfMva7eHn(L&rlpGYOASf3HAj>eUqLk z*DmKi#}WH+>l4;wJ^k3#s{$900s(UGr4U&d}bX!}-VZPUW6) zUvOPP3ZJ$=6K6DipK13f-upS7KgGt|>s-N=?-^~ut+ z1sC(Kc%II=>4c#DCG_|;E9tM|42>i|sohQfO%KqO^_PqRyvdg={yBO5G#);TCANEz zK@~tO=NGsaN)@w20Px^s>!!Nx6?;nd2WI@o;{-1`uOfxlZ92o-l8%M6Oyv#Xs@7m{ z;gElgS0^~_*iRB|(pp(hqstd8mq?6~WuP_bo6Z!T_h0fp<-YDR2uj+VFQLiv3_n=h z;bcwi6Ek9-8Rb*f3SSvusbvzQV)#M|~$Ccv=X$F2|+thK&SRYxDa`TdKnK zm2JVEqE-1LzA;5`n~Sto4$@j-PmT-2toa=;BY8=|1>Ys_Rdo0n2tJP%3qh6s3MnLo z7!1u`4`lsM$}gvc4{IKaQxROky-@0JsQ}=i@y-p6o9nh!><&&B?GGIBAIsAT&S6W0 zpp1oRGHIDg+A8Y9)y-ucC4GS*|7hNrcRez=%|%)(^jMd|6WQ`p>6^|5&ikK40!4>O zf0ZDp(qAEkl0xhue+`d!Z)hgazG@c|xVI1;7J|7VgJ*LtxGpI@W-KHvQ*lRmV{L6i zMO(1DcpxzBC#{ttxYf1Oxd%Nat(ATDUig)sDLhB&4Us_6VbWhIf}|c|Mx)YSSZ4*?H1oS;4uUKuAWwL92S8L6voEN=~VVMR!5g&moI;3f!)E!pGPe=q!k zr&aoz!=d$OyQlYNsxNg$MIz{33%o!81AM zDDQiD!be)B($N@SAAg&_s%i>g^S}1d0yN`i_x)C{EBl=ei(itw_sME{W-?nn=skg%uGU9?e_J zY*3d$2;LJv5&)9G?!4cSK+$0mQ8Jf#-thuj3_;Tle8F#sj`dA+Y)}cD(BQH?G6s@< zNMW7 zIj^6caO>wL$#G{K>qwyVaKBcUP~_LYCT}ffV2tUUen?OBKCJV|kv&)Kf^FE}7_6!e zMM|5B+6p@hx(j;p`}_m`RsKQWkZ(9|I4|mrdBd^#I*d?JZ^1zRpnu30^~Lf=@+z9+*{Zm7$_LbAIcvl z*C#AFMtq}rqsxjd`dzRet&df*cPr)NDSW;s-~tYTv1=%7-|Z(l(v?%m2{MJ7L!aYTHn#w&|KSA*-=Ib`jXz_zM}rZ z0S44T4oGrD%@ctAE+A$2jM3&;U9=Ma0V!5X(QX(PNa1^-C>$vuuLx#GZ@sK1-O!HO zx4R~CqyD}?XK`z=sjR*-TwPmRT~`^YXb_H}mJlh}#v=Xo?TwAib*HD+7_#i)dqzqqT zv?JDt|F@J}En_4sjTS40g(8PYVuE7U{8XWN%W>;o#}?P5XEblf-yi5H>JWyZ#`1>B zNL5`;U2P316jcHF`mTmnMX;{5roE~|7?yPfyLE~seZ?8R`a^MQv@6yUtH*n}YB)|1 z?`9a{<5##^*Ji~qYksQGyf$ZL3~v{vqczwXLYqm8N&)%ss1%T|@0cejQaI1BD=1~+J4xQU zjWkXB;*|7$=zZ1p+DjTqj=O~vs6zRG;yC{fGk#OJ1#M7m?@ix62?D7yO{VD`iEm)s zDY>7cy(8_hrf5A;9My4xVoOS+B86g6@bnWoya#@&&^!`&)V9yD!?ihQooCEDnm6oU zRUn!iX9&M|0eP&5Dj<&xHjpBrPDx>cq4ZyBBMjg`wh4;6=!M=>A$3s0lVqk<3j1i{tbs9(lHG+7vul2;Xw532(5jDo6wyg?8t z6ir5#_5NGP1lvk_-8C@U3`L=|NA-d|rt5Sp-=~Gih8ui~x9@gr(+TFvNEr1GB8C0%TU0>a zkill1U@cNuliYuCOlgMDAW|?!;?9jw#KY-rihb|B3$AkzBdq|cX z(?(Z~_QWV3j|PhbQoI`-u3AK}G;4l~3eBU#6q;u~y_w}5A-Ebnj>2zI0eN&dl^}Y& zu>W2ppoY=B13i*H;=Lgnd+7}JJ##ChkVpq+c zI-_MVnotkJZsFGpKhfaEAyqzJbU1~e%&N2Jmr`h6Ng##h83lJbx4Je#Q06mhyd#QV zynsBj7+nH;gdl}~4H-m_7xzCGU8`iQYM}~9zlO)`$M`org9L5`7d0jvIUZ{OIjw_o z`{K(N{AF?c@1pjC$#t8UjN^xDIpw3PPyiWcM!5q8vuPpuLNqBl@QW9a$A-j6Z!8ih zI^3C%!6oaT=P(eJQ00U59!UZFB}$f^=W#$DC)!Z&e7ncx$fbW~FYyoBhxy)sJxz}P zO=>?wfTR}%Ex0v8YMTbZbgtleJ-Q4_#XHzC^WAm@R*fC^XRZxVntB#TcUtG zPsE*AVx`4P_ur-YWY!`6Yf{$!HX3%4DfK!=!Dhhb+dM9($FlVG`WeH&YH|E;Qu|W& z{bknW=6Lwt&L}v*;9HFm@oKi7{VfNZ4mBQbJlt@^IHc@H{fu!$W;kYhV)dIM=4Cx7 z5$KyOnhnmD&sK6&%~sFWsH1jPKk8=n!!&X1tp1Kn{dY;~E8YL^9o4^cs()jc=0=7l zzb)74%yhk+oA<9;9RC|{Uo6y;2?7VVb?@q!Zrj_kzxhD(fu{7uKZukZtZVIVjMk4u zGMDw(r@d#n;kxLZ^PdhpQFN;0Wa)|EiO}(~AdQG(tF-Fmw&e4OyTLGCyGy%oGLw8 zdNO!2bTV|JjN`%Z1CH+hXY_9lf!c)*4`W9nOfgK?|4rk6rTq*E0;3xSrh0`S5v1*V z5)36OZJ(0KgSt$vF6ymnZ)k3B>}eQ);b>%SBwbmL?Zkn-XL-r?oa5=7E1pZ{BjY2dq)DK9m zDHtg0tZi#-Zfopn=&N5)~oTuDZaxZzG%)8*b;5#Q2l{B7x zVEjzh=gj(V7T;@j{mA6dhE<#TMTdppeDMl3A|@no@0fq6xVNmcx;@e&1UnkKk-~vE z!%-NDJ|m0dFQL$i_NL{oB6!nz9Vxu(Q4G;$T@s=A!1$T0&zbe#EWX$5`jN@u^@Eb_ zXNw{j9|_};;7KDQWhn~#gWVM!wQUg|Pa%jNtFj*TLrM-Q>miaz{tZ=VMWfJ)^)@ni z(|JQlq0Uf7M3Kbv4~(D5`kYz+&Ek8_t{<6*t{d7oKh@?0UsdG#DLbCZiwoBcZ`@4`nF|28;VbNMKESSSQ%j&Pdpe6zZBRk1jDS zQD{Yb!}2QW-Qxt66zZC+7(Ot53krBFT%R-Rzgc{*+4UpiG0F8P{N{BybtF^_CrIi% zRMcPEQ{GjD4u@MCL|uRbt-M*Qj$>vLxPH;eBzyPhO}!cVs$$)n(ckx-ST2&^jZ3nGD) z=&&TTPY_gk+#_Y8;FnTp1s#6bAgD&dxD-A>exLC4w&ePpS^v%Ad(Ezg-{i0m6dR&O z!MGjPrO=pF@0#?kk)-0H!SS*lZDL0nnnjB#N&!L9<6ihB6%Uoiui5p&j~UFo1~1fMWlK6m{@^!Ms5CfU)&m`0L=b){ zg;tP2DVC%Pttf&rpSkIL=7I9dus&zje_8*_6yIxhJ^XYFJg>pV(}-b5pioLT?%_0y%}d(Ey-;%C^9#X1Z@;it+{ z6j7EUsK!9CB9!&e39@S|<6xWOmr`g&%3LTN)(O6VEk*_(D8CHrb7uWF)}K?#`zFQr znq3b+-2#gaZ;IQH#X7uAm8Hm6WhqMZ!ZM73l=X<~uso3{f{NdQLMzN;UMDtRMnTGi z*c8DB$}hwEoLT?p*Po^Qa-zI%BEHw`dc{w-z{V^vZif?3BTA1IKYx4#OqM{=VHpKw zE(1Yri9#!J9ZnE@VEkNWJr^^g2nKkjYIcwJ;O_xJjh#TtDkK0jjP-E@;F zQv;9sxr(^cza z54pYW`Sq`wXz)i^k@a|V{T{xAUpY1;fHOzVy%YR5?Vq&&!Ir~II2Mc`%D;Cn;a7pD zK`}0kUV=qz_eS8i+OM_0W7Fo#xF7AoF(`Czmp}UN7Jijza0wFVLx-LBM)0@VZ?vCj zKi0m11fIrqrVCO2y?RyS#Z!p}2XSUAK!@G;M)2RYf6@Lw?XTIikwB8e$`}8k54wAh zH`w48LIMl1B3?#8NA?rqf7kw1`=$0j+4lMaT;R@2L=1k@4}I9@gCi*XWCScj@k9cB zxG>)X!GG8OP5Vbiz`q~`_B(Mm^dVfg%sp2hTo=C3mge~du_1+6kvv907nV4ybp_$~ zf3^Ru{T16@UqJ$&#GiZv4wyLdf1%y_oqf2@Pewq|U|j;eY81?h;D2bp)&7||^nXW# zzd@kFO{~ZsB8wzY!PI&1ch<>ee2|N7gwW3_4q>(M7*!!@J$Q{r7M+v#n!Kj$1#U|L zx?j#`RH{!&`JAJcu+8R1u7iva@($xY#=A_~e+W)nAy^v!_Chh0WUC4UX=Tu_UDUTM z_}$fpEPj{O%j~C=`y>=oD1C~9VYOzrFeVf*J_yD6_Xx{J#I>$h#h;siKp)o%F3Z>F$>^?lmEMl4mFVc@uvBr2T|vw;Ak3ovqaA zhoQq`%~hXNz^GUJYwhnTNOYTbOt8jP4YFK$$hIhoUhdhW^m3p-#+@%g&yWwSMlJvA z>woFaoAmED+W*02`+00`mlm;>JBpmX9Jkx)L7Q{=EdRf1zu@WaE6faMh&LWkH>(i5 zPFXLs-rt&;$yfS3PurOr%Uu`Q5bY^JB$C(v7N0lyHuCiGV;qhz5x&`_HCU_crOtpW zPcgK6EV(i}{GSkhTZOPru~IF85#@ZmEce{wVui_f`JJ4$GdC6~^v4;>(?|09-@@}I zUrN@menQE=yV?_Ehv?E8tu?kXq%hy*g`vwM>ob3^{W(_TRqZk|xRnsaK0;?|5&gnP zOnu`hY$%UgNnt(=gQZzvS~K{nOX4{wq(vuOnXvl$B^^ z&Rk^=If|VHE`5FBZ*U9v7SG4Gu+=k!$*x0>dk7*7o3#GWp0+V^=b*<@Q77H{-~4%# zE{J^nf~U9d6ZUhP0Ly8f-8!{qOT=1jFL#tW<@xxZ(BQvB0>4U_>`g-8Bt&cjq0|FN z;UoJJ+vh@$dL*wEj?^wlWcOCipF0|I^Q#bYWh;B=GEIbal}Sn2N(tSIH?u91#oh8fQkp}>#yO9+YlEw%n9=S^Po@+IppZ!^N3X9a7D=Z#LS z6@qg9H_UND>(zfY4Q*~;xB6wzjW(=rfYz@eEkD+yg%nT<6~wi z*IB{Zk9NvTuhsG&nv9VD$SnIOtg(I%#_#i<=O`4T6MV$^NnJ(Vf-GbG?>AVGzr#ZO z06On7OSwWalxZwP6oSI$yBv6uLNU-}&X56Qd!N;%11RfK3OyDtr#{?b| zcue3ifyV?M6L?JEF@eVf9us&>;4y*61RfK3OyG+mu!_i*AF+D=E4&>4g_X>IV?F)1 ztZC}Ue`r(oyLkBgoE6UBEI9igtbP86>DT@ayBS|4Wr(zquKlByPgyzr6?-81XB(fZ zw{G6P=@pclX({l#df3(eGCnFlhT=~-`wPYSpA~7dum4r(wZiauq%d9kHg`3v^l!2^@?-W$ z{vKZhxl4N6au*+smzCX|Q|wH9#17b>(v!cYFMo&E_s=!0)ang6y6NNdNYQuD_5X?_ z{VhHH8GZeQY4`p!R~^KYW$jY!7uqd7w0;{Otk2kE`fGMFrEUEV{*!6@56c@Y?B(o6 zlV3v8zCpCh57`a-D@ft{63BH|*}K?}y@VIgN9=NcpWX1kZeA8PncmfCUyt~y&6 zzEMxGFZMdSkl#X9eu%XErD^ZP^Zu(U{@0qm{W7hDNye z|GS`lER^QR z_YrxLw?C)YS?;K@HCS7b!Kmr?&m8}&+7}DV_@G&H_>gM6#s_#_ zk(#hGV!s~sm(EY}Z}=^_E+nu39j>s~#tE8rPxn8~5dSOfOMHn>bJ+MNh|hfowrd>U zQz7`Mzm~f`w*umOmvsx5yFRx<;(PC#;d0mKR!n^F{SsW}`rHbN@4Y{Q%UqvZQSrU^ zLvWevb1N*q_udIEbA4{b#rNJT!DX(`t-$!+dn35a^|=)p-+M0vW%Xbg>vJnKzBhY< z%UqwEkH1`D@x9p+T;}@ReEj8#i|@^j;4;_e^!Upa8sD1@!T9?9{jSdmzZD$cYfe!3 zEn|Id#mD!W5nSf_-0yvS?=lN`EVF!w+1-8k&o8rp$1=-@Jp6pjzTq+pcr3Ggh}pM$ z_|Gr1fX6b+hdlgz%)a3=3wSKEe2Ce%d-%^Uvw(;G_;TOnLmqxUX5TQ`fzRx{Y0k^^ zaT(=9(zN)9{VCOf@1DQBF@3zx@*(&9c@NzMiyipx=_?Aq)bxS(borzI=5!8x_s}ch zw=jL+JzW0izcsxBU)KE+e)0f)pXEcc{+x&QjY~Q3W%-4dN*|cz<&XZ`mUiIFs&59r z#pwgHy8O|9Qzi$#EP9tIeP9-sKl*RU?7(N*zj=P?(g&Je{^-BK)PXN^7j=I3T|Ojp z;}64GvjAYT0A7^N!kEn+zT}xPF8LJuxS8qW`1h@+2#t^Bp4@605R@`QkiAt1^_3o|D_G!&CE}Q z4{L6Zv(y@a;GzIx9$To6F4+zU08XMbe| z054wuTM)o&#!rP0iv$+gtE`O>ObH;4yBDt90N`}%e<=aH=KKgB);x|JpR?3fV`)ND zlLCn2?u8oFMpkA3aQgMXqyXN`{0#S_Tr1Uj0**3Ut)*E7O2-3;nhYnZR!ir!#fH>h^s1ZT95(9vjUjK^+@S5=x4JCY7%e5ha`OZ>C z!HA^=8B_tp3HL&o#jL~t;7sd(Du6c|eky!eLjt|7LPyA6ZLL=X2_QD~o63VLGXOaA z`k!eDT7CaWg%4{UyW5@PcNRO!Z8g>gODi%MO>!^PsJOK90)Usj{%7pJp}`i93kfVh zhbwHgIzizZ5C4_=p(`%{IJ0^_KKq!!V*-x}JSOm%z+(cB2|On7n80HKj|n^`@R-13 z0*?tiCh(ZRV*(E`0T(+HA+`>yIKn{N$8LL-E8zBc?74{^mck=s`u%&5y)w2Y^OkCVQG!4R&MF$?n6odH*B|&DiE?b?m)a8evBiAbLEMT+Dynk(IZUsDKdpB+S zX%hrmG@jASdl02e(bU~+G-qS&LUTSju(ce=}CNXufPO> z(!(4a^2BDU^i@Bj=++-n_~$Ffw&nY zwZ9@01lkz6bI@bNOXTAh6oL9a1z~Pem}466M+jOY_G+w2X=ZwQ`uxT5zbWm@2=Vun zI$BW)0>$>Z&|~9n+1>uojm3Uhv9LCUegMT`b{wCGea zm}p=3ulX5wXoV#RwBi!#6n=($e1-(Rj1JFId}dS;Y_Qd4%WrY~Z@his_iM`K{S8+B zdz6iOhtfUQD4#OJjA@E80*~-ZdPs@T-%?Id52zd8S_bzKp*334zg|Kx1D3Zt&sTM zjEFLxT;}@Riiz*d2r0wKWvvJnIzBetXCH}a~^|=)q-@62?G=H97f4Q&qIeGS_yzk13?@fa$ z?Vsk?U+!ytPWY+vzAG=jH!ZG3f7I7s?t6W1MaTCpf|ceE`ufW+*+cqotN{KID_Cz( xTKj5zb!khQD^mWn!sB~W`kMBaWft&Q@$tQBp(Xusnd@`L{x9WyS7dze{|`Wm#zg=C literal 0 HcmV?d00001 diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaSearchTexture.bin b/src/Ryujinx.Graphics.Rdna3Vulkan/Effects/Textures/SmaaSearchTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..db5bf73f7d5a0b5e436d336849c90bfbc24d76dc GIT binary patch literal 1024 zcmezOkD ShaderStageFlags.VertexBit, + ShaderStage.Geometry => ShaderStageFlags.GeometryBit, + ShaderStage.TessellationControl => ShaderStageFlags.TessellationControlBit, + ShaderStage.TessellationEvaluation => ShaderStageFlags.TessellationEvaluationBit, + ShaderStage.Fragment => ShaderStageFlags.FragmentBit, + ShaderStage.Compute => ShaderStageFlags.ComputeBit, + _ => LogInvalidAndReturn(stage, nameof(ShaderStage), (ShaderStageFlags)0), + }; + } + + public static PipelineStageFlags ConvertToPipelineStageFlags(this ShaderStage stage) + { + return stage switch + { + ShaderStage.Vertex => PipelineStageFlags.VertexShaderBit, + ShaderStage.Geometry => PipelineStageFlags.GeometryShaderBit, + ShaderStage.TessellationControl => PipelineStageFlags.TessellationControlShaderBit, + ShaderStage.TessellationEvaluation => PipelineStageFlags.TessellationEvaluationShaderBit, + ShaderStage.Fragment => PipelineStageFlags.FragmentShaderBit, + ShaderStage.Compute => PipelineStageFlags.ComputeShaderBit, + _ => LogInvalidAndReturn(stage, nameof(ShaderStage), (PipelineStageFlags)0), + }; + } + + public static ShaderStageFlags Convert(this ResourceStages stages) + { + ShaderStageFlags stageFlags = stages.HasFlag(ResourceStages.Compute) + ? ShaderStageFlags.ComputeBit + : ShaderStageFlags.None; + + if (stages.HasFlag(ResourceStages.Vertex)) + { + stageFlags |= ShaderStageFlags.VertexBit; + } + + if (stages.HasFlag(ResourceStages.TessellationControl)) + { + stageFlags |= ShaderStageFlags.TessellationControlBit; + } + + if (stages.HasFlag(ResourceStages.TessellationEvaluation)) + { + stageFlags |= ShaderStageFlags.TessellationEvaluationBit; + } + + if (stages.HasFlag(ResourceStages.Geometry)) + { + stageFlags |= ShaderStageFlags.GeometryBit; + } + + if (stages.HasFlag(ResourceStages.Fragment)) + { + stageFlags |= ShaderStageFlags.FragmentBit; + } + + return stageFlags; + } + + public static DescriptorType Convert(this ResourceType type) + { + return type switch + { + ResourceType.UniformBuffer => DescriptorType.UniformBuffer, + ResourceType.StorageBuffer => DescriptorType.StorageBuffer, + ResourceType.Texture => DescriptorType.SampledImage, + ResourceType.Sampler => DescriptorType.Sampler, + ResourceType.TextureAndSampler => DescriptorType.CombinedImageSampler, + ResourceType.Image => DescriptorType.StorageImage, + ResourceType.BufferTexture => DescriptorType.UniformTexelBuffer, + ResourceType.BufferImage => DescriptorType.StorageTexelBuffer, + _ => throw new ArgumentException($"Invalid resource type \"{type}\"."), + }; + } + + public static SamplerAddressMode Convert(this AddressMode mode) + { + return mode switch + { + AddressMode.Clamp => SamplerAddressMode.ClampToEdge, // TODO: Should be clamp. + AddressMode.Repeat => SamplerAddressMode.Repeat, + AddressMode.MirrorClamp => SamplerAddressMode.ClampToEdge, // TODO: Should be mirror clamp. + AddressMode.MirrorClampToEdge => SamplerAddressMode.MirrorClampToEdgeKhr, + AddressMode.MirrorClampToBorder => SamplerAddressMode.ClampToBorder, // TODO: Should be mirror clamp to border. + AddressMode.ClampToBorder => SamplerAddressMode.ClampToBorder, + AddressMode.MirroredRepeat => SamplerAddressMode.MirroredRepeat, + AddressMode.ClampToEdge => SamplerAddressMode.ClampToEdge, + _ => LogInvalidAndReturn(mode, nameof(AddressMode), SamplerAddressMode.ClampToEdge), // TODO: Should be clamp. + }; + } + + public static BlendFactor Convert(this GAL.BlendFactor factor) + { + return factor switch + { + GAL.BlendFactor.Zero or GAL.BlendFactor.ZeroGl => BlendFactor.Zero, + GAL.BlendFactor.One or GAL.BlendFactor.OneGl => BlendFactor.One, + GAL.BlendFactor.SrcColor or GAL.BlendFactor.SrcColorGl => BlendFactor.SrcColor, + GAL.BlendFactor.OneMinusSrcColor or GAL.BlendFactor.OneMinusSrcColorGl => BlendFactor.OneMinusSrcColor, + GAL.BlendFactor.SrcAlpha or GAL.BlendFactor.SrcAlphaGl => BlendFactor.SrcAlpha, + GAL.BlendFactor.OneMinusSrcAlpha or GAL.BlendFactor.OneMinusSrcAlphaGl => BlendFactor.OneMinusSrcAlpha, + GAL.BlendFactor.DstAlpha or GAL.BlendFactor.DstAlphaGl => BlendFactor.DstAlpha, + GAL.BlendFactor.OneMinusDstAlpha or GAL.BlendFactor.OneMinusDstAlphaGl => BlendFactor.OneMinusDstAlpha, + GAL.BlendFactor.DstColor or GAL.BlendFactor.DstColorGl => BlendFactor.DstColor, + GAL.BlendFactor.OneMinusDstColor or GAL.BlendFactor.OneMinusDstColorGl => BlendFactor.OneMinusDstColor, + GAL.BlendFactor.SrcAlphaSaturate or GAL.BlendFactor.SrcAlphaSaturateGl => BlendFactor.SrcAlphaSaturate, + GAL.BlendFactor.Src1Color or GAL.BlendFactor.Src1ColorGl => BlendFactor.Src1Color, + GAL.BlendFactor.OneMinusSrc1Color or GAL.BlendFactor.OneMinusSrc1ColorGl => BlendFactor.OneMinusSrc1Color, + GAL.BlendFactor.Src1Alpha or GAL.BlendFactor.Src1AlphaGl => BlendFactor.Src1Alpha, + GAL.BlendFactor.OneMinusSrc1Alpha or GAL.BlendFactor.OneMinusSrc1AlphaGl => BlendFactor.OneMinusSrc1Alpha, + GAL.BlendFactor.ConstantColor => BlendFactor.ConstantColor, + GAL.BlendFactor.OneMinusConstantColor => BlendFactor.OneMinusConstantColor, + GAL.BlendFactor.ConstantAlpha => BlendFactor.ConstantAlpha, + GAL.BlendFactor.OneMinusConstantAlpha => BlendFactor.OneMinusConstantAlpha, + _ => LogInvalidAndReturn(factor, nameof(GAL.BlendFactor), BlendFactor.Zero), + }; + } + + public static BlendOp Convert(this AdvancedBlendOp op) + { + return op switch + { + AdvancedBlendOp.Zero => BlendOp.ZeroExt, + AdvancedBlendOp.Src => BlendOp.SrcExt, + AdvancedBlendOp.Dst => BlendOp.DstExt, + AdvancedBlendOp.SrcOver => BlendOp.SrcOverExt, + AdvancedBlendOp.DstOver => BlendOp.DstOverExt, + AdvancedBlendOp.SrcIn => BlendOp.SrcInExt, + AdvancedBlendOp.DstIn => BlendOp.DstInExt, + AdvancedBlendOp.SrcOut => BlendOp.SrcOutExt, + AdvancedBlendOp.DstOut => BlendOp.DstOutExt, + AdvancedBlendOp.SrcAtop => BlendOp.SrcAtopExt, + AdvancedBlendOp.DstAtop => BlendOp.DstAtopExt, + AdvancedBlendOp.Xor => BlendOp.XorExt, + AdvancedBlendOp.Plus => BlendOp.PlusExt, + AdvancedBlendOp.PlusClamped => BlendOp.PlusClampedExt, + AdvancedBlendOp.PlusClampedAlpha => BlendOp.PlusClampedAlphaExt, + AdvancedBlendOp.PlusDarker => BlendOp.PlusDarkerExt, + AdvancedBlendOp.Multiply => BlendOp.MultiplyExt, + AdvancedBlendOp.Screen => BlendOp.ScreenExt, + AdvancedBlendOp.Overlay => BlendOp.OverlayExt, + AdvancedBlendOp.Darken => BlendOp.DarkenExt, + AdvancedBlendOp.Lighten => BlendOp.LightenExt, + AdvancedBlendOp.ColorDodge => BlendOp.ColordodgeExt, + AdvancedBlendOp.ColorBurn => BlendOp.ColorburnExt, + AdvancedBlendOp.HardLight => BlendOp.HardlightExt, + AdvancedBlendOp.SoftLight => BlendOp.SoftlightExt, + AdvancedBlendOp.Difference => BlendOp.DifferenceExt, + AdvancedBlendOp.Minus => BlendOp.MinusExt, + AdvancedBlendOp.MinusClamped => BlendOp.MinusClampedExt, + AdvancedBlendOp.Exclusion => BlendOp.ExclusionExt, + AdvancedBlendOp.Contrast => BlendOp.ContrastExt, + AdvancedBlendOp.Invert => BlendOp.InvertExt, + AdvancedBlendOp.InvertRGB => BlendOp.InvertRgbExt, + AdvancedBlendOp.InvertOvg => BlendOp.InvertOvgExt, + AdvancedBlendOp.LinearDodge => BlendOp.LineardodgeExt, + AdvancedBlendOp.LinearBurn => BlendOp.LinearburnExt, + AdvancedBlendOp.VividLight => BlendOp.VividlightExt, + AdvancedBlendOp.LinearLight => BlendOp.LinearlightExt, + AdvancedBlendOp.PinLight => BlendOp.PinlightExt, + AdvancedBlendOp.HardMix => BlendOp.HardmixExt, + AdvancedBlendOp.Red => BlendOp.RedExt, + AdvancedBlendOp.Green => BlendOp.GreenExt, + AdvancedBlendOp.Blue => BlendOp.BlueExt, + AdvancedBlendOp.HslHue => BlendOp.HslHueExt, + AdvancedBlendOp.HslSaturation => BlendOp.HslSaturationExt, + AdvancedBlendOp.HslColor => BlendOp.HslColorExt, + AdvancedBlendOp.HslLuminosity => BlendOp.HslLuminosityExt, + _ => LogInvalidAndReturn(op, nameof(AdvancedBlendOp), BlendOp.Add), + }; + } + + public static BlendOp Convert(this GAL.BlendOp op) + { + return op switch + { + GAL.BlendOp.Add or GAL.BlendOp.AddGl => BlendOp.Add, + GAL.BlendOp.Subtract or GAL.BlendOp.SubtractGl => BlendOp.Subtract, + GAL.BlendOp.ReverseSubtract or GAL.BlendOp.ReverseSubtractGl => BlendOp.ReverseSubtract, + GAL.BlendOp.Minimum or GAL.BlendOp.MinimumGl => BlendOp.Min, + GAL.BlendOp.Maximum or GAL.BlendOp.MaximumGl => BlendOp.Max, + _ => LogInvalidAndReturn(op, nameof(GAL.BlendOp), BlendOp.Add), + }; + } + + public static BlendOverlapEXT Convert(this AdvancedBlendOverlap overlap) + { + return overlap switch + { + AdvancedBlendOverlap.Uncorrelated => BlendOverlapEXT.UncorrelatedExt, + AdvancedBlendOverlap.Disjoint => BlendOverlapEXT.DisjointExt, + AdvancedBlendOverlap.Conjoint => BlendOverlapEXT.ConjointExt, + _ => LogInvalidAndReturn(overlap, nameof(AdvancedBlendOverlap), BlendOverlapEXT.UncorrelatedExt), + }; + } + + public static CompareOp Convert(this GAL.CompareOp op) + { + return op switch + { + GAL.CompareOp.Never or GAL.CompareOp.NeverGl => CompareOp.Never, + GAL.CompareOp.Less or GAL.CompareOp.LessGl => CompareOp.Less, + GAL.CompareOp.Equal or GAL.CompareOp.EqualGl => CompareOp.Equal, + GAL.CompareOp.LessOrEqual or GAL.CompareOp.LessOrEqualGl => CompareOp.LessOrEqual, + GAL.CompareOp.Greater or GAL.CompareOp.GreaterGl => CompareOp.Greater, + GAL.CompareOp.NotEqual or GAL.CompareOp.NotEqualGl => CompareOp.NotEqual, + GAL.CompareOp.GreaterOrEqual or GAL.CompareOp.GreaterOrEqualGl => CompareOp.GreaterOrEqual, + GAL.CompareOp.Always or GAL.CompareOp.AlwaysGl => CompareOp.Always, + _ => LogInvalidAndReturn(op, nameof(GAL.CompareOp), CompareOp.Never), + }; + } + + public static CullModeFlags Convert(this Face face) + { + return face switch + { + Face.Back => CullModeFlags.BackBit, + Face.Front => CullModeFlags.FrontBit, + Face.FrontAndBack => CullModeFlags.FrontAndBack, + _ => LogInvalidAndReturn(face, nameof(Face), CullModeFlags.BackBit), + }; + } + + public static FrontFace Convert(this GAL.FrontFace frontFace) + { + // Flipped to account for origin differences. + return frontFace switch + { + GAL.FrontFace.Clockwise => FrontFace.CounterClockwise, + GAL.FrontFace.CounterClockwise => FrontFace.Clockwise, + _ => LogInvalidAndReturn(frontFace, nameof(GAL.FrontFace), FrontFace.Clockwise), + }; + } + + public static IndexType Convert(this GAL.IndexType type) + { + return type switch + { + GAL.IndexType.UByte => IndexType.Uint8Ext, + GAL.IndexType.UShort => IndexType.Uint16, + GAL.IndexType.UInt => IndexType.Uint32, + _ => LogInvalidAndReturn(type, nameof(GAL.IndexType), IndexType.Uint16), + }; + } + + public static Filter Convert(this MagFilter filter) + { + return filter switch + { + MagFilter.Nearest => Filter.Nearest, + MagFilter.Linear => Filter.Linear, + _ => LogInvalidAndReturn(filter, nameof(MagFilter), Filter.Nearest), + }; + } + + public static (Filter, SamplerMipmapMode) Convert(this MinFilter filter) + { + return filter switch + { + MinFilter.Nearest => (Filter.Nearest, SamplerMipmapMode.Nearest), + MinFilter.Linear => (Filter.Linear, SamplerMipmapMode.Nearest), + MinFilter.NearestMipmapNearest => (Filter.Nearest, SamplerMipmapMode.Nearest), + MinFilter.LinearMipmapNearest => (Filter.Linear, SamplerMipmapMode.Nearest), + MinFilter.NearestMipmapLinear => (Filter.Nearest, SamplerMipmapMode.Linear), + MinFilter.LinearMipmapLinear => (Filter.Linear, SamplerMipmapMode.Linear), + _ => LogInvalidAndReturn(filter, nameof(MinFilter), (Filter.Nearest, SamplerMipmapMode.Nearest)), + }; + } + + public static PrimitiveTopology Convert(this GAL.PrimitiveTopology topology) + { + return topology switch + { + GAL.PrimitiveTopology.Points => PrimitiveTopology.PointList, + GAL.PrimitiveTopology.Lines => PrimitiveTopology.LineList, + GAL.PrimitiveTopology.LineStrip => PrimitiveTopology.LineStrip, + GAL.PrimitiveTopology.Triangles => PrimitiveTopology.TriangleList, + GAL.PrimitiveTopology.TriangleStrip => PrimitiveTopology.TriangleStrip, + GAL.PrimitiveTopology.TriangleFan => PrimitiveTopology.TriangleFan, + GAL.PrimitiveTopology.LinesAdjacency => PrimitiveTopology.LineListWithAdjacency, + GAL.PrimitiveTopology.LineStripAdjacency => PrimitiveTopology.LineStripWithAdjacency, + GAL.PrimitiveTopology.TrianglesAdjacency => PrimitiveTopology.TriangleListWithAdjacency, + GAL.PrimitiveTopology.TriangleStripAdjacency => PrimitiveTopology.TriangleStripWithAdjacency, + GAL.PrimitiveTopology.Patches => PrimitiveTopology.PatchList, + GAL.PrimitiveTopology.Polygon => PrimitiveTopology.TriangleFan, + GAL.PrimitiveTopology.Quads => throw new NotSupportedException("Quad topology is not available in Vulkan."), + GAL.PrimitiveTopology.QuadStrip => throw new NotSupportedException("QuadStrip topology is not available in Vulkan."), + _ => LogInvalidAndReturn(topology, nameof(GAL.PrimitiveTopology), PrimitiveTopology.TriangleList), + }; + } + + public static StencilOp Convert(this GAL.StencilOp op) + { + return op switch + { + GAL.StencilOp.Keep or GAL.StencilOp.KeepGl => StencilOp.Keep, + GAL.StencilOp.Zero or GAL.StencilOp.ZeroGl => StencilOp.Zero, + GAL.StencilOp.Replace or GAL.StencilOp.ReplaceGl => StencilOp.Replace, + GAL.StencilOp.IncrementAndClamp or GAL.StencilOp.IncrementAndClampGl => StencilOp.IncrementAndClamp, + GAL.StencilOp.DecrementAndClamp or GAL.StencilOp.DecrementAndClampGl => StencilOp.DecrementAndClamp, + GAL.StencilOp.Invert or GAL.StencilOp.InvertGl => StencilOp.Invert, + GAL.StencilOp.IncrementAndWrap or GAL.StencilOp.IncrementAndWrapGl => StencilOp.IncrementAndWrap, + GAL.StencilOp.DecrementAndWrap or GAL.StencilOp.DecrementAndWrapGl => StencilOp.DecrementAndWrap, + _ => LogInvalidAndReturn(op, nameof(GAL.StencilOp), StencilOp.Keep), + }; + } + + public static ComponentSwizzle Convert(this SwizzleComponent swizzleComponent) + { + return swizzleComponent switch + { + SwizzleComponent.Zero => ComponentSwizzle.Zero, + SwizzleComponent.One => ComponentSwizzle.One, + SwizzleComponent.Red => ComponentSwizzle.R, + SwizzleComponent.Green => ComponentSwizzle.G, + SwizzleComponent.Blue => ComponentSwizzle.B, + SwizzleComponent.Alpha => ComponentSwizzle.A, + _ => LogInvalidAndReturn(swizzleComponent, nameof(SwizzleComponent), ComponentSwizzle.Zero), + }; + } + + public static ImageType Convert(this Target target) + { + return target switch + { + Target.Texture1D or + Target.Texture1DArray or + Target.TextureBuffer => ImageType.Type1D, + Target.Texture2D or + Target.Texture2DArray or + Target.Texture2DMultisample or + Target.Cubemap or + Target.CubemapArray => ImageType.Type2D, + Target.Texture3D => ImageType.Type3D, + _ => LogInvalidAndReturn(target, nameof(Target), ImageType.Type2D), + }; + } + + public static ImageViewType ConvertView(this Target target) + { + return target switch + { + Target.Texture1D => ImageViewType.Type1D, + Target.Texture2D or Target.Texture2DMultisample => ImageViewType.Type2D, + Target.Texture3D => ImageViewType.Type3D, + Target.Texture1DArray => ImageViewType.Type1DArray, + Target.Texture2DArray => ImageViewType.Type2DArray, + Target.Cubemap => ImageViewType.TypeCube, + Target.CubemapArray => ImageViewType.TypeCubeArray, + _ => LogInvalidAndReturn(target, nameof(Target), ImageViewType.Type2D), + }; + } + + public static ImageAspectFlags ConvertAspectFlags(this Format format) + { + return format switch + { + Format.D16Unorm or Format.D32Float or Format.X8UintD24Unorm => ImageAspectFlags.DepthBit, + Format.S8Uint => ImageAspectFlags.StencilBit, + Format.D24UnormS8Uint or + Format.D32FloatS8Uint or + Format.S8UintD24Unorm => ImageAspectFlags.DepthBit | ImageAspectFlags.StencilBit, + _ => ImageAspectFlags.ColorBit, + }; + } + + public static ImageAspectFlags ConvertAspectFlags(this Format format, DepthStencilMode depthStencilMode) + { + return format switch + { + Format.D16Unorm or Format.D32Float or Format.X8UintD24Unorm => ImageAspectFlags.DepthBit, + Format.S8Uint => ImageAspectFlags.StencilBit, + Format.D24UnormS8Uint or + Format.D32FloatS8Uint or + Format.S8UintD24Unorm => depthStencilMode == DepthStencilMode.Stencil ? ImageAspectFlags.StencilBit : ImageAspectFlags.DepthBit, + _ => ImageAspectFlags.ColorBit, + }; + } + + public static LogicOp Convert(this LogicalOp op) + { + return op switch + { + LogicalOp.Clear => LogicOp.Clear, + LogicalOp.And => LogicOp.And, + LogicalOp.AndReverse => LogicOp.AndReverse, + LogicalOp.Copy => LogicOp.Copy, + LogicalOp.AndInverted => LogicOp.AndInverted, + LogicalOp.Noop => LogicOp.NoOp, + LogicalOp.Xor => LogicOp.Xor, + LogicalOp.Or => LogicOp.Or, + LogicalOp.Nor => LogicOp.Nor, + LogicalOp.Equiv => LogicOp.Equivalent, + LogicalOp.Invert => LogicOp.Invert, + LogicalOp.OrReverse => LogicOp.OrReverse, + LogicalOp.CopyInverted => LogicOp.CopyInverted, + LogicalOp.OrInverted => LogicOp.OrInverted, + LogicalOp.Nand => LogicOp.Nand, + LogicalOp.Set => LogicOp.Set, + _ => LogInvalidAndReturn(op, nameof(LogicalOp), LogicOp.Copy), + }; + } + + public static BufferAllocationType Convert(this BufferAccess access) + { + BufferAccess memType = access & BufferAccess.MemoryTypeMask; + + if (memType == BufferAccess.HostMemory || access.HasFlag(BufferAccess.Stream)) + { + return BufferAllocationType.HostMapped; + } + else if (memType == BufferAccess.DeviceMemory) + { + return BufferAllocationType.DeviceLocal; + } + else if (memType == BufferAccess.DeviceMemoryMapped) + { + return BufferAllocationType.DeviceLocalMapped; + } + + return BufferAllocationType.Auto; + } + + private static T2 LogInvalidAndReturn(T1 value, string name, T2 defaultValue = default) + { + Logger.Debug?.Print(LogClass.Gpu, $"Invalid {name} enum value: {value}."); + + return defaultValue; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FeedbackLoopAspects.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FeedbackLoopAspects.cs new file mode 100644 index 000000000..86294ac19 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FeedbackLoopAspects.cs @@ -0,0 +1,12 @@ +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + [Flags] + internal enum FeedbackLoopAspects + { + None = 0, + Color = 1 << 0, + Depth = 1 << 1, + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHelper.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHelper.cs new file mode 100644 index 000000000..271b6569d --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHelper.cs @@ -0,0 +1,30 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class FenceHelper + { + private const ulong DefaultTimeout = 100000000; // 100ms + + public static bool AnySignaled(Vk api, Device device, ReadOnlySpan fences, ulong timeout = 0) + { + return api.WaitForFences(device, (uint)fences.Length, fences, false, timeout) == Result.Success; + } + + public static bool AllSignaled(Vk api, Device device, ReadOnlySpan fences, ulong timeout = 0) + { + return api.WaitForFences(device, (uint)fences.Length, fences, true, timeout) == Result.Success; + } + + public static void WaitAllIndefinitely(Vk api, Device device, ReadOnlySpan fences) + { + Result result; + while ((result = api.WaitForFences(device, (uint)fences.Length, fences, true, DefaultTimeout)) == Result.Timeout) + { + // Keep waiting while the fence is not signaled. + } + result.ThrowOnError(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHolder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHolder.cs new file mode 100644 index 000000000..864fc9e7b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FenceHolder.cs @@ -0,0 +1,159 @@ +using Silk.NET.Vulkan; +using System; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class FenceHolder : IDisposable + { + private readonly Vk _api; + private readonly Device _device; + private Fence _fence; + private int _referenceCount; + private int _lock; + private readonly bool _concurrentWaitUnsupported; + private bool _disposed; + + public unsafe FenceHolder(Vk api, Device device, bool concurrentWaitUnsupported) + { + _api = api; + _device = device; + _concurrentWaitUnsupported = concurrentWaitUnsupported; + + var fenceCreateInfo = new FenceCreateInfo + { + SType = StructureType.FenceCreateInfo, + }; + + api.CreateFence(device, in fenceCreateInfo, null, out _fence).ThrowOnError(); + + _referenceCount = 1; + } + + public Fence GetUnsafe() + { + return _fence; + } + + public bool TryGet(out Fence fence) + { + int lastValue; + do + { + lastValue = _referenceCount; + + if (lastValue == 0) + { + fence = default; + return false; + } + } + while (Interlocked.CompareExchange(ref _referenceCount, lastValue + 1, lastValue) != lastValue); + + if (_concurrentWaitUnsupported) + { + AcquireLock(); + } + + fence = _fence; + return true; + } + + public Fence Get() + { + Interlocked.Increment(ref _referenceCount); + return _fence; + } + + public void PutLock() + { + Put(); + + if (_concurrentWaitUnsupported) + { + ReleaseLock(); + } + } + + public void Put() + { + if (Interlocked.Decrement(ref _referenceCount) == 0) + { + _api.DestroyFence(_device, _fence, Span.Empty); + _fence = default; + } + } + + private void AcquireLock() + { + while (!TryAcquireLock()) + { + Thread.SpinWait(32); + } + } + + private bool TryAcquireLock() + { + return Interlocked.Exchange(ref _lock, 1) == 0; + } + + private void ReleaseLock() + { + Interlocked.Exchange(ref _lock, 0); + } + + public void Wait() + { + if (_concurrentWaitUnsupported) + { + AcquireLock(); + + try + { + FenceHelper.WaitAllIndefinitely(_api, _device, stackalloc Fence[] { _fence }); + } + finally + { + ReleaseLock(); + } + } + else + { + FenceHelper.WaitAllIndefinitely(_api, _device, stackalloc Fence[] { _fence }); + } + } + + public bool IsSignaled() + { + if (_concurrentWaitUnsupported) + { + if (!TryAcquireLock()) + { + return false; + } + + try + { + return FenceHelper.AllSignaled(_api, _device, stackalloc Fence[] { _fence }); + } + finally + { + ReleaseLock(); + } + } + else + { + return FenceHelper.AllSignaled(_api, _device, stackalloc Fence[] { _fence }); + } + } + + public void Dispose() + { + if (!_disposed) + { + Put(); + _disposed = true; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FormatCapabilities.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatCapabilities.cs new file mode 100644 index 000000000..11b7f50c9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatCapabilities.cs @@ -0,0 +1,233 @@ +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using Format = Ryujinx.Graphics.GAL.Format; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class FormatCapabilities + { + private static readonly GAL.Format[] _scaledFormats = { + GAL.Format.R8Uscaled, + GAL.Format.R8Sscaled, + GAL.Format.R16Uscaled, + GAL.Format.R16Sscaled, + GAL.Format.R8G8Uscaled, + GAL.Format.R8G8Sscaled, + GAL.Format.R16G16Uscaled, + GAL.Format.R16G16Sscaled, + GAL.Format.R8G8B8Uscaled, + GAL.Format.R8G8B8Sscaled, + GAL.Format.R16G16B16Uscaled, + GAL.Format.R16G16B16Sscaled, + GAL.Format.R8G8B8A8Uscaled, + GAL.Format.R8G8B8A8Sscaled, + GAL.Format.R16G16B16A16Uscaled, + GAL.Format.R16G16B16A16Sscaled, + GAL.Format.R10G10B10A2Uscaled, + GAL.Format.R10G10B10A2Sscaled, + }; + + private static readonly GAL.Format[] _intFormats = { + GAL.Format.R8Uint, + GAL.Format.R8Sint, + GAL.Format.R16Uint, + GAL.Format.R16Sint, + GAL.Format.R8G8Uint, + GAL.Format.R8G8Sint, + GAL.Format.R16G16Uint, + GAL.Format.R16G16Sint, + GAL.Format.R8G8B8Uint, + GAL.Format.R8G8B8Sint, + GAL.Format.R16G16B16Uint, + GAL.Format.R16G16B16Sint, + GAL.Format.R8G8B8A8Uint, + GAL.Format.R8G8B8A8Sint, + GAL.Format.R16G16B16A16Uint, + GAL.Format.R16G16B16A16Sint, + GAL.Format.R10G10B10A2Uint, + GAL.Format.R10G10B10A2Sint, + }; + + private readonly FormatFeatureFlags[] _bufferTable; + private readonly FormatFeatureFlags[] _optimalTable; + + private readonly Vk _api; + private readonly PhysicalDevice _physicalDevice; + + public FormatCapabilities(Vk api, PhysicalDevice physicalDevice) + { + _api = api; + _physicalDevice = physicalDevice; + + int totalFormats = Enum.GetNames().Length; + + _bufferTable = new FormatFeatureFlags[totalFormats]; + _optimalTable = new FormatFeatureFlags[totalFormats]; + } + + public bool BufferFormatsSupport(FormatFeatureFlags flags, params ReadOnlySpan formats) + { + foreach (Format format in formats) + { + if (!BufferFormatSupports(flags, format)) + { + return false; + } + } + + return true; + } + + public bool OptimalFormatsSupport(FormatFeatureFlags flags, params ReadOnlySpan formats) + { + foreach (Format format in formats) + { + if (!OptimalFormatSupports(flags, format)) + { + return false; + } + } + + return true; + } + + public bool BufferFormatSupports(FormatFeatureFlags flags, Format format) + { + var formatFeatureFlags = _bufferTable[(int)format]; + + if (formatFeatureFlags == 0) + { + _api.GetPhysicalDeviceFormatProperties(_physicalDevice, FormatTable.GetFormat(format), out var fp); + formatFeatureFlags = fp.BufferFeatures; + _bufferTable[(int)format] = formatFeatureFlags; + } + + return (formatFeatureFlags & flags) == flags; + } + + public bool SupportsScaledVertexFormats() + { + // We want to check is all scaled formats are supported, + // but if the integer variant is not supported either, + // then the format is likely not supported at all, + // we ignore formats that are entirely unsupported here. + + for (int i = 0; i < _scaledFormats.Length; i++) + { + if (!BufferFormatSupports(FormatFeatureFlags.VertexBufferBit, _scaledFormats[i]) && + BufferFormatSupports(FormatFeatureFlags.VertexBufferBit, _intFormats[i])) + { + return false; + } + } + + return true; + } + + public bool BufferFormatSupports(FormatFeatureFlags flags, VkFormat format) + { + _api.GetPhysicalDeviceFormatProperties(_physicalDevice, format, out var fp); + + return (fp.BufferFeatures & flags) == flags; + } + + public bool OptimalFormatSupports(FormatFeatureFlags flags, Format format) + { + var formatFeatureFlags = _optimalTable[(int)format]; + + if (formatFeatureFlags == 0) + { + _api.GetPhysicalDeviceFormatProperties(_physicalDevice, FormatTable.GetFormat(format), out var fp); + formatFeatureFlags = fp.OptimalTilingFeatures; + _optimalTable[(int)format] = formatFeatureFlags; + } + + return (formatFeatureFlags & flags) == flags; + } + + public VkFormat ConvertToVkFormat(Format srcFormat, bool storageFeatureFlagRequired) + { + var format = FormatTable.GetFormat(srcFormat); + + var requiredFeatures = FormatFeatureFlags.SampledImageBit | + FormatFeatureFlags.TransferSrcBit | + FormatFeatureFlags.TransferDstBit; + + if (srcFormat.IsDepthOrStencil()) + { + requiredFeatures |= FormatFeatureFlags.DepthStencilAttachmentBit; + } + else if (srcFormat.IsRtColorCompatible()) + { + requiredFeatures |= FormatFeatureFlags.ColorAttachmentBit; + } + + if (srcFormat.IsImageCompatible() && storageFeatureFlagRequired) + { + requiredFeatures |= FormatFeatureFlags.StorageImageBit; + } + + if (!OptimalFormatSupports(requiredFeatures, srcFormat) || (IsD24S8(srcFormat) && VulkanConfiguration.ForceD24S8Unsupported)) + { + // The format is not supported. Can we convert it to a higher precision format? + if (IsD24S8(srcFormat)) + { + format = VkFormat.D32SfloatS8Uint; + } + else if (srcFormat == Format.R4G4B4A4Unorm) + { + format = VkFormat.R4G4B4A4UnormPack16; + } + else + { + Logger.Error?.Print(LogClass.Gpu, $"Format {srcFormat} is not supported by the host."); + } + } + + return format; + } + + public VkFormat ConvertToVertexVkFormat(Format srcFormat) + { + var format = FormatTable.GetFormat(srcFormat); + + if (!BufferFormatSupports(FormatFeatureFlags.VertexBufferBit, srcFormat) || + (IsRGB16IntFloat(srcFormat) && VulkanConfiguration.ForceRGB16IntFloatUnsupported)) + { + // The format is not supported. Can we convert it to an alternative format? + switch (srcFormat) + { + case Format.R16G16B16Float: + format = VkFormat.R16G16B16A16Sfloat; + break; + case Format.R16G16B16Sint: + format = VkFormat.R16G16B16A16Sint; + break; + case Format.R16G16B16Uint: + format = VkFormat.R16G16B16A16Uint; + break; + default: + Logger.Error?.Print(LogClass.Gpu, $"Format {srcFormat} is not supported by the host."); + break; + } + } + + return format; + } + + public static bool IsD24S8(Format format) + { + return format == Format.D24UnormS8Uint || format == Format.S8UintD24Unorm || format == Format.X8UintD24Unorm; + } + + private static bool IsRGB16IntFloat(Format format) + { + return format == Format.R16G16B16Float || + format == Format.R16G16B16Sint || + format == Format.R16G16B16Uint; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FormatConverter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatConverter.cs new file mode 100644 index 000000000..634b8f1b9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatConverter.cs @@ -0,0 +1,49 @@ +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class FormatConverter + { + public static void ConvertD24S8ToD32FS8(Span output, ReadOnlySpan input) + { + const float UnormToFloat = 1f / 0xffffff; + + Span outputUint = MemoryMarshal.Cast(output); + ReadOnlySpan inputUint = MemoryMarshal.Cast(input); + + int i = 0; + + for (; i < inputUint.Length; i++) + { + uint depthStencil = inputUint[i]; + uint depth = depthStencil >> 8; + uint stencil = depthStencil & 0xff; + + int j = i * 2; + + outputUint[j] = (uint)BitConverter.SingleToInt32Bits(depth * UnormToFloat); + outputUint[j + 1] = stencil; + } + } + + public static void ConvertD32FS8ToD24S8(Span output, ReadOnlySpan input) + { + Span outputUint = MemoryMarshal.Cast(output); + ReadOnlySpan inputUint = MemoryMarshal.Cast(input); + + int i = 0; + + for (; i < inputUint.Length; i += 2) + { + float depth = BitConverter.Int32BitsToSingle((int)inputUint[i]); + uint stencil = inputUint[i + 1]; + uint depthStencil = (Math.Clamp((uint)(depth * 0xffffff), 0, 0xffffff) << 8) | (stencil & 0xff); + + int j = i >> 1; + + outputUint[j] = depthStencil; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FormatTable.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatTable.cs new file mode 100644 index 000000000..2b558cf54 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FormatTable.cs @@ -0,0 +1,358 @@ +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class FormatTable + { + private static readonly VkFormat[] _table; + private static readonly Dictionary _reverseMap; + + static FormatTable() + { + _table = new VkFormat[Enum.GetNames().Length]; + _reverseMap = new Dictionary(); + +#pragma warning disable IDE0055 // Disable formatting + Add(Format.R8Unorm, VkFormat.R8Unorm); + Add(Format.R8Snorm, VkFormat.R8SNorm); + Add(Format.R8Uint, VkFormat.R8Uint); + Add(Format.R8Sint, VkFormat.R8Sint); + Add(Format.R16Float, VkFormat.R16Sfloat); + Add(Format.R16Unorm, VkFormat.R16Unorm); + Add(Format.R16Snorm, VkFormat.R16SNorm); + Add(Format.R16Uint, VkFormat.R16Uint); + Add(Format.R16Sint, VkFormat.R16Sint); + Add(Format.R32Float, VkFormat.R32Sfloat); + Add(Format.R32Uint, VkFormat.R32Uint); + Add(Format.R32Sint, VkFormat.R32Sint); + Add(Format.R8G8Unorm, VkFormat.R8G8Unorm); + Add(Format.R8G8Snorm, VkFormat.R8G8SNorm); + Add(Format.R8G8Uint, VkFormat.R8G8Uint); + Add(Format.R8G8Sint, VkFormat.R8G8Sint); + Add(Format.R16G16Float, VkFormat.R16G16Sfloat); + Add(Format.R16G16Unorm, VkFormat.R16G16Unorm); + Add(Format.R16G16Snorm, VkFormat.R16G16SNorm); + Add(Format.R16G16Uint, VkFormat.R16G16Uint); + Add(Format.R16G16Sint, VkFormat.R16G16Sint); + Add(Format.R32G32Float, VkFormat.R32G32Sfloat); + Add(Format.R32G32Uint, VkFormat.R32G32Uint); + Add(Format.R32G32Sint, VkFormat.R32G32Sint); + Add(Format.R8G8B8Unorm, VkFormat.R8G8B8Unorm); + Add(Format.R8G8B8Snorm, VkFormat.R8G8B8SNorm); + Add(Format.R8G8B8Uint, VkFormat.R8G8B8Uint); + Add(Format.R8G8B8Sint, VkFormat.R8G8B8Sint); + Add(Format.R16G16B16Float, VkFormat.R16G16B16Sfloat); + Add(Format.R16G16B16Unorm, VkFormat.R16G16B16Unorm); + Add(Format.R16G16B16Snorm, VkFormat.R16G16B16SNorm); + Add(Format.R16G16B16Uint, VkFormat.R16G16B16Uint); + Add(Format.R16G16B16Sint, VkFormat.R16G16B16Sint); + Add(Format.R32G32B32Float, VkFormat.R32G32B32Sfloat); + Add(Format.R32G32B32Uint, VkFormat.R32G32B32Uint); + Add(Format.R32G32B32Sint, VkFormat.R32G32B32Sint); + Add(Format.R8G8B8A8Unorm, VkFormat.R8G8B8A8Unorm); + Add(Format.R8G8B8A8Snorm, VkFormat.R8G8B8A8SNorm); + Add(Format.R8G8B8A8Uint, VkFormat.R8G8B8A8Uint); + Add(Format.R8G8B8A8Sint, VkFormat.R8G8B8A8Sint); + Add(Format.R16G16B16A16Float, VkFormat.R16G16B16A16Sfloat); + Add(Format.R16G16B16A16Unorm, VkFormat.R16G16B16A16Unorm); + Add(Format.R16G16B16A16Snorm, VkFormat.R16G16B16A16SNorm); + Add(Format.R16G16B16A16Uint, VkFormat.R16G16B16A16Uint); + Add(Format.R16G16B16A16Sint, VkFormat.R16G16B16A16Sint); + Add(Format.R32G32B32A32Float, VkFormat.R32G32B32A32Sfloat); + Add(Format.R32G32B32A32Uint, VkFormat.R32G32B32A32Uint); + Add(Format.R32G32B32A32Sint, VkFormat.R32G32B32A32Sint); + Add(Format.S8Uint, VkFormat.S8Uint); + Add(Format.D16Unorm, VkFormat.D16Unorm); + Add(Format.S8UintD24Unorm, VkFormat.D24UnormS8Uint); + Add(Format.X8UintD24Unorm, VkFormat.X8D24UnormPack32); + Add(Format.D32Float, VkFormat.D32Sfloat); + Add(Format.D24UnormS8Uint, VkFormat.D24UnormS8Uint); + Add(Format.D32FloatS8Uint, VkFormat.D32SfloatS8Uint); + Add(Format.R8G8B8A8Srgb, VkFormat.R8G8B8A8Srgb); + Add(Format.R4G4Unorm, VkFormat.R4G4UnormPack8); + Add(Format.R4G4B4A4Unorm, VkFormat.A4B4G4R4UnormPack16Ext); + Add(Format.R5G5B5X1Unorm, VkFormat.A1R5G5B5UnormPack16); + Add(Format.R5G5B5A1Unorm, VkFormat.A1R5G5B5UnormPack16); + Add(Format.R5G6B5Unorm, VkFormat.R5G6B5UnormPack16); + Add(Format.R10G10B10A2Unorm, VkFormat.A2B10G10R10UnormPack32); + Add(Format.R10G10B10A2Uint, VkFormat.A2B10G10R10UintPack32); + Add(Format.R11G11B10Float, VkFormat.B10G11R11UfloatPack32); + Add(Format.R9G9B9E5Float, VkFormat.E5B9G9R9UfloatPack32); + Add(Format.Bc1RgbaUnorm, VkFormat.BC1RgbaUnormBlock); + Add(Format.Bc2Unorm, VkFormat.BC2UnormBlock); + Add(Format.Bc3Unorm, VkFormat.BC3UnormBlock); + Add(Format.Bc1RgbaSrgb, VkFormat.BC1RgbaSrgbBlock); + Add(Format.Bc2Srgb, VkFormat.BC2SrgbBlock); + Add(Format.Bc3Srgb, VkFormat.BC3SrgbBlock); + Add(Format.Bc4Unorm, VkFormat.BC4UnormBlock); + Add(Format.Bc4Snorm, VkFormat.BC4SNormBlock); + Add(Format.Bc5Unorm, VkFormat.BC5UnormBlock); + Add(Format.Bc5Snorm, VkFormat.BC5SNormBlock); + Add(Format.Bc7Unorm, VkFormat.BC7UnormBlock); + Add(Format.Bc7Srgb, VkFormat.BC7SrgbBlock); + Add(Format.Bc6HSfloat, VkFormat.BC6HSfloatBlock); + Add(Format.Bc6HUfloat, VkFormat.BC6HUfloatBlock); + Add(Format.Etc2RgbUnorm, VkFormat.Etc2R8G8B8UnormBlock); + Add(Format.Etc2RgbaUnorm, VkFormat.Etc2R8G8B8A8UnormBlock); + Add(Format.Etc2RgbPtaUnorm, VkFormat.Etc2R8G8B8A1UnormBlock); + Add(Format.Etc2RgbSrgb, VkFormat.Etc2R8G8B8SrgbBlock); + Add(Format.Etc2RgbaSrgb, VkFormat.Etc2R8G8B8A8SrgbBlock); + Add(Format.Etc2RgbPtaSrgb, VkFormat.Etc2R8G8B8A1SrgbBlock); + Add(Format.R8Uscaled, VkFormat.R8Uscaled); + Add(Format.R8Sscaled, VkFormat.R8Sscaled); + Add(Format.R16Uscaled, VkFormat.R16Uscaled); + Add(Format.R16Sscaled, VkFormat.R16Sscaled); + // Add(Format.R32Uscaled, VkFormat.R32Uscaled); + // Add(Format.R32Sscaled, VkFormat.R32Sscaled); + Add(Format.R8G8Uscaled, VkFormat.R8G8Uscaled); + Add(Format.R8G8Sscaled, VkFormat.R8G8Sscaled); + Add(Format.R16G16Uscaled, VkFormat.R16G16Uscaled); + Add(Format.R16G16Sscaled, VkFormat.R16G16Sscaled); + // Add(Format.R32G32Uscaled, VkFormat.R32G32Uscaled); + // Add(Format.R32G32Sscaled, VkFormat.R32G32Sscaled); + Add(Format.R8G8B8Uscaled, VkFormat.R8G8B8Uscaled); + Add(Format.R8G8B8Sscaled, VkFormat.R8G8B8Sscaled); + Add(Format.R16G16B16Uscaled, VkFormat.R16G16B16Uscaled); + Add(Format.R16G16B16Sscaled, VkFormat.R16G16B16Sscaled); + // Add(Format.R32G32B32Uscaled, VkFormat.R32G32B32Uscaled); + // Add(Format.R32G32B32Sscaled, VkFormat.R32G32B32Sscaled); + Add(Format.R8G8B8A8Uscaled, VkFormat.R8G8B8A8Uscaled); + Add(Format.R8G8B8A8Sscaled, VkFormat.R8G8B8A8Sscaled); + Add(Format.R16G16B16A16Uscaled, VkFormat.R16G16B16A16Uscaled); + Add(Format.R16G16B16A16Sscaled, VkFormat.R16G16B16A16Sscaled); + // Add(Format.R32G32B32A32Uscaled, VkFormat.R32G32B32A32Uscaled); + // Add(Format.R32G32B32A32Sscaled, VkFormat.R32G32B32A32Sscaled); + Add(Format.R10G10B10A2Snorm, VkFormat.A2B10G10R10SNormPack32); + Add(Format.R10G10B10A2Sint, VkFormat.A2B10G10R10SintPack32); + Add(Format.R10G10B10A2Uscaled, VkFormat.A2B10G10R10UscaledPack32); + Add(Format.R10G10B10A2Sscaled, VkFormat.A2B10G10R10SscaledPack32); + Add(Format.Astc4x4Unorm, VkFormat.Astc4x4UnormBlock); + Add(Format.Astc5x4Unorm, VkFormat.Astc5x4UnormBlock); + Add(Format.Astc5x5Unorm, VkFormat.Astc5x5UnormBlock); + Add(Format.Astc6x5Unorm, VkFormat.Astc6x5UnormBlock); + Add(Format.Astc6x6Unorm, VkFormat.Astc6x6UnormBlock); + Add(Format.Astc8x5Unorm, VkFormat.Astc8x5UnormBlock); + Add(Format.Astc8x6Unorm, VkFormat.Astc8x6UnormBlock); + Add(Format.Astc8x8Unorm, VkFormat.Astc8x8UnormBlock); + Add(Format.Astc10x5Unorm, VkFormat.Astc10x5UnormBlock); + Add(Format.Astc10x6Unorm, VkFormat.Astc10x6UnormBlock); + Add(Format.Astc10x8Unorm, VkFormat.Astc10x8UnormBlock); + Add(Format.Astc10x10Unorm, VkFormat.Astc10x10UnormBlock); + Add(Format.Astc12x10Unorm, VkFormat.Astc12x10UnormBlock); + Add(Format.Astc12x12Unorm, VkFormat.Astc12x12UnormBlock); + Add(Format.Astc4x4Srgb, VkFormat.Astc4x4SrgbBlock); + Add(Format.Astc5x4Srgb, VkFormat.Astc5x4SrgbBlock); + Add(Format.Astc5x5Srgb, VkFormat.Astc5x5SrgbBlock); + Add(Format.Astc6x5Srgb, VkFormat.Astc6x5SrgbBlock); + Add(Format.Astc6x6Srgb, VkFormat.Astc6x6SrgbBlock); + Add(Format.Astc8x5Srgb, VkFormat.Astc8x5SrgbBlock); + Add(Format.Astc8x6Srgb, VkFormat.Astc8x6SrgbBlock); + Add(Format.Astc8x8Srgb, VkFormat.Astc8x8SrgbBlock); + Add(Format.Astc10x5Srgb, VkFormat.Astc10x5SrgbBlock); + Add(Format.Astc10x6Srgb, VkFormat.Astc10x6SrgbBlock); + Add(Format.Astc10x8Srgb, VkFormat.Astc10x8SrgbBlock); + Add(Format.Astc10x10Srgb, VkFormat.Astc10x10SrgbBlock); + Add(Format.Astc12x10Srgb, VkFormat.Astc12x10SrgbBlock); + Add(Format.Astc12x12Srgb, VkFormat.Astc12x12SrgbBlock); + Add(Format.B5G6R5Unorm, VkFormat.R5G6B5UnormPack16); + Add(Format.B5G5R5A1Unorm, VkFormat.A1R5G5B5UnormPack16); + Add(Format.A1B5G5R5Unorm, VkFormat.R5G5B5A1UnormPack16); + Add(Format.B8G8R8A8Unorm, VkFormat.B8G8R8A8Unorm); + Add(Format.B8G8R8A8Srgb, VkFormat.B8G8R8A8Srgb); + Add(Format.B10G10R10A2Unorm, VkFormat.A2R10G10B10UnormPack32); +#pragma warning restore IDE0055 + } + + private static void Add(Format format, VkFormat vkFormat) + { + _table[(int)format] = vkFormat; + _reverseMap[vkFormat] = format; + } + + public static VkFormat GetFormat(Format format) + { + return _table[(int)format]; + } + + public static Format GetFormat(VkFormat format) + { + if (!_reverseMap.TryGetValue(format, out Format result)) + { + return Format.B8G8R8A8Unorm; + } + + return result; + } + + public static Format ConvertRgba8SrgbToUnorm(Format format) + { + return format switch + { + Format.R8G8B8A8Srgb => Format.R8G8B8A8Unorm, + Format.B8G8R8A8Srgb => Format.B8G8R8A8Unorm, + _ => format, + }; + } + + public static int GetAttributeFormatSize(VkFormat format) + { + switch (format) + { + case VkFormat.R8Unorm: + case VkFormat.R8SNorm: + case VkFormat.R8Uint: + case VkFormat.R8Sint: + case VkFormat.R8Uscaled: + case VkFormat.R8Sscaled: + return 1; + + case VkFormat.R8G8Unorm: + case VkFormat.R8G8SNorm: + case VkFormat.R8G8Uint: + case VkFormat.R8G8Sint: + case VkFormat.R8G8Uscaled: + case VkFormat.R8G8Sscaled: + case VkFormat.R16Sfloat: + case VkFormat.R16Unorm: + case VkFormat.R16SNorm: + case VkFormat.R16Uint: + case VkFormat.R16Sint: + case VkFormat.R16Uscaled: + case VkFormat.R16Sscaled: + return 2; + + case VkFormat.R8G8B8Unorm: + case VkFormat.R8G8B8SNorm: + case VkFormat.R8G8B8Uint: + case VkFormat.R8G8B8Sint: + case VkFormat.R8G8B8Uscaled: + case VkFormat.R8G8B8Sscaled: + return 3; + + case VkFormat.R8G8B8A8Unorm: + case VkFormat.R8G8B8A8SNorm: + case VkFormat.R8G8B8A8Uint: + case VkFormat.R8G8B8A8Sint: + case VkFormat.R8G8B8A8Srgb: + case VkFormat.R8G8B8A8Uscaled: + case VkFormat.R8G8B8A8Sscaled: + case VkFormat.B8G8R8A8Unorm: + case VkFormat.B8G8R8A8Srgb: + case VkFormat.R16G16Sfloat: + case VkFormat.R16G16Unorm: + case VkFormat.R16G16SNorm: + case VkFormat.R16G16Uint: + case VkFormat.R16G16Sint: + case VkFormat.R16G16Uscaled: + case VkFormat.R16G16Sscaled: + case VkFormat.R32Sfloat: + case VkFormat.R32Uint: + case VkFormat.R32Sint: + case VkFormat.A2B10G10R10UnormPack32: + case VkFormat.A2B10G10R10UintPack32: + case VkFormat.B10G11R11UfloatPack32: + case VkFormat.E5B9G9R9UfloatPack32: + case VkFormat.A2B10G10R10SNormPack32: + case VkFormat.A2B10G10R10SintPack32: + case VkFormat.A2B10G10R10UscaledPack32: + case VkFormat.A2B10G10R10SscaledPack32: + return 4; + + case VkFormat.R16G16B16Sfloat: + case VkFormat.R16G16B16Unorm: + case VkFormat.R16G16B16SNorm: + case VkFormat.R16G16B16Uint: + case VkFormat.R16G16B16Sint: + case VkFormat.R16G16B16Uscaled: + case VkFormat.R16G16B16Sscaled: + return 6; + + case VkFormat.R16G16B16A16Sfloat: + case VkFormat.R16G16B16A16Unorm: + case VkFormat.R16G16B16A16SNorm: + case VkFormat.R16G16B16A16Uint: + case VkFormat.R16G16B16A16Sint: + case VkFormat.R16G16B16A16Uscaled: + case VkFormat.R16G16B16A16Sscaled: + case VkFormat.R32G32Sfloat: + case VkFormat.R32G32Uint: + case VkFormat.R32G32Sint: + return 8; + + case VkFormat.R32G32B32Sfloat: + case VkFormat.R32G32B32Uint: + case VkFormat.R32G32B32Sint: + return 12; + + case VkFormat.R32G32B32A32Sfloat: + case VkFormat.R32G32B32A32Uint: + case VkFormat.R32G32B32A32Sint: + return 16; + } + + return 1; + } + + public static VkFormat DropLastComponent(VkFormat format) + { + return format switch + { + VkFormat.R8G8Unorm => VkFormat.R8Unorm, + VkFormat.R8G8SNorm => VkFormat.R8SNorm, + VkFormat.R8G8Uint => VkFormat.R8Uint, + VkFormat.R8G8Sint => VkFormat.R8Sint, + VkFormat.R8G8Uscaled => VkFormat.R8Uscaled, + VkFormat.R8G8Sscaled => VkFormat.R8Sscaled, + VkFormat.R8G8B8Unorm => VkFormat.R8G8Unorm, + VkFormat.R8G8B8SNorm => VkFormat.R8G8SNorm, + VkFormat.R8G8B8Uint => VkFormat.R8G8Uint, + VkFormat.R8G8B8Sint => VkFormat.R8G8Sint, + VkFormat.R8G8B8Uscaled => VkFormat.R8G8Uscaled, + VkFormat.R8G8B8Sscaled => VkFormat.R8G8Sscaled, + VkFormat.R8G8B8A8Unorm => VkFormat.R8G8B8Unorm, + VkFormat.R8G8B8A8SNorm => VkFormat.R8G8B8SNorm, + VkFormat.R8G8B8A8Uint => VkFormat.R8G8B8Uint, + VkFormat.R8G8B8A8Sint => VkFormat.R8G8B8Sint, + VkFormat.R8G8B8A8Srgb => VkFormat.R8G8B8Srgb, + VkFormat.R8G8B8A8Uscaled => VkFormat.R8G8B8Uscaled, + VkFormat.R8G8B8A8Sscaled => VkFormat.R8G8B8Sscaled, + VkFormat.B8G8R8A8Unorm => VkFormat.B8G8R8Unorm, + VkFormat.B8G8R8A8Srgb => VkFormat.B8G8R8Srgb, + VkFormat.R16G16Sfloat => VkFormat.R16Sfloat, + VkFormat.R16G16Unorm => VkFormat.R16Unorm, + VkFormat.R16G16SNorm => VkFormat.R16SNorm, + VkFormat.R16G16Uint => VkFormat.R16Uint, + VkFormat.R16G16Sint => VkFormat.R16Sint, + VkFormat.R16G16Uscaled => VkFormat.R16Uscaled, + VkFormat.R16G16Sscaled => VkFormat.R16Sscaled, + VkFormat.R16G16B16Sfloat => VkFormat.R16G16Sfloat, + VkFormat.R16G16B16Unorm => VkFormat.R16G16Unorm, + VkFormat.R16G16B16SNorm => VkFormat.R16G16SNorm, + VkFormat.R16G16B16Uint => VkFormat.R16G16Uint, + VkFormat.R16G16B16Sint => VkFormat.R16G16Sint, + VkFormat.R16G16B16Uscaled => VkFormat.R16G16Uscaled, + VkFormat.R16G16B16Sscaled => VkFormat.R16G16Sscaled, + VkFormat.R16G16B16A16Sfloat => VkFormat.R16G16B16Sfloat, + VkFormat.R16G16B16A16Unorm => VkFormat.R16G16B16Unorm, + VkFormat.R16G16B16A16SNorm => VkFormat.R16G16B16SNorm, + VkFormat.R16G16B16A16Uint => VkFormat.R16G16B16Uint, + VkFormat.R16G16B16A16Sint => VkFormat.R16G16B16Sint, + VkFormat.R16G16B16A16Uscaled => VkFormat.R16G16B16Uscaled, + VkFormat.R16G16B16A16Sscaled => VkFormat.R16G16B16Sscaled, + VkFormat.R32G32Sfloat => VkFormat.R32Sfloat, + VkFormat.R32G32Uint => VkFormat.R32Uint, + VkFormat.R32G32Sint => VkFormat.R32Sint, + VkFormat.R32G32B32Sfloat => VkFormat.R32G32Sfloat, + VkFormat.R32G32B32Uint => VkFormat.R32G32Uint, + VkFormat.R32G32B32Sint => VkFormat.R32G32Sint, + VkFormat.R32G32B32A32Sfloat => VkFormat.R32G32B32Sfloat, + VkFormat.R32G32B32A32Uint => VkFormat.R32G32B32Uint, + VkFormat.R32G32B32A32Sint => VkFormat.R32G32B32Sint, + _ => format, + }; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/FramebufferParams.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/FramebufferParams.cs new file mode 100644 index 000000000..a227e8360 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/FramebufferParams.cs @@ -0,0 +1,344 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Linq; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class FramebufferParams + { + private readonly Device _device; + private readonly Auto[] _attachments; + private readonly TextureView[] _colors; + private readonly TextureView _depthStencil; + private readonly TextureView[] _colorsCanonical; + private readonly TextureView _baseAttachment; + private readonly uint _validColorAttachments; + + public uint Width { get; } + public uint Height { get; } + public uint Layers { get; } + + public uint[] AttachmentSamples { get; } + public VkFormat[] AttachmentFormats { get; } + public int[] AttachmentIndices { get; } + public uint AttachmentIntegerFormatMask { get; } + public bool LogicOpsAllowed { get; } + + public int AttachmentsCount { get; } + public int MaxColorAttachmentIndex => AttachmentIndices.Length > 0 ? AttachmentIndices[^1] : -1; + public bool HasDepthStencil { get; } + public int ColorAttachmentsCount => AttachmentsCount - (HasDepthStencil ? 1 : 0); + + public FramebufferParams(Device device, TextureView view, uint width, uint height) + { + var format = view.Info.Format; + + bool isDepthStencil = format.IsDepthOrStencil(); + + _device = device; + _attachments = new[] { view.GetImageViewForAttachment() }; + _validColorAttachments = isDepthStencil ? 0u : 1u; + _baseAttachment = view; + + if (isDepthStencil) + { + _depthStencil = view; + } + else + { + _colors = new TextureView[] { view }; + _colorsCanonical = _colors; + } + + Width = width; + Height = height; + Layers = 1; + + AttachmentSamples = new[] { (uint)view.Info.Samples }; + AttachmentFormats = new[] { view.VkFormat }; + AttachmentIndices = isDepthStencil ? Array.Empty() : new[] { 0 }; + AttachmentIntegerFormatMask = format.IsInteger() ? 1u : 0u; + LogicOpsAllowed = !format.IsFloatOrSrgb(); + + AttachmentsCount = 1; + + HasDepthStencil = isDepthStencil; + } + + public FramebufferParams(Device device, ITexture[] colors, ITexture depthStencil) + { + _device = device; + + int colorsCount = colors.Count(IsValidTextureView); + + int count = colorsCount + (IsValidTextureView(depthStencil) ? 1 : 0); + + _attachments = new Auto[count]; + _colors = new TextureView[colorsCount]; + _colorsCanonical = colors.Select(color => color is TextureView view && view.Valid ? view : null).ToArray(); + + AttachmentSamples = new uint[count]; + AttachmentFormats = new VkFormat[count]; + AttachmentIndices = new int[colorsCount]; + + uint width = uint.MaxValue; + uint height = uint.MaxValue; + uint layers = uint.MaxValue; + + int index = 0; + int bindIndex = 0; + uint attachmentIntegerFormatMask = 0; + bool allFormatsFloatOrSrgb = colorsCount != 0; + + foreach (ITexture color in colors) + { + if (IsValidTextureView(color)) + { + var texture = (TextureView)color; + + _attachments[index] = texture.GetImageViewForAttachment(); + _colors[index] = texture; + _validColorAttachments |= 1u << bindIndex; + _baseAttachment = texture; + + AttachmentSamples[index] = (uint)texture.Info.Samples; + AttachmentFormats[index] = texture.VkFormat; + AttachmentIndices[index] = bindIndex; + + var format = texture.Info.Format; + + if (format.IsInteger()) + { + attachmentIntegerFormatMask |= 1u << bindIndex; + } + + allFormatsFloatOrSrgb &= format.IsFloatOrSrgb(); + + width = Math.Min(width, (uint)texture.Width); + height = Math.Min(height, (uint)texture.Height); + layers = Math.Min(layers, (uint)texture.Layers); + + if (++index >= colorsCount) + { + break; + } + } + + bindIndex++; + } + + AttachmentIntegerFormatMask = attachmentIntegerFormatMask; + LogicOpsAllowed = !allFormatsFloatOrSrgb; + + if (depthStencil is TextureView dsTexture && dsTexture.Valid) + { + _attachments[count - 1] = dsTexture.GetImageViewForAttachment(); + _depthStencil = dsTexture; + _baseAttachment ??= dsTexture; + + AttachmentSamples[count - 1] = (uint)dsTexture.Info.Samples; + AttachmentFormats[count - 1] = dsTexture.VkFormat; + + width = Math.Min(width, (uint)dsTexture.Width); + height = Math.Min(height, (uint)dsTexture.Height); + layers = Math.Min(layers, (uint)dsTexture.Layers); + + HasDepthStencil = true; + } + + if (count == 0) + { + width = height = layers = 1; + } + + Width = width; + Height = height; + Layers = layers; + + AttachmentsCount = count; + } + + public Auto GetAttachment(int index) + { + if ((uint)index >= _attachments.Length) + { + return null; + } + + return _attachments[index]; + } + + public Auto GetDepthStencilAttachment() + { + if (!HasDepthStencil) + { + return null; + } + + return _attachments[AttachmentsCount - 1]; + } + + public ComponentType GetAttachmentComponentType(int index) + { + if (_colors != null && (uint)index < _colors.Length) + { + var format = _colors[index].Info.Format; + + if (format.IsSint()) + { + return ComponentType.SignedInteger; + } + + if (format.IsUint()) + { + return ComponentType.UnsignedInteger; + } + } + + return ComponentType.Float; + } + + public ImageAspectFlags GetDepthStencilAspectFlags() + { + if (_depthStencil == null) + { + return ImageAspectFlags.None; + } + + return _depthStencil.Info.Format.ConvertAspectFlags(); + } + + public bool IsValidColorAttachment(int bindIndex) + { + return (uint)bindIndex < Constants.MaxRenderTargets && (_validColorAttachments & (1u << bindIndex)) != 0; + } + + private static bool IsValidTextureView(ITexture texture) + { + return texture is TextureView view && view.Valid; + } + + public ClearRect GetClearRect(Rectangle scissor, int layer, int layerCount) + { + int x = scissor.X; + int y = scissor.Y; + int width = Math.Min((int)Width - scissor.X, scissor.Width); + int height = Math.Min((int)Height - scissor.Y, scissor.Height); + + return new ClearRect(new Rect2D(new Offset2D(x, y), new Extent2D((uint)width, (uint)height)), (uint)layer, (uint)layerCount); + } + + public unsafe Auto Create(Vk api, CommandBufferScoped cbs, Auto renderPass) + { + ImageView* attachments = stackalloc ImageView[_attachments.Length]; + + for (int i = 0; i < _attachments.Length; i++) + { + attachments[i] = _attachments[i].Get(cbs).Value; + } + + var framebufferCreateInfo = new FramebufferCreateInfo + { + SType = StructureType.FramebufferCreateInfo, + RenderPass = renderPass.Get(cbs).Value, + AttachmentCount = (uint)_attachments.Length, + PAttachments = attachments, + Width = Width, + Height = Height, + Layers = Layers, + }; + + api.CreateFramebuffer(_device, in framebufferCreateInfo, null, out var framebuffer).ThrowOnError(); + return new Auto(new DisposableFramebuffer(api, _device, framebuffer), null, _attachments); + } + + public TextureView[] GetAttachmentViews() + { + var result = new TextureView[_attachments.Length]; + + _colors?.CopyTo(result, 0); + + if (_depthStencil != null) + { + result[^1] = _depthStencil; + } + + return result; + } + + public RenderPassCacheKey GetRenderPassCacheKey() + { + return new RenderPassCacheKey(_depthStencil, _colorsCanonical); + } + + public void InsertLoadOpBarriers(VulkanRenderer gd, CommandBufferScoped cbs) + { + if (_colors != null) + { + foreach (var color in _colors) + { + // If Clear or DontCare were used, this would need to be write bit. + color.Storage?.QueueLoadOpBarrier(cbs, false); + } + } + + _depthStencil?.Storage?.QueueLoadOpBarrier(cbs, true); + + gd.Barriers.Flush(cbs, false, null, null); + } + + public void AddStoreOpUsage() + { + if (_colors != null) + { + foreach (var color in _colors) + { + color.Storage?.AddStoreOpUsage(false); + } + } + + _depthStencil?.Storage?.AddStoreOpUsage(true); + } + + public void ClearBindings() + { + _depthStencil?.Storage.ClearBindings(); + + for (int i = 0; i < _colorsCanonical.Length; i++) + { + _colorsCanonical[i]?.Storage.ClearBindings(); + } + } + + public void AddBindings() + { + _depthStencil?.Storage.AddBinding(_depthStencil); + + for (int i = 0; i < _colorsCanonical.Length; i++) + { + TextureView color = _colorsCanonical[i]; + color?.Storage.AddBinding(color); + } + } + + public (RenderPassHolder rpHolder, Auto framebuffer) GetPassAndFramebuffer( + VulkanRenderer gd, + Device device, + CommandBufferScoped cbs) + { + return _baseAttachment.GetPassAndFramebuffer(gd, device, cbs, this); + } + + public TextureView GetColorView(int index) + { + return _colorsCanonical[index]; + } + + public TextureView GetDepthStencilView() + { + return _depthStencil; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/HardwareCapabilities.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/HardwareCapabilities.cs new file mode 100644 index 000000000..ee77ccde7 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/HardwareCapabilities.cs @@ -0,0 +1,138 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + [Flags] + enum PortabilitySubsetFlags + { + None = 0, + + NoTriangleFans = 1, + NoPointMode = 1 << 1, + No3DImageView = 1 << 2, + NoLodBias = 1 << 3, + } + + readonly struct HardwareCapabilities + { + public readonly bool SupportsIndexTypeUint8; + public readonly bool SupportsCustomBorderColor; + public readonly bool SupportsBlendEquationAdvanced; + public readonly bool SupportsBlendEquationAdvancedCorrelatedOverlap; + public readonly bool SupportsBlendEquationAdvancedNonPreMultipliedSrcColor; + public readonly bool SupportsBlendEquationAdvancedNonPreMultipliedDstColor; + public readonly bool SupportsIndirectParameters; + public readonly bool SupportsFragmentShaderInterlock; + public readonly bool SupportsGeometryShaderPassthrough; + public readonly bool SupportsShaderFloat64; + public readonly bool SupportsShaderInt8; + public readonly bool SupportsShaderStencilExport; + public readonly bool SupportsShaderStorageImageMultisample; + public readonly bool SupportsConditionalRendering; + public readonly bool SupportsExtendedDynamicState; + public readonly bool SupportsMultiView; + public readonly bool SupportsNullDescriptors; + public readonly bool SupportsPushDescriptors; + public readonly uint MaxPushDescriptors; + public readonly bool SupportsPrimitiveTopologyListRestart; + public readonly bool SupportsPrimitiveTopologyPatchListRestart; + public readonly bool SupportsTransformFeedback; + public readonly bool SupportsTransformFeedbackQueries; + public readonly bool SupportsPreciseOcclusionQueries; + public readonly bool SupportsPipelineStatisticsQuery; + public readonly bool SupportsGeometryShader; + public readonly bool SupportsTessellationShader; + public readonly bool SupportsViewportArray2; + public readonly bool SupportsHostImportedMemory; + public readonly bool SupportsDepthClipControl; + public readonly bool SupportsAttachmentFeedbackLoop; + public readonly bool SupportsDynamicAttachmentFeedbackLoop; + public readonly uint SubgroupSize; + public readonly SampleCountFlags SupportedSampleCounts; + public readonly PortabilitySubsetFlags PortabilitySubset; + public readonly uint VertexBufferAlignment; + public readonly uint SubTexelPrecisionBits; + public readonly ulong MinResourceAlignment; + + public HardwareCapabilities( + bool supportsIndexTypeUint8, + bool supportsCustomBorderColor, + bool supportsBlendEquationAdvanced, + bool supportsBlendEquationAdvancedCorrelatedOverlap, + bool supportsBlendEquationAdvancedNonPreMultipliedSrcColor, + bool supportsBlendEquationAdvancedNonPreMultipliedDstColor, + bool supportsIndirectParameters, + bool supportsFragmentShaderInterlock, + bool supportsGeometryShaderPassthrough, + bool supportsShaderFloat64, + bool supportsShaderInt8, + bool supportsShaderStencilExport, + bool supportsShaderStorageImageMultisample, + bool supportsConditionalRendering, + bool supportsExtendedDynamicState, + bool supportsMultiView, + bool supportsNullDescriptors, + bool supportsPushDescriptors, + uint maxPushDescriptors, + bool supportsPrimitiveTopologyListRestart, + bool supportsPrimitiveTopologyPatchListRestart, + bool supportsTransformFeedback, + bool supportsTransformFeedbackQueries, + bool supportsPreciseOcclusionQueries, + bool supportsPipelineStatisticsQuery, + bool supportsGeometryShader, + bool supportsTessellationShader, + bool supportsViewportArray2, + bool supportsHostImportedMemory, + bool supportsDepthClipControl, + bool supportsAttachmentFeedbackLoop, + bool supportsDynamicAttachmentFeedbackLoop, + uint subgroupSize, + SampleCountFlags supportedSampleCounts, + PortabilitySubsetFlags portabilitySubset, + uint vertexBufferAlignment, + uint subTexelPrecisionBits, + ulong minResourceAlignment) + { + SupportsIndexTypeUint8 = supportsIndexTypeUint8; + SupportsCustomBorderColor = supportsCustomBorderColor; + SupportsBlendEquationAdvanced = supportsBlendEquationAdvanced; + SupportsBlendEquationAdvancedCorrelatedOverlap = supportsBlendEquationAdvancedCorrelatedOverlap; + SupportsBlendEquationAdvancedNonPreMultipliedSrcColor = supportsBlendEquationAdvancedNonPreMultipliedSrcColor; + SupportsBlendEquationAdvancedNonPreMultipliedDstColor = supportsBlendEquationAdvancedNonPreMultipliedDstColor; + SupportsIndirectParameters = supportsIndirectParameters; + SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; + SupportsGeometryShaderPassthrough = supportsGeometryShaderPassthrough; + SupportsShaderFloat64 = supportsShaderFloat64; + SupportsShaderInt8 = supportsShaderInt8; + SupportsShaderStencilExport = supportsShaderStencilExport; + SupportsShaderStorageImageMultisample = supportsShaderStorageImageMultisample; + SupportsConditionalRendering = supportsConditionalRendering; + SupportsExtendedDynamicState = supportsExtendedDynamicState; + SupportsMultiView = supportsMultiView; + SupportsNullDescriptors = supportsNullDescriptors; + SupportsPushDescriptors = supportsPushDescriptors; + MaxPushDescriptors = maxPushDescriptors; + SupportsPrimitiveTopologyListRestart = supportsPrimitiveTopologyListRestart; + SupportsPrimitiveTopologyPatchListRestart = supportsPrimitiveTopologyPatchListRestart; + SupportsTransformFeedback = supportsTransformFeedback; + SupportsTransformFeedbackQueries = supportsTransformFeedbackQueries; + SupportsPreciseOcclusionQueries = supportsPreciseOcclusionQueries; + SupportsPipelineStatisticsQuery = supportsPipelineStatisticsQuery; + SupportsGeometryShader = supportsGeometryShader; + SupportsTessellationShader = supportsTessellationShader; + SupportsViewportArray2 = supportsViewportArray2; + SupportsHostImportedMemory = supportsHostImportedMemory; + SupportsDepthClipControl = supportsDepthClipControl; + SupportsAttachmentFeedbackLoop = supportsAttachmentFeedbackLoop; + SupportsDynamicAttachmentFeedbackLoop = supportsDynamicAttachmentFeedbackLoop; + SubgroupSize = subgroupSize; + SupportedSampleCounts = supportedSampleCounts; + PortabilitySubset = portabilitySubset; + VertexBufferAlignment = vertexBufferAlignment; + SubTexelPrecisionBits = subTexelPrecisionBits; + MinResourceAlignment = minResourceAlignment; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/HashTableSlim.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/HashTableSlim.cs new file mode 100644 index 000000000..7731ac893 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/HashTableSlim.cs @@ -0,0 +1,143 @@ +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + interface IRefEquatable + { + bool Equals(ref T other); + } + + class HashTableSlim where TKey : IRefEquatable + { + private const int TotalBuckets = 16; // Must be power of 2 + private const int TotalBucketsMask = TotalBuckets - 1; + + private struct Entry + { + public int Hash; + public TKey Key; + public TValue Value; + } + + private struct Bucket + { + public int Length; + public Entry[] Entries; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Span AsSpan() + { + return Entries == null ? Span.Empty : Entries.AsSpan(0, Length); + } + } + + private readonly Bucket[] _hashTable = new Bucket[TotalBuckets]; + + public IEnumerable Keys + { + get + { + foreach (Bucket bucket in _hashTable) + { + for (int i = 0; i < bucket.Length; i++) + { + yield return bucket.Entries[i].Key; + } + } + } + } + + public IEnumerable Values + { + get + { + foreach (Bucket bucket in _hashTable) + { + for (int i = 0; i < bucket.Length; i++) + { + yield return bucket.Entries[i].Value; + } + } + } + } + + public void Add(ref TKey key, TValue value) + { + var entry = new Entry + { + Hash = key.GetHashCode(), + Key = key, + Value = value, + }; + + int hashCode = key.GetHashCode(); + int bucketIndex = hashCode & TotalBucketsMask; + + ref var bucket = ref _hashTable[bucketIndex]; + if (bucket.Entries != null) + { + int index = bucket.Length; + + if (index >= bucket.Entries.Length) + { + Array.Resize(ref bucket.Entries, index + 1); + } + + bucket.Entries[index] = entry; + } + else + { + bucket.Entries = new[] + { + entry, + }; + } + + bucket.Length++; + } + + public bool Remove(ref TKey key) + { + int hashCode = key.GetHashCode(); + + ref var bucket = ref _hashTable[hashCode & TotalBucketsMask]; + var entries = bucket.AsSpan(); + for (int i = 0; i < entries.Length; i++) + { + ref var entry = ref entries[i]; + + if (entry.Hash == hashCode && entry.Key.Equals(ref key)) + { + entries[(i + 1)..].CopyTo(entries[i..]); + bucket.Length--; + + return true; + } + } + + return false; + } + + public bool TryGetValue(ref TKey key, out TValue value) + { + int hashCode = key.GetHashCode(); + + var entries = _hashTable[hashCode & TotalBucketsMask].AsSpan(); + for (int i = 0; i < entries.Length; i++) + { + ref var entry = ref entries[i]; + + if (entry.Hash == hashCode && entry.Key.Equals(ref key)) + { + value = entry.Value; + return true; + } + } + + value = default; + return false; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/HelperShader.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/HelperShader.cs new file mode 100644 index 000000000..730e081af --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/HelperShader.cs @@ -0,0 +1,1740 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Numerics; +using CompareOp = Ryujinx.Graphics.GAL.CompareOp; +using Format = Ryujinx.Graphics.GAL.Format; +using PrimitiveTopology = Ryujinx.Graphics.GAL.PrimitiveTopology; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; +using StencilOp = Ryujinx.Graphics.GAL.StencilOp; +using Viewport = Ryujinx.Graphics.GAL.Viewport; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + enum ComponentType + { + Float, + SignedInteger, + UnsignedInteger, + } + + class HelperShader : IDisposable + { + private const int UniformBufferAlignment = 256; + private const int ConvertElementsPerWorkgroup = 32 * 100; // Work group size of 32 times 100 elements. + private const string ShaderBinariesPath = "Ryujinx.Graphics.Rdna3Vulkan/Shaders/SpirvBinaries"; + + private readonly PipelineHelperShader _pipeline; + private readonly ISampler _samplerLinear; + private readonly ISampler _samplerNearest; + private readonly IProgram _programColorBlit; + private readonly IProgram _programColorBlitMs; + private readonly IProgram _programColorBlitClearAlpha; + private readonly IProgram _programColorClearF; + private readonly IProgram _programColorClearSI; + private readonly IProgram _programColorClearUI; + private readonly IProgram _programDepthStencilClear; + private readonly IProgram _programStrideChange; + private readonly IProgram _programConvertD32S8ToD24S8; + private readonly IProgram _programConvertIndexBuffer; + private readonly IProgram _programConvertIndirectData; + private readonly IProgram _programColorCopyShortening; + private readonly IProgram _programColorCopyToNonMs; + private readonly IProgram _programColorCopyWidening; + private readonly IProgram _programColorDrawToMs; + private readonly IProgram _programDepthBlit; + private readonly IProgram _programDepthBlitMs; + private readonly IProgram _programDepthDrawToMs; + private readonly IProgram _programDepthDrawToNonMs; + private readonly IProgram _programStencilBlit; + private readonly IProgram _programStencilBlitMs; + private readonly IProgram _programStencilDrawToMs; + private readonly IProgram _programStencilDrawToNonMs; + + public HelperShader(VulkanRenderer gd, Device device) + { + _pipeline = new PipelineHelperShader(gd, device); + _pipeline.Initialize(); + + _samplerLinear = gd.CreateSampler(SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + _samplerNearest = gd.CreateSampler(SamplerCreateInfo.Create(MinFilter.Nearest, MagFilter.Nearest)); + + var blitResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Vertex, ResourceType.UniformBuffer, 1) + .Add(ResourceStages.Fragment, ResourceType.TextureAndSampler, 0).Build(); + + _programColorBlit = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorBlitFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programColorBlitMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorBlitMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programColorBlitClearAlpha = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorBlitClearAlphaFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + var colorClearResourceLayout = new ResourceLayoutBuilder().Add(ResourceStages.Vertex, ResourceType.UniformBuffer, 1).Build(); + + _programColorClearF = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorClearVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorClearFFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorClearResourceLayout); + + _programColorClearSI = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorClearVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorClearSIFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorClearResourceLayout); + + _programColorClearUI = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorClearVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorClearUIFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorClearResourceLayout); + + _programDepthStencilClear = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorClearVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("DepthStencilClearFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorClearResourceLayout); + + var strideChangeResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 1) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 2, true).Build(); + + _programStrideChange = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ChangeBufferStride.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, strideChangeResourceLayout); + + var colorCopyResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Compute, ResourceType.TextureAndSampler, 0) + .Add(ResourceStages.Compute, ResourceType.Image, 0, true).Build(); + + _programColorCopyShortening = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorCopyShorteningCompute.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, colorCopyResourceLayout); + + _programColorCopyToNonMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorCopyToNonMsCompute.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, colorCopyResourceLayout); + + _programColorCopyWidening = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorCopyWideningCompute.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, colorCopyResourceLayout); + + var colorDrawToMsResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Fragment, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Fragment, ResourceType.TextureAndSampler, 0).Build(); + + _programColorDrawToMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorDrawToMsVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("ColorDrawToMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorDrawToMsResourceLayout); + + var convertD32S8ToD24S8ResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 1) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 2, true).Build(); + + _programConvertD32S8ToD24S8 = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ConvertD32S8ToD24S8.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, convertD32S8ToD24S8ResourceLayout); + + var convertIndexBufferResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 1) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 2, true).Build(); + + _programConvertIndexBuffer = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ConvertIndexBuffer.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, convertIndexBufferResourceLayout); + + var convertIndirectDataResourceLayout = new ResourceLayoutBuilder() + .Add(ResourceStages.Compute, ResourceType.UniformBuffer, 0) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 1) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 2, true) + .Add(ResourceStages.Compute, ResourceType.StorageBuffer, 3).Build(); + + _programConvertIndirectData = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ConvertIndirectData.spv"), ShaderStage.Compute, TargetLanguage.Spirv), + }, convertIndirectDataResourceLayout); + + _programDepthBlit = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("DepthBlitFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programDepthBlitMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("DepthBlitMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programDepthDrawToMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorDrawToMsVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("DepthDrawToMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorDrawToMsResourceLayout); + + _programDepthDrawToNonMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorDrawToMsVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("DepthDrawToNonMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorDrawToMsResourceLayout); + + if (gd.Capabilities.SupportsShaderStencilExport) + { + _programStencilBlit = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("StencilBlitFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programStencilBlitMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorBlitVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("StencilBlitMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, blitResourceLayout); + + _programStencilDrawToMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorDrawToMsVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("StencilDrawToMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorDrawToMsResourceLayout); + + _programStencilDrawToNonMs = gd.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(ReadSpirv("ColorDrawToMsVertex.spv"), ShaderStage.Vertex, TargetLanguage.Spirv), + new ShaderSource(ReadSpirv("StencilDrawToNonMsFragment.spv"), ShaderStage.Fragment, TargetLanguage.Spirv), + }, colorDrawToMsResourceLayout); + } + } + + private static byte[] ReadSpirv(string fileName) + { + return EmbeddedResources.Read(string.Join('/', ShaderBinariesPath, fileName)); + } + + public void Blit( + VulkanRenderer gd, + TextureView src, + TextureView dst, + Extents2D srcRegion, + Extents2D dstRegion, + int layers, + int levels, + bool isDepthOrStencil, + bool linearFilter, + bool clearAlpha = false) + { + gd.FlushAllCommands(); + + using var cbs = gd.CommandBufferPool.Rent(); + + for (int l = 0; l < levels; l++) + { + var mipSrcRegion = new Extents2D( + srcRegion.X1 >> l, + srcRegion.Y1 >> l, + srcRegion.X2 >> l, + srcRegion.Y2 >> l); + + var mipDstRegion = new Extents2D( + dstRegion.X1 >> l, + dstRegion.Y1 >> l, + dstRegion.X2 >> l, + dstRegion.Y2 >> l); + + for (int z = 0; z < layers; z++) + { + var srcView = Create2DLayerView(src, z, l); + var dstView = Create2DLayerView(dst, z, l); + + if (isDepthOrStencil) + { + BlitDepthStencil( + gd, + cbs, + srcView, + dstView, + mipSrcRegion, + mipDstRegion); + } + else + { + BlitColor( + gd, + cbs, + srcView, + dstView, + mipSrcRegion, + mipDstRegion, + linearFilter, + clearAlpha); + } + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + } + + public void CopyColor( + VulkanRenderer gd, + CommandBufferScoped cbs, + TextureView src, + TextureView dst, + int srcLayer, + int dstLayer, + int srcLevel, + int dstLevel, + int depth, + int levels) + { + for (int l = 0; l < levels; l++) + { + int mipSrcLevel = srcLevel + l; + int mipDstLevel = dstLevel + l; + + int srcWidth = Math.Max(1, src.Width >> mipSrcLevel); + int srcHeight = Math.Max(1, src.Height >> mipSrcLevel); + + int dstWidth = Math.Max(1, dst.Width >> mipDstLevel); + int dstHeight = Math.Max(1, dst.Height >> mipDstLevel); + + var extents = new Extents2D( + 0, + 0, + Math.Min(srcWidth, dstWidth), + Math.Min(srcHeight, dstHeight)); + + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, mipSrcLevel); + var dstView = Create2DLayerView(dst, dstLayer + z, mipDstLevel); + + BlitColor( + gd, + cbs, + srcView, + dstView, + extents, + extents, + false); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + } + + public void BlitColor( + VulkanRenderer gd, + CommandBufferScoped cbs, + TextureView src, + TextureView dst, + Extents2D srcRegion, + Extents2D dstRegion, + bool linearFilter, + bool clearAlpha = false) + { + _pipeline.SetCommandBuffer(cbs); + + const int RegionBufferSize = 16; + + var sampler = linearFilter ? _samplerLinear : _samplerNearest; + + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Fragment, 0, src, sampler); + + Span region = stackalloc float[RegionBufferSize / sizeof(float)]; + + region[0] = (float)srcRegion.X1 / src.Width; + region[1] = (float)srcRegion.X2 / src.Width; + region[2] = (float)srcRegion.Y1 / src.Height; + region[3] = (float)srcRegion.Y2 / src.Height; + + if (dstRegion.X1 > dstRegion.X2) + { + (region[0], region[1]) = (region[1], region[0]); + } + + if (dstRegion.Y1 > dstRegion.Y2) + { + (region[2], region[3]) = (region[3], region[2]); + } + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, RegionBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, region); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(1, buffer.Range) }); + + Span viewports = stackalloc Viewport[1]; + + var rect = new Rectangle( + MathF.Min(dstRegion.X1, dstRegion.X2), + MathF.Min(dstRegion.Y1, dstRegion.Y2), + MathF.Abs(dstRegion.X2 - dstRegion.X1), + MathF.Abs(dstRegion.Y2 - dstRegion.Y1)); + + viewports[0] = new Viewport( + rect, + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + bool dstIsDepthOrStencil = dst.Info.Format.IsDepthOrStencil(); + + if (dstIsDepthOrStencil) + { + _pipeline.SetProgram(src.Info.Target.IsMultisample() ? _programDepthBlitMs : _programDepthBlit); + _pipeline.SetDepthTest(new DepthTestDescriptor(true, true, CompareOp.Always)); + } + else if (src.Info.Target.IsMultisample()) + { + _pipeline.SetProgram(_programColorBlitMs); + } + else if (clearAlpha) + { + _pipeline.SetProgram(_programColorBlitClearAlpha); + } + else + { + _pipeline.SetProgram(_programColorBlit); + } + + int dstWidth = dst.Width; + int dstHeight = dst.Height; + + _pipeline.SetRenderTarget(dst, (uint)dstWidth, (uint)dstHeight); + _pipeline.SetRenderTargetColorMasks(new uint[] { 0xf }); + _pipeline.SetScissors(stackalloc Rectangle[] { new Rectangle(0, 0, dstWidth, dstHeight) }); + + if (clearAlpha) + { + _pipeline.ClearRenderTargetColor(0, 0, 1, new ColorF(0f, 0f, 0f, 1f)); + } + + _pipeline.SetViewports(viewports); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + _pipeline.Draw(4, 1, 0, 0); + + if (dstIsDepthOrStencil) + { + _pipeline.SetDepthTest(new DepthTestDescriptor(false, false, CompareOp.Always)); + } + + _pipeline.Finish(gd, cbs); + } + + private void BlitDepthStencil( + VulkanRenderer gd, + CommandBufferScoped cbs, + TextureView src, + TextureView dst, + Extents2D srcRegion, + Extents2D dstRegion) + { + _pipeline.SetCommandBuffer(cbs); + + const int RegionBufferSize = 16; + + Span region = stackalloc float[RegionBufferSize / sizeof(float)]; + + region[0] = (float)srcRegion.X1 / src.Width; + region[1] = (float)srcRegion.X2 / src.Width; + region[2] = (float)srcRegion.Y1 / src.Height; + region[3] = (float)srcRegion.Y2 / src.Height; + + if (dstRegion.X1 > dstRegion.X2) + { + (region[0], region[1]) = (region[1], region[0]); + } + + if (dstRegion.Y1 > dstRegion.Y2) + { + (region[2], region[3]) = (region[3], region[2]); + } + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, RegionBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, region); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(1, buffer.Range) }); + + Span viewports = stackalloc Viewport[1]; + + var rect = new Rectangle( + MathF.Min(dstRegion.X1, dstRegion.X2), + MathF.Min(dstRegion.Y1, dstRegion.Y2), + MathF.Abs(dstRegion.X2 - dstRegion.X1), + MathF.Abs(dstRegion.Y2 - dstRegion.Y1)); + + viewports[0] = new Viewport( + rect, + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + int dstWidth = dst.Width; + int dstHeight = dst.Height; + + _pipeline.SetRenderTarget(dst, (uint)dstWidth, (uint)dstHeight); + _pipeline.SetScissors(stackalloc Rectangle[] { new Rectangle(0, 0, dstWidth, dstHeight) }); + _pipeline.SetViewports(viewports); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + + var aspectFlags = src.Info.Format.ConvertAspectFlags(); + + if (aspectFlags.HasFlag(ImageAspectFlags.DepthBit)) + { + var depthTexture = CreateDepthOrStencilView(src, DepthStencilMode.Depth); + + BlitDepthStencilDraw(depthTexture, isDepth: true); + + if (depthTexture != src) + { + depthTexture.Release(); + } + } + + if (aspectFlags.HasFlag(ImageAspectFlags.StencilBit) && _programStencilBlit != null) + { + var stencilTexture = CreateDepthOrStencilView(src, DepthStencilMode.Stencil); + + BlitDepthStencilDraw(stencilTexture, isDepth: false); + + if (stencilTexture != src) + { + stencilTexture.Release(); + } + } + + _pipeline.Finish(gd, cbs); + } + + private static TextureView CreateDepthOrStencilView(TextureView depthStencilTexture, DepthStencilMode depthStencilMode) + { + if (depthStencilTexture.Info.DepthStencilMode == depthStencilMode) + { + return depthStencilTexture; + } + + return (TextureView)depthStencilTexture.CreateView(new TextureCreateInfo( + depthStencilTexture.Info.Width, + depthStencilTexture.Info.Height, + depthStencilTexture.Info.Depth, + depthStencilTexture.Info.Levels, + depthStencilTexture.Info.Samples, + depthStencilTexture.Info.BlockWidth, + depthStencilTexture.Info.BlockHeight, + depthStencilTexture.Info.BytesPerPixel, + depthStencilTexture.Info.Format, + depthStencilMode, + depthStencilTexture.Info.Target, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha), 0, 0); + } + + private void BlitDepthStencilDraw(TextureView src, bool isDepth) + { + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Fragment, 0, src, _samplerNearest); + + if (isDepth) + { + _pipeline.SetProgram(src.Info.Target.IsMultisample() ? _programDepthBlitMs : _programDepthBlit); + _pipeline.SetDepthTest(new DepthTestDescriptor(true, true, CompareOp.Always)); + } + else + { + _pipeline.SetProgram(src.Info.Target.IsMultisample() ? _programStencilBlitMs : _programStencilBlit); + _pipeline.SetStencilTest(CreateStencilTestDescriptor(true)); + } + + _pipeline.Draw(4, 1, 0, 0); + + if (isDepth) + { + _pipeline.SetDepthTest(new DepthTestDescriptor(false, false, CompareOp.Always)); + } + else + { + _pipeline.SetStencilTest(CreateStencilTestDescriptor(false)); + } + } + + private static StencilTestDescriptor CreateStencilTestDescriptor( + bool enabled, + int refValue = 0, + int compareMask = 0xff, + int writeMask = 0xff) + { + return new StencilTestDescriptor( + enabled, + CompareOp.Always, + StencilOp.Replace, + StencilOp.Replace, + StencilOp.Replace, + refValue, + compareMask, + writeMask, + CompareOp.Always, + StencilOp.Replace, + StencilOp.Replace, + StencilOp.Replace, + refValue, + compareMask, + writeMask); + } + + public void Clear( + VulkanRenderer gd, + TextureView dst, + ReadOnlySpan clearColor, + uint componentMask, + int dstWidth, + int dstHeight, + ComponentType type, + Rectangle scissor) + { + const int ClearColorBufferSize = 16; + + gd.FlushAllCommands(); + + using var cbs = gd.CommandBufferPool.Rent(); + + _pipeline.SetCommandBuffer(cbs); + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ClearColorBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, clearColor); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(1, buffer.Range) }); + + Span viewports = stackalloc Viewport[1]; + + viewports[0] = new Viewport( + new Rectangle(0, 0, dstWidth, dstHeight), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + IProgram program; + + if (type == ComponentType.SignedInteger) + { + program = _programColorClearSI; + } + else if (type == ComponentType.UnsignedInteger) + { + program = _programColorClearUI; + } + else + { + program = _programColorClearF; + } + + _pipeline.SetProgram(program); + _pipeline.SetRenderTarget(dst, (uint)dstWidth, (uint)dstHeight); + _pipeline.SetRenderTargetColorMasks(new[] { componentMask }); + _pipeline.SetViewports(viewports); + _pipeline.SetScissors(stackalloc Rectangle[] { scissor }); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + _pipeline.Draw(4, 1, 0, 0); + _pipeline.Finish(); + } + + public void Clear( + VulkanRenderer gd, + TextureView dst, + float depthValue, + bool depthMask, + int stencilValue, + int stencilMask, + int dstWidth, + int dstHeight, + VkFormat dstFormat, + Rectangle scissor) + { + const int ClearColorBufferSize = 16; + + gd.FlushAllCommands(); + + using var cbs = gd.CommandBufferPool.Rent(); + + _pipeline.SetCommandBuffer(cbs); + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ClearColorBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, stackalloc float[] { depthValue }); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(1, buffer.Range) }); + + Span viewports = stackalloc Viewport[1]; + + viewports[0] = new Viewport( + new Rectangle(0, 0, dstWidth, dstHeight), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + _pipeline.SetProgram(_programDepthStencilClear); + _pipeline.SetRenderTarget(dst, (uint)dstWidth, (uint)dstHeight); + _pipeline.SetViewports(viewports); + _pipeline.SetScissors(stackalloc Rectangle[] { scissor }); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + _pipeline.SetDepthTest(new DepthTestDescriptor(true, depthMask, CompareOp.Always)); + _pipeline.SetStencilTest(CreateStencilTestDescriptor(stencilMask != 0, stencilValue, 0xff, stencilMask)); + _pipeline.Draw(4, 1, 0, 0); + _pipeline.Finish(); + } + + public void DrawTexture( + VulkanRenderer gd, + PipelineBase pipeline, + TextureView src, + ISampler srcSampler, + Extents2DF srcRegion, + Extents2DF dstRegion) + { + const int RegionBufferSize = 16; + + pipeline.SetTextureAndSampler(ShaderStage.Fragment, 0, src, srcSampler); + + Span region = stackalloc float[RegionBufferSize / sizeof(float)]; + + region[0] = srcRegion.X1 / src.Width; + region[1] = srcRegion.X2 / src.Width; + region[2] = srcRegion.Y1 / src.Height; + region[3] = srcRegion.Y2 / src.Height; + + if (dstRegion.X1 > dstRegion.X2) + { + (region[0], region[1]) = (region[1], region[0]); + } + + if (dstRegion.Y1 > dstRegion.Y2) + { + (region[2], region[3]) = (region[3], region[2]); + } + + var bufferHandle = gd.BufferManager.CreateWithHandle(gd, RegionBufferSize); + + gd.BufferManager.SetData(bufferHandle, 0, region); + + pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(1, new BufferRange(bufferHandle, 0, RegionBufferSize)) }); + + Span viewports = stackalloc Viewport[1]; + + var rect = new Rectangle( + MathF.Min(dstRegion.X1, dstRegion.X2), + MathF.Min(dstRegion.Y1, dstRegion.Y2), + MathF.Abs(dstRegion.X2 - dstRegion.X1), + MathF.Abs(dstRegion.Y2 - dstRegion.Y1)); + + viewports[0] = new Viewport( + rect, + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + pipeline.SetProgram(_programColorBlit); + pipeline.SetViewports(viewports); + pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + pipeline.Draw(4, 1, 0, 0); + + gd.BufferManager.Delete(bufferHandle); + } + + public void ConvertI8ToI16(VulkanRenderer gd, CommandBufferScoped cbs, BufferHolder src, BufferHolder dst, int srcOffset, int size) + { + ChangeStride(gd, cbs, src, dst, srcOffset, size, 1, 2); + } + + public unsafe void ChangeStride(VulkanRenderer gd, CommandBufferScoped cbs, BufferHolder src, BufferHolder dst, int srcOffset, int size, int stride, int newStride) + { + bool supportsUint8 = gd.Capabilities.SupportsShaderInt8; + + int elems = size / stride; + int newSize = elems * newStride; + + var srcBufferAuto = src.GetBuffer(); + var dstBufferAuto = dst.GetBuffer(); + + var srcBuffer = srcBufferAuto.Get(cbs, srcOffset, size).Value; + var dstBuffer = dstBufferAuto.Get(cbs, 0, newSize).Value; + + var access = supportsUint8 ? AccessFlags.ShaderWriteBit : AccessFlags.TransferWriteBit; + var stage = supportsUint8 ? PipelineStageFlags.ComputeShaderBit : PipelineStageFlags.TransferBit; + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + BufferHolder.DefaultAccessFlags, + access, + PipelineStageFlags.AllCommandsBit, + stage, + 0, + newSize); + + if (supportsUint8) + { + const int ParamsBufferSize = 16; + + Span shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)]; + + shaderParams[0] = stride; + shaderParams[1] = newStride; + shaderParams[2] = size; + shaderParams[3] = srcOffset; + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, shaderParams); + + _pipeline.SetCommandBuffer(cbs); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, buffer.Range) }); + + Span> sbRanges = new Auto[2]; + + sbRanges[0] = srcBufferAuto; + sbRanges[1] = dstBufferAuto; + + _pipeline.SetStorageBuffers(1, sbRanges); + + _pipeline.SetProgram(_programStrideChange); + _pipeline.DispatchCompute(1 + elems / ConvertElementsPerWorkgroup, 1, 1); + + _pipeline.Finish(gd, cbs); + } + else + { + gd.Api.CmdFillBuffer(cbs.CommandBuffer, dstBuffer, 0, Vk.WholeSize, 0); + + var bufferCopy = new BufferCopy[elems]; + + for (ulong i = 0; i < (ulong)elems; i++) + { + bufferCopy[i] = new BufferCopy((ulong)srcOffset + i * (ulong)stride, i * (ulong)newStride, (ulong)stride); + } + + fixed (BufferCopy* pBufferCopy = bufferCopy) + { + gd.Api.CmdCopyBuffer(cbs.CommandBuffer, srcBuffer, dstBuffer, (uint)elems, pBufferCopy); + } + } + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + access, + BufferHolder.DefaultAccessFlags, + stage, + PipelineStageFlags.AllCommandsBit, + 0, + newSize); + } + + public unsafe void ConvertIndexBuffer(VulkanRenderer gd, + CommandBufferScoped cbs, + BufferHolder src, + BufferHolder dst, + IndexBufferPattern pattern, + int indexSize, + int srcOffset, + int indexCount) + { + // TODO: Support conversion with primitive restart enabled. + // TODO: Convert with a compute shader? + + int convertedCount = pattern.GetConvertedCount(indexCount); + int outputIndexSize = 4; + + var srcBuffer = src.GetBuffer().Get(cbs, srcOffset, indexCount * indexSize).Value; + var dstBuffer = dst.GetBuffer().Get(cbs, 0, convertedCount * outputIndexSize).Value; + + gd.Api.CmdFillBuffer(cbs.CommandBuffer, dstBuffer, 0, Vk.WholeSize, 0); + + var bufferCopy = new List(); + int outputOffset = 0; + + // Try to merge copies of adjacent indices to reduce copy count. + int sequenceStart = 0; + int sequenceLength = 0; + + foreach (var index in pattern.GetIndexMapping(indexCount)) + { + if (sequenceLength > 0) + { + if (index == sequenceStart + sequenceLength && indexSize == outputIndexSize) + { + sequenceLength++; + continue; + } + + // Commit the copy so far. + bufferCopy.Add(new BufferCopy((ulong)(srcOffset + sequenceStart * indexSize), (ulong)outputOffset, (ulong)(indexSize * sequenceLength))); + outputOffset += outputIndexSize * sequenceLength; + } + + sequenceStart = index; + sequenceLength = 1; + } + + if (sequenceLength > 0) + { + // Commit final pending copy. + bufferCopy.Add(new BufferCopy((ulong)(srcOffset + sequenceStart * indexSize), (ulong)outputOffset, (ulong)(indexSize * sequenceLength))); + } + + var bufferCopyArray = bufferCopy.ToArray(); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + BufferHolder.DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + 0, + convertedCount * outputIndexSize); + + fixed (BufferCopy* pBufferCopy = bufferCopyArray) + { + gd.Api.CmdCopyBuffer(cbs.CommandBuffer, srcBuffer, dstBuffer, (uint)bufferCopyArray.Length, pBufferCopy); + } + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + AccessFlags.TransferWriteBit, + BufferHolder.DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + 0, + convertedCount * outputIndexSize); + } + + public void CopyIncompatibleFormats( + VulkanRenderer gd, + CommandBufferScoped cbs, + TextureView src, + TextureView dst, + int srcLayer, + int dstLayer, + int srcLevel, + int dstLevel, + int depth, + int levels) + { + const int ParamsBufferSize = 4; + + Span shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)]; + + int srcBpp = src.Info.BytesPerPixel; + int dstBpp = dst.Info.BytesPerPixel; + + int ratio = srcBpp < dstBpp ? dstBpp / srcBpp : srcBpp / dstBpp; + + shaderParams[0] = BitOperations.Log2((uint)ratio); + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, shaderParams); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + src.GetImage().Get(cbs).Value, + TextureStorage.DefaultAccessMask, + AccessFlags.ShaderReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.ComputeShaderBit, + ImageAspectFlags.ColorBit, + src.FirstLayer + srcLayer, + src.FirstLevel + srcLevel, + depth, + levels); + + _pipeline.SetCommandBuffer(cbs); + + _pipeline.SetProgram(srcBpp < dstBpp ? _programColorCopyWidening : _programColorCopyShortening); + + // Calculate ideal component size, given our constraints: + // - Component size must not exceed bytes per pixel of source and destination image formats. + // - Maximum component size is 4 (R32). + int componentSize = Math.Min(Math.Min(srcBpp, dstBpp), 4); + + var srcFormat = GetFormat(componentSize, srcBpp / componentSize); + var dstFormat = GetFormat(componentSize, dstBpp / componentSize); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, buffer.Range) }); + + for (int l = 0; l < levels; l++) + { + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, srcLevel + l, srcFormat); + var dstView = Create2DLayerView(dst, dstLayer + z, dstLevel + l); + + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Compute, 0, srcView, null); + _pipeline.SetImage(ShaderStage.Compute, 0, dstView.GetView(dstFormat)); + + int dispatchX = (Math.Min(srcView.Info.Width, dstView.Info.Width) + 31) / 32; + int dispatchY = (Math.Min(srcView.Info.Height, dstView.Info.Height) + 31) / 32; + + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + + _pipeline.Finish(gd, cbs); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + dst.GetImage().Get(cbs).Value, + AccessFlags.ShaderWriteBit, + TextureStorage.DefaultAccessMask, + PipelineStageFlags.ComputeShaderBit, + PipelineStageFlags.AllCommandsBit, + ImageAspectFlags.ColorBit, + dst.FirstLayer + dstLayer, + dst.FirstLevel + dstLevel, + depth, + levels); + } + + public void CopyMSToNonMS(VulkanRenderer gd, CommandBufferScoped cbs, TextureView src, TextureView dst, int srcLayer, int dstLayer, int depth) + { + const int ParamsBufferSize = 16; + + Span shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)]; + + int samples = src.Info.Samples; + bool isDepthOrStencil = src.Info.Format.IsDepthOrStencil(); + var aspectFlags = src.Info.Format.ConvertAspectFlags(); + + // X and Y are the expected texture samples. + // Z and W are the actual texture samples used. + // They may differ if the GPU does not support the samples count requested and we had to use a lower amount. + (shaderParams[0], shaderParams[1]) = GetSampleCountXYLog2(samples); + (shaderParams[2], shaderParams[3]) = GetSampleCountXYLog2((int)TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)samples)); + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, shaderParams); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + src.GetImage().Get(cbs).Value, + TextureStorage.DefaultAccessMask, + AccessFlags.ShaderReadBit, + PipelineStageFlags.AllCommandsBit, + isDepthOrStencil ? PipelineStageFlags.FragmentShaderBit : PipelineStageFlags.ComputeShaderBit, + aspectFlags, + src.FirstLayer + srcLayer, + src.FirstLevel, + depth, + 1); + + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, buffer.Range) }); + + if (isDepthOrStencil) + { + // We can't use compute for this case because compute can't modify depth textures. + + Span viewports = stackalloc Viewport[1]; + + var rect = new Rectangle(0, 0, dst.Width, dst.Height); + + viewports[0] = new Viewport( + rect, + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + _pipeline.SetScissors(stackalloc Rectangle[] { new Rectangle(0, 0, dst.Width, dst.Height) }); + _pipeline.SetViewports(viewports); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, 0); + var dstView = Create2DLayerView(dst, dstLayer + z, 0); + + _pipeline.SetRenderTarget(dstView, (uint)dst.Width, (uint)dst.Height); + + CopyMSDraw(srcView, aspectFlags, fromMS: true); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + else + { + var format = GetFormat(src.Info.BytesPerPixel); + + int dispatchX = (dst.Info.Width + 31) / 32; + int dispatchY = (dst.Info.Height + 31) / 32; + + _pipeline.SetProgram(_programColorCopyToNonMs); + + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, 0, format); + var dstView = Create2DLayerView(dst, dstLayer + z, 0); + + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Compute, 0, srcView, null); + _pipeline.SetImage(ShaderStage.Compute, 0, dstView.GetView(format)); + + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + + _pipeline.Finish(gd, cbs); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + dst.GetImage().Get(cbs).Value, + isDepthOrStencil ? AccessFlags.DepthStencilAttachmentWriteBit : AccessFlags.ShaderWriteBit, + TextureStorage.DefaultAccessMask, + isDepthOrStencil ? PipelineStageFlags.LateFragmentTestsBit : PipelineStageFlags.ComputeShaderBit, + PipelineStageFlags.AllCommandsBit, + aspectFlags, + dst.FirstLayer + dstLayer, + dst.FirstLevel, + depth, + 1); + } + + public void CopyNonMSToMS(VulkanRenderer gd, CommandBufferScoped cbs, TextureView src, TextureView dst, int srcLayer, int dstLayer, int depth) + { + const int ParamsBufferSize = 16; + + Span shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)]; + + int samples = dst.Info.Samples; + bool isDepthOrStencil = src.Info.Format.IsDepthOrStencil(); + var aspectFlags = src.Info.Format.ConvertAspectFlags(); + + // X and Y are the expected texture samples. + // Z and W are the actual texture samples used. + // They may differ if the GPU does not support the samples count requested and we had to use a lower amount. + (shaderParams[0], shaderParams[1]) = GetSampleCountXYLog2(samples); + (shaderParams[2], shaderParams[3]) = GetSampleCountXYLog2((int)TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)samples)); + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, shaderParams); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + src.GetImage().Get(cbs).Value, + TextureStorage.DefaultAccessMask, + AccessFlags.ShaderReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.FragmentShaderBit, + aspectFlags, + src.FirstLayer + srcLayer, + src.FirstLevel, + depth, + 1); + + _pipeline.SetCommandBuffer(cbs); + + Span viewports = stackalloc Viewport[1]; + + var rect = new Rectangle(0, 0, dst.Width, dst.Height); + + viewports[0] = new Viewport( + rect, + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + _pipeline.SetRenderTargetColorMasks(new uint[] { 0xf }); + _pipeline.SetScissors(stackalloc Rectangle[] { new Rectangle(0, 0, dst.Width, dst.Height) }); + _pipeline.SetViewports(viewports); + _pipeline.SetPrimitiveTopology(PrimitiveTopology.TriangleStrip); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, buffer.Range) }); + + if (isDepthOrStencil) + { + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, 0); + var dstView = Create2DLayerView(dst, dstLayer + z, 0); + + _pipeline.SetRenderTarget(dstView, (uint)dst.Width, (uint)dst.Height); + + CopyMSDraw(srcView, aspectFlags, fromMS: false); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + else + { + _pipeline.SetProgram(_programColorDrawToMs); + + var format = GetFormat(src.Info.BytesPerPixel); + var vkFormat = FormatTable.GetFormat(format); + + for (int z = 0; z < depth; z++) + { + var srcView = Create2DLayerView(src, srcLayer + z, 0, format); + var dstView = Create2DLayerView(dst, dstLayer + z, 0); + + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Fragment, 0, srcView, null); + _pipeline.SetRenderTarget(dstView.GetView(format), (uint)dst.Width, (uint)dst.Height); + + _pipeline.Draw(4, 1, 0, 0); + + if (srcView != src) + { + srcView.Release(); + } + + if (dstView != dst) + { + dstView.Release(); + } + } + } + + _pipeline.Finish(gd, cbs); + + TextureView.InsertImageBarrier( + gd.Api, + cbs.CommandBuffer, + dst.GetImage().Get(cbs).Value, + isDepthOrStencil ? AccessFlags.DepthStencilAttachmentWriteBit : AccessFlags.ColorAttachmentWriteBit, + TextureStorage.DefaultAccessMask, + isDepthOrStencil ? PipelineStageFlags.LateFragmentTestsBit : PipelineStageFlags.ColorAttachmentOutputBit, + PipelineStageFlags.AllCommandsBit, + aspectFlags, + dst.FirstLayer + dstLayer, + dst.FirstLevel, + depth, + 1); + } + + private void CopyMSDraw(TextureView src, ImageAspectFlags aspectFlags, bool fromMS) + { + if (aspectFlags.HasFlag(ImageAspectFlags.DepthBit)) + { + var depthTexture = CreateDepthOrStencilView(src, DepthStencilMode.Depth); + + CopyMSAspectDraw(depthTexture, fromMS, isDepth: true); + + if (depthTexture != src) + { + depthTexture.Release(); + } + } + + if (aspectFlags.HasFlag(ImageAspectFlags.StencilBit) && _programStencilDrawToMs != null) + { + var stencilTexture = CreateDepthOrStencilView(src, DepthStencilMode.Stencil); + + CopyMSAspectDraw(stencilTexture, fromMS, isDepth: false); + + if (stencilTexture != src) + { + stencilTexture.Release(); + } + } + } + + private void CopyMSAspectDraw(TextureView src, bool fromMS, bool isDepth) + { + _pipeline.SetTextureAndSamplerIdentitySwizzle(ShaderStage.Fragment, 0, src, _samplerNearest); + + if (isDepth) + { + _pipeline.SetProgram(fromMS ? _programDepthDrawToNonMs : _programDepthDrawToMs); + _pipeline.SetDepthTest(new DepthTestDescriptor(true, true, CompareOp.Always)); + } + else + { + _pipeline.SetProgram(fromMS ? _programStencilDrawToNonMs : _programStencilDrawToMs); + _pipeline.SetStencilTest(CreateStencilTestDescriptor(true)); + } + + _pipeline.Draw(4, 1, 0, 0); + + if (isDepth) + { + _pipeline.SetDepthTest(new DepthTestDescriptor(false, false, CompareOp.Always)); + } + else + { + _pipeline.SetStencilTest(CreateStencilTestDescriptor(false)); + } + } + + private static (int, int) GetSampleCountXYLog2(int samples) + { + int samplesInXLog2 = 0; + int samplesInYLog2 = 0; + + switch (samples) + { + case 2: // 2x1 + samplesInXLog2 = 1; + break; + case 4: // 2x2 + samplesInXLog2 = 1; + samplesInYLog2 = 1; + break; + case 8: // 4x2 + samplesInXLog2 = 2; + samplesInYLog2 = 1; + break; + case 16: // 4x4 + samplesInXLog2 = 2; + samplesInYLog2 = 2; + break; + case 32: // 8x4 + samplesInXLog2 = 3; + samplesInYLog2 = 2; + break; + case 64: // 8x8 + samplesInXLog2 = 3; + samplesInYLog2 = 3; + break; + } + + return (samplesInXLog2, samplesInYLog2); + } + + private static TextureView Create2DLayerView(TextureView from, int layer, int level, Format? format = null) + { + if (from.Info.Target == Target.Texture2D && level == 0 && (format == null || format.Value == from.Info.Format)) + { + return from; + } + + var target = from.Info.Target switch + { + Target.Texture1DArray => Target.Texture1D, + Target.Texture2DMultisampleArray => Target.Texture2DMultisample, + _ => Target.Texture2D, + }; + + var info = new TextureCreateInfo( + Math.Max(1, from.Info.Width >> level), + Math.Max(1, from.Info.Height >> level), + 1, + 1, + from.Info.Samples, + from.Info.BlockWidth, + from.Info.BlockHeight, + from.Info.BytesPerPixel, + format ?? from.Info.Format, + from.Info.DepthStencilMode, + target, + from.Info.SwizzleR, + from.Info.SwizzleG, + from.Info.SwizzleB, + from.Info.SwizzleA); + + return from.CreateViewImpl(info, layer, level); + } + + private static Format GetFormat(int bytesPerPixel) + { + return bytesPerPixel switch + { + 1 => Format.R8Uint, + 2 => Format.R16Uint, + 4 => Format.R32Uint, + 8 => Format.R32G32Uint, + 16 => Format.R32G32B32A32Uint, + _ => throw new ArgumentException($"Invalid bytes per pixel {bytesPerPixel}."), + }; + } + + private static Format GetFormat(int componentSize, int componentsCount) + { + if (componentSize == 1) + { + return componentsCount switch + { + 1 => Format.R8Uint, + 2 => Format.R8G8Uint, + 4 => Format.R8G8B8A8Uint, + _ => throw new ArgumentException($"Invalid components count {componentsCount}."), + }; + } + + if (componentSize == 2) + { + return componentsCount switch + { + 1 => Format.R16Uint, + 2 => Format.R16G16Uint, + 4 => Format.R16G16B16A16Uint, + _ => throw new ArgumentException($"Invalid components count {componentsCount}."), + }; + } + + if (componentSize == 4) + { + return componentsCount switch + { + 1 => Format.R32Uint, + 2 => Format.R32G32Uint, + 4 => Format.R32G32B32A32Uint, + _ => throw new ArgumentException($"Invalid components count {componentsCount}."), + }; + } + + throw new ArgumentException($"Invalid component size {componentSize}."); + } + + public void ConvertIndexBufferIndirect( + VulkanRenderer gd, + CommandBufferScoped cbs, + BufferHolder srcIndirectBuffer, + BufferHolder dstIndirectBuffer, + BufferRange drawCountBuffer, + BufferHolder srcIndexBuffer, + BufferHolder dstIndexBuffer, + IndexBufferPattern pattern, + int indexSize, + int srcIndexBufferOffset, + int srcIndexBufferSize, + int srcIndirectBufferOffset, + bool hasDrawCount, + int maxDrawCount, + int indirectDataStride) + { + // TODO: Support conversion with primitive restart enabled. + + BufferRange drawCountBufferAligned = new( + drawCountBuffer.Handle, + drawCountBuffer.Offset & ~(UniformBufferAlignment - 1), + UniformBufferAlignment); + + int indirectDataSize = maxDrawCount * indirectDataStride; + + int indexCount = srcIndexBufferSize / indexSize; + int primitivesCount = pattern.GetPrimitiveCount(indexCount); + int convertedCount = pattern.GetConvertedCount(indexCount); + int outputIndexSize = 4; + + var srcBuffer = srcIndexBuffer.GetBuffer().Get(cbs, srcIndexBufferOffset, indexCount * indexSize).Value; + var dstBuffer = dstIndexBuffer.GetBuffer().Get(cbs, 0, convertedCount * outputIndexSize).Value; + + const int ParamsBufferSize = 24 * sizeof(int); + const int ParamsIndirectDispatchOffset = 16 * sizeof(int); + const int ParamsIndirectDispatchSize = 3 * sizeof(int); + + Span shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)]; + + shaderParams[8] = pattern.PrimitiveVertices; + shaderParams[9] = pattern.PrimitiveVerticesOut; + shaderParams[10] = indexSize; + shaderParams[11] = outputIndexSize; + shaderParams[12] = pattern.BaseIndex; + shaderParams[13] = pattern.IndexStride; + shaderParams[14] = srcIndexBufferOffset; + shaderParams[15] = primitivesCount; + shaderParams[16] = 1; + shaderParams[17] = 1; + shaderParams[18] = 1; + shaderParams[19] = hasDrawCount ? 1 : 0; + shaderParams[20] = maxDrawCount; + shaderParams[21] = (drawCountBuffer.Offset & (UniformBufferAlignment - 1)) / 4; + shaderParams[22] = indirectDataStride / 4; + shaderParams[23] = srcIndirectBufferOffset / 4; + + pattern.OffsetIndex.CopyTo(shaderParams[..pattern.OffsetIndex.Length]); + + using var patternScoped = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + var patternBuffer = patternScoped.Holder; + var patternBufferAuto = patternBuffer.GetBuffer(); + + patternBuffer.SetDataUnchecked(patternScoped.Offset, shaderParams); + + _pipeline.SetCommandBuffer(cbs); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + srcIndirectBuffer.GetBuffer().Get(cbs, srcIndirectBufferOffset, indirectDataSize).Value, + BufferHolder.DefaultAccessFlags, + AccessFlags.ShaderReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.ComputeShaderBit, + srcIndirectBufferOffset, + indirectDataSize); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, drawCountBufferAligned) }); + _pipeline.SetStorageBuffers(1, new[] { srcIndirectBuffer.GetBuffer(), dstIndirectBuffer.GetBuffer() }); + _pipeline.SetStorageBuffers(stackalloc[] { new BufferAssignment(3, patternScoped.Range) }); + + _pipeline.SetProgram(_programConvertIndirectData); + _pipeline.DispatchCompute(1, 1, 1); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + patternBufferAuto.Get(cbs, patternScoped.Offset + ParamsIndirectDispatchOffset, ParamsIndirectDispatchSize).Value, + AccessFlags.ShaderWriteBit, + AccessFlags.IndirectCommandReadBit, + PipelineStageFlags.ComputeShaderBit, + PipelineStageFlags.DrawIndirectBit, + patternScoped.Offset + ParamsIndirectDispatchOffset, + ParamsIndirectDispatchSize); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + BufferHolder.DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + 0, + convertedCount * outputIndexSize); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, new BufferRange(patternScoped.Handle, patternScoped.Offset, ParamsBufferSize)) }); + _pipeline.SetStorageBuffers(1, new[] { srcIndexBuffer.GetBuffer(), dstIndexBuffer.GetBuffer() }); + + _pipeline.SetProgram(_programConvertIndexBuffer); + _pipeline.DispatchComputeIndirect(patternBufferAuto, patternScoped.Offset + ParamsIndirectDispatchOffset); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + AccessFlags.TransferWriteBit, + BufferHolder.DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + 0, + convertedCount * outputIndexSize); + + _pipeline.Finish(gd, cbs); + } + + public unsafe void ConvertD32S8ToD24S8(VulkanRenderer gd, CommandBufferScoped cbs, BufferHolder src, Auto dstBufferAuto, int pixelCount, int dstOffset) + { + int inSize = pixelCount * 2 * sizeof(int); + int outSize = pixelCount * sizeof(int); + + var srcBufferAuto = src.GetBuffer(); + + var srcBuffer = srcBufferAuto.Get(cbs, 0, inSize).Value; + var dstBuffer = dstBufferAuto.Get(cbs, dstOffset, outSize).Value; + + var access = AccessFlags.ShaderWriteBit; + var stage = PipelineStageFlags.ComputeShaderBit; + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + srcBuffer, + BufferHolder.DefaultAccessFlags, + AccessFlags.ShaderReadBit, + PipelineStageFlags.AllCommandsBit, + stage, + 0, + outSize); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + BufferHolder.DefaultAccessFlags, + access, + PipelineStageFlags.AllCommandsBit, + stage, + 0, + outSize); + + const int ParamsBufferSize = sizeof(int) * 2; + + Span shaderParams = stackalloc int[2]; + + shaderParams[0] = pixelCount; + shaderParams[1] = dstOffset; + + using var buffer = gd.BufferManager.ReserveOrCreate(gd, cbs, ParamsBufferSize); + + buffer.Holder.SetDataUnchecked(buffer.Offset, shaderParams); + + _pipeline.SetCommandBuffer(cbs); + + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, buffer.Range) }); + + Span> sbRanges = new Auto[2]; + + sbRanges[0] = srcBufferAuto; + sbRanges[1] = dstBufferAuto; + + _pipeline.SetStorageBuffers(1, sbRanges); + + _pipeline.SetProgram(_programConvertD32S8ToD24S8); + _pipeline.DispatchCompute(1 + inSize / ConvertElementsPerWorkgroup, 1, 1); + + _pipeline.Finish(gd, cbs); + + BufferHolder.InsertBufferBarrier( + gd, + cbs.CommandBuffer, + dstBuffer, + access, + BufferHolder.DefaultAccessFlags, + stage, + PipelineStageFlags.AllCommandsBit, + 0, + outSize); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + _programColorBlitClearAlpha.Dispose(); + _programColorBlit.Dispose(); + _programColorBlitMs.Dispose(); + _programColorClearF.Dispose(); + _programColorClearSI.Dispose(); + _programColorClearUI.Dispose(); + _programDepthStencilClear.Dispose(); + _programStrideChange.Dispose(); + _programConvertIndexBuffer.Dispose(); + _programConvertIndirectData.Dispose(); + _programColorCopyShortening.Dispose(); + _programColorCopyToNonMs.Dispose(); + _programColorCopyWidening.Dispose(); + _programColorDrawToMs.Dispose(); + _programDepthBlit.Dispose(); + _programDepthBlitMs.Dispose(); + _programDepthDrawToMs.Dispose(); + _programDepthDrawToNonMs.Dispose(); + _programStencilBlit?.Dispose(); + _programStencilBlitMs?.Dispose(); + _programStencilDrawToMs?.Dispose(); + _programStencilDrawToNonMs?.Dispose(); + _samplerNearest.Dispose(); + _samplerLinear.Dispose(); + _pipeline.Dispose(); + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/HostMemoryAllocator.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/HostMemoryAllocator.cs new file mode 100644 index 000000000..a80ec1f0e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/HostMemoryAllocator.cs @@ -0,0 +1,189 @@ +using Ryujinx.Common; +using Ryujinx.Common.Collections; +using Ryujinx.Common.Logging; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.EXT; +using System; +using System.Collections.Generic; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class HostMemoryAllocator + { + private readonly struct HostMemoryAllocation + { + public readonly Auto Allocation; + public readonly nint Pointer; + public readonly ulong Size; + + public ulong Start => (ulong)Pointer; + public ulong End => (ulong)Pointer + Size; + + public HostMemoryAllocation(Auto allocation, nint pointer, ulong size) + { + Allocation = allocation; + Pointer = pointer; + Size = size; + } + } + + private readonly MemoryAllocator _allocator; + private readonly Vk _api; + private readonly ExtExternalMemoryHost _hostMemoryApi; + private readonly Device _device; + private readonly Lock _lock = new(); + + private readonly List _allocations; + private readonly IntervalTree _allocationTree; + + public HostMemoryAllocator(MemoryAllocator allocator, Vk api, ExtExternalMemoryHost hostMemoryApi, Device device) + { + _allocator = allocator; + _api = api; + _hostMemoryApi = hostMemoryApi; + _device = device; + + _allocations = new List(); + _allocationTree = new IntervalTree(); + } + + public unsafe bool TryImport( + MemoryRequirements requirements, + MemoryPropertyFlags flags, + nint pointer, + ulong size) + { + lock (_lock) + { + // Does a compatible allocation exist in the tree? + var allocations = new HostMemoryAllocation[10]; + + ulong start = (ulong)pointer; + ulong end = start + size; + + int count = _allocationTree.Get(start, end, ref allocations); + + // A compatible range is one that where the start and end completely cover the requested range. + for (int i = 0; i < count; i++) + { + HostMemoryAllocation existing = allocations[i]; + + if (start >= existing.Start && end <= existing.End) + { + try + { + existing.Allocation.IncrementReferenceCount(); + + return true; + } + catch (InvalidOperationException) + { + // Can throw if the allocation has been disposed. + // Just continue the search if this happens. + } + } + } + + nint pageAlignedPointer = BitUtils.AlignDown(pointer, Environment.SystemPageSize); + nint pageAlignedEnd = BitUtils.AlignUp((nint)((ulong)pointer + size), Environment.SystemPageSize); + ulong pageAlignedSize = (ulong)(pageAlignedEnd - pageAlignedPointer); + + Result getResult = _hostMemoryApi.GetMemoryHostPointerProperties(_device, ExternalMemoryHandleTypeFlags.HostAllocationBitExt, (void*)pageAlignedPointer, out MemoryHostPointerPropertiesEXT properties); + if (getResult < Result.Success) + { + return false; + } + + int memoryTypeIndex = _allocator.FindSuitableMemoryTypeIndex(properties.MemoryTypeBits & requirements.MemoryTypeBits, flags); + if (memoryTypeIndex < 0) + { + return false; + } + + ImportMemoryHostPointerInfoEXT importInfo = new() + { + SType = StructureType.ImportMemoryHostPointerInfoExt, + HandleType = ExternalMemoryHandleTypeFlags.HostAllocationBitExt, + PHostPointer = (void*)pageAlignedPointer, + }; + + var memoryAllocateInfo = new MemoryAllocateInfo + { + SType = StructureType.MemoryAllocateInfo, + AllocationSize = pageAlignedSize, + MemoryTypeIndex = (uint)memoryTypeIndex, + PNext = &importInfo, + }; + + Result result = _api.AllocateMemory(_device, in memoryAllocateInfo, null, out var deviceMemory); + + if (result < Result.Success) + { + Logger.Debug?.PrintMsg(LogClass.Gpu, $"Host mapping import 0x{pageAlignedPointer:x16} 0x{pageAlignedSize:x8} failed."); + return false; + } + + var allocation = new MemoryAllocation(this, deviceMemory, pageAlignedPointer, 0, pageAlignedSize); + var allocAuto = new Auto(allocation); + var hostAlloc = new HostMemoryAllocation(allocAuto, pageAlignedPointer, pageAlignedSize); + + allocAuto.IncrementReferenceCount(); + allocAuto.Dispose(); // Kept alive by ref count only. + + // Register this mapping for future use. + + _allocationTree.Add(hostAlloc.Start, hostAlloc.End, hostAlloc); + _allocations.Add(hostAlloc); + } + + return true; + } + + public (Auto, ulong) GetExistingAllocation(nint pointer, ulong size) + { + lock (_lock) + { + // Does a compatible allocation exist in the tree? + var allocations = new HostMemoryAllocation[10]; + + ulong start = (ulong)pointer; + ulong end = start + size; + + int count = _allocationTree.Get(start, end, ref allocations); + + // A compatible range is one that where the start and end completely cover the requested range. + for (int i = 0; i < count; i++) + { + HostMemoryAllocation existing = allocations[i]; + + if (start >= existing.Start && end <= existing.End) + { + return (existing.Allocation, start - existing.Start); + } + } + + throw new InvalidOperationException($"No host allocation was prepared for requested range 0x{pointer:x16}:0x{size:x16}."); + } + } + + public void Free(DeviceMemory memory, ulong offset, ulong size) + { + lock (_lock) + { + _allocations.RemoveAll(allocation => + { + if (allocation.Allocation.GetUnsafe().Memory.Handle == memory.Handle) + { + _allocationTree.Remove(allocation.Start, allocation); + return true; + } + + return false; + }); + } + + _api.FreeMemory(_device, memory, ReadOnlySpan.Empty); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/IdList.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/IdList.cs new file mode 100644 index 000000000..60b3cb486 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/IdList.cs @@ -0,0 +1,121 @@ +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class IdList where T : class + { + private readonly List _list; + private int _freeMin; + + public IdList() + { + _list = new List(); + _freeMin = 0; + } + + public int Add(T value) + { + int id; + int count = _list.Count; + id = _list.IndexOf(null, _freeMin); + + if ((uint)id < (uint)count) + { + _list[id] = value; + } + else + { + id = count; + _freeMin = id + 1; + + _list.Add(value); + } + + return id + 1; + } + + public void Remove(int id) + { + id--; + + int count = _list.Count; + + if ((uint)id >= (uint)count) + { + return; + } + + if (id + 1 == count) + { + // Trim unused items. + int removeIndex = id; + + while (removeIndex > 0 && _list[removeIndex - 1] == null) + { + removeIndex--; + } + + _list.RemoveRange(removeIndex, count - removeIndex); + + if (_freeMin > removeIndex) + { + _freeMin = removeIndex; + } + } + else + { + _list[id] = null; + + if (_freeMin > id) + { + _freeMin = id; + } + } + } + + public bool TryGetValue(int id, out T value) + { + id--; + + try + { + if ((uint)id < (uint)_list.Count) + { + value = _list[id]; + return value != null; + } + + value = null; + return false; + } + catch (ArgumentOutOfRangeException) + { + value = null; + return false; + } + catch (IndexOutOfRangeException) + { + value = null; + return false; + } + } + + public void Clear() + { + _list.Clear(); + _freeMin = 0; + } + + public IEnumerator GetEnumerator() + { + for (int i = 0; i < _list.Count; i++) + { + if (_list[i] != null) + { + yield return _list[i]; + } + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/ImageArray.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/ImageArray.cs new file mode 100644 index 000000000..c110c6eb9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/ImageArray.cs @@ -0,0 +1,207 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class ImageArray : ResourceArray, IImageArray + { + private readonly VulkanRenderer _gd; + + private record struct TextureRef + { + public TextureStorage Storage; + public TextureView View; + } + + private readonly TextureRef[] _textureRefs; + private readonly TextureBuffer[] _bufferTextureRefs; + + private readonly DescriptorImageInfo[] _textures; + private readonly BufferView[] _bufferTextures; + + private HashSet _storages; + + private int _cachedCommandBufferIndex; + private int _cachedSubmissionCount; + + private readonly bool _isBuffer; + + public ImageArray(VulkanRenderer gd, int size, bool isBuffer) + { + _gd = gd; + + if (isBuffer) + { + _bufferTextureRefs = new TextureBuffer[size]; + _bufferTextures = new BufferView[size]; + } + else + { + _textureRefs = new TextureRef[size]; + _textures = new DescriptorImageInfo[size]; + } + + _storages = null; + + _cachedCommandBufferIndex = -1; + _cachedSubmissionCount = 0; + + _isBuffer = isBuffer; + } + + public void SetImages(int index, ITexture[] images) + { + for (int i = 0; i < images.Length; i++) + { + ITexture image = images[i]; + + if (image is TextureBuffer textureBuffer) + { + _bufferTextureRefs[index + i] = textureBuffer; + } + else if (image is TextureView view) + { + _textureRefs[index + i].Storage = view.Storage; + _textureRefs[index + i].View = view; + } + else if (!_isBuffer) + { + _textureRefs[index + i].Storage = null; + _textureRefs[index + i].View = default; + } + else + { + _bufferTextureRefs[index + i] = null; + } + } + + SetDirty(); + } + + private void SetDirty() + { + _cachedCommandBufferIndex = -1; + _storages = null; + SetDirty(_gd, isImage: true); + } + + public void QueueWriteToReadBarriers(CommandBufferScoped cbs, PipelineStageFlags stageFlags) + { + HashSet storages = _storages; + + if (storages == null) + { + storages = new HashSet(); + + for (int index = 0; index < _textureRefs.Length; index++) + { + if (_textureRefs[index].Storage != null) + { + storages.Add(_textureRefs[index].Storage); + } + } + + _storages = storages; + } + + foreach (TextureStorage storage in storages) + { + storage.QueueWriteToReadBarrier(cbs, AccessFlags.ShaderReadBit, stageFlags); + } + } + + public ReadOnlySpan GetImageInfos(VulkanRenderer gd, CommandBufferScoped cbs, TextureView dummyTexture) + { + int submissionCount = gd.CommandBufferPool.GetSubmissionCount(cbs.CommandBufferIndex); + + Span textures = _textures; + + if (cbs.CommandBufferIndex == _cachedCommandBufferIndex && submissionCount == _cachedSubmissionCount) + { + return textures; + } + + _cachedCommandBufferIndex = cbs.CommandBufferIndex; + _cachedSubmissionCount = submissionCount; + + for (int i = 0; i < textures.Length; i++) + { + ref var texture = ref textures[i]; + ref var refs = ref _textureRefs[i]; + + if (i > 0 && _textureRefs[i - 1].View == refs.View) + { + texture = textures[i - 1]; + + continue; + } + + texture.ImageLayout = ImageLayout.General; + texture.ImageView = refs.View?.GetIdentityImageView().Get(cbs).Value ?? default; + + if (texture.ImageView.Handle == 0) + { + texture.ImageView = dummyTexture.GetImageView().Get(cbs).Value; + } + } + + return textures; + } + + public ReadOnlySpan GetBufferViews(CommandBufferScoped cbs) + { + Span bufferTextures = _bufferTextures; + + for (int i = 0; i < bufferTextures.Length; i++) + { + bufferTextures[i] = _bufferTextureRefs[i]?.GetBufferView(cbs, true) ?? default; + } + + return bufferTextures; + } + + public DescriptorSet[] GetDescriptorSets( + Device device, + CommandBufferScoped cbs, + DescriptorSetTemplateUpdater templateUpdater, + ShaderCollection program, + int setIndex, + TextureView dummyTexture) + { + if (TryGetCachedDescriptorSets(cbs, program, setIndex, out DescriptorSet[] sets)) + { + // We still need to ensure the current command buffer holds a reference to all used textures. + + if (!_isBuffer) + { + GetImageInfos(_gd, cbs, dummyTexture); + } + else + { + GetBufferViews(cbs); + } + + return sets; + } + + DescriptorSetTemplate template = program.Templates[setIndex]; + + DescriptorSetTemplateWriter tu = templateUpdater.Begin(template); + + if (!_isBuffer) + { + tu.Push(GetImageInfos(_gd, cbs, dummyTexture)); + } + else + { + tu.Push(GetBufferViews(cbs)); + } + + templateUpdater.Commit(_gd, device, sets[0]); + + return sets; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferPattern.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferPattern.cs new file mode 100644 index 000000000..45b3aa081 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferPattern.cs @@ -0,0 +1,139 @@ +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class IndexBufferPattern : IDisposable + { + public int PrimitiveVertices { get; } + public int PrimitiveVerticesOut { get; } + public int BaseIndex { get; } + public int[] OffsetIndex { get; } + public int IndexStride { get; } + public bool RepeatStart { get; } + + private readonly VulkanRenderer _gd; + private int _currentSize; + private BufferHandle _repeatingBuffer; + + public IndexBufferPattern(VulkanRenderer gd, + int primitiveVertices, + int primitiveVerticesOut, + int baseIndex, + int[] offsetIndex, + int indexStride, + bool repeatStart) + { + PrimitiveVertices = primitiveVertices; + PrimitiveVerticesOut = primitiveVerticesOut; + BaseIndex = baseIndex; + OffsetIndex = offsetIndex; + IndexStride = indexStride; + RepeatStart = repeatStart; + + _gd = gd; + } + + public int GetPrimitiveCount(int vertexCount) + { + return Math.Max(0, (vertexCount - BaseIndex) / IndexStride); + } + + public int GetConvertedCount(int indexCount) + { + int primitiveCount = GetPrimitiveCount(indexCount); + return primitiveCount * OffsetIndex.Length; + } + + public IEnumerable GetIndexMapping(int indexCount) + { + int primitiveCount = GetPrimitiveCount(indexCount); + int index = BaseIndex; + + for (int i = 0; i < primitiveCount; i++) + { + if (RepeatStart) + { + // Used for triangle fan + yield return 0; + } + + for (int j = RepeatStart ? 1 : 0; j < OffsetIndex.Length; j++) + { + yield return index + OffsetIndex[j]; + } + + index += IndexStride; + } + } + + public BufferHandle GetRepeatingBuffer(int vertexCount, out int indexCount) + { + int primitiveCount = GetPrimitiveCount(vertexCount); + indexCount = primitiveCount * PrimitiveVerticesOut; + + int expectedSize = primitiveCount * OffsetIndex.Length; + + if (expectedSize <= _currentSize && _repeatingBuffer != BufferHandle.Null) + { + return _repeatingBuffer; + } + + // Expand the repeating pattern to the number of requested primitives. + BufferHandle newBuffer = _gd.BufferManager.CreateWithHandle(_gd, expectedSize * sizeof(int)); + + // Copy the old data to the new one. + if (_repeatingBuffer != BufferHandle.Null) + { + _gd.Pipeline.CopyBuffer(_repeatingBuffer, newBuffer, 0, 0, _currentSize * sizeof(int)); + _gd.DeleteBuffer(_repeatingBuffer); + } + + _repeatingBuffer = newBuffer; + + // Add the additional repeats on top. + int newPrimitives = primitiveCount; + int oldPrimitives = (_currentSize) / OffsetIndex.Length; + + int[] newData; + + newPrimitives -= oldPrimitives; + newData = new int[expectedSize - _currentSize]; + + int outOffset = 0; + int index = oldPrimitives * IndexStride + BaseIndex; + + for (int i = 0; i < newPrimitives; i++) + { + if (RepeatStart) + { + // Used for triangle fan + newData[outOffset++] = 0; + } + + for (int j = RepeatStart ? 1 : 0; j < OffsetIndex.Length; j++) + { + newData[outOffset++] = index + OffsetIndex[j]; + } + + index += IndexStride; + } + + _gd.SetBufferData(newBuffer, _currentSize * sizeof(int), MemoryMarshal.Cast(newData)); + _currentSize = expectedSize; + + return newBuffer; + } + + public void Dispose() + { + if (_repeatingBuffer != BufferHandle.Null) + { + _gd.DeleteBuffer(_repeatingBuffer); + _repeatingBuffer = BufferHandle.Null; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferState.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferState.cs new file mode 100644 index 000000000..ac0b35c98 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/IndexBufferState.cs @@ -0,0 +1,171 @@ +using Ryujinx.Graphics.GAL; +using IndexType = Silk.NET.Vulkan.IndexType; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal struct IndexBufferState + { + private const int IndexBufferMaxMirrorable = 0x20000; + + public static IndexBufferState Null => new(BufferHandle.Null, 0, 0); + + private readonly int _offset; + private readonly int _size; + private readonly IndexType _type; + + private readonly BufferHandle _handle; + private Auto _buffer; + + public IndexBufferState(BufferHandle handle, int offset, int size, IndexType type) + { + _handle = handle; + _offset = offset; + _size = size; + _type = type; + _buffer = null; + } + + public IndexBufferState(BufferHandle handle, int offset, int size) + { + _handle = handle; + _offset = offset; + _size = size; + _type = IndexType.Uint16; + _buffer = null; + } + + public void BindIndexBuffer(VulkanRenderer gd, CommandBufferScoped cbs) + { + Auto autoBuffer; + int offset, size; + IndexType type = _type; + bool mirrorable = false; + + if (_type == IndexType.Uint8Ext && !gd.Capabilities.SupportsIndexTypeUint8) + { + // Index type is not supported. Convert to I16. + autoBuffer = gd.BufferManager.GetBufferI8ToI16(cbs, _handle, _offset, _size); + + type = IndexType.Uint16; + offset = 0; + size = _size * 2; + } + else + { + autoBuffer = gd.BufferManager.GetBuffer(cbs.CommandBuffer, _handle, false, out int bufferSize); + + if (_offset >= bufferSize) + { + autoBuffer = null; + } + + mirrorable = _size < IndexBufferMaxMirrorable; + + offset = _offset; + size = _size; + } + + _buffer = autoBuffer; + + if (autoBuffer != null) + { + DisposableBuffer buffer = mirrorable ? autoBuffer.GetMirrorable(cbs, ref offset, size, out _) : autoBuffer.Get(cbs, offset, size); + + gd.Api.CmdBindIndexBuffer(cbs.CommandBuffer, buffer.Value, (ulong)offset, type); + } + } + + public void BindConvertedIndexBuffer( + VulkanRenderer gd, + CommandBufferScoped cbs, + int firstIndex, + int indexCount, + int convertedCount, + IndexBufferPattern pattern) + { + Auto autoBuffer; + + // Convert the index buffer using the given pattern. + int indexSize = GetIndexSize(); + + int firstIndexOffset = firstIndex * indexSize; + + autoBuffer = gd.BufferManager.GetBufferTopologyConversion(cbs, _handle, _offset + firstIndexOffset, indexCount * indexSize, pattern, indexSize); + + int size = convertedCount * 4; + + _buffer = autoBuffer; + + if (autoBuffer != null) + { + gd.Api.CmdBindIndexBuffer(cbs.CommandBuffer, autoBuffer.Get(cbs, 0, size).Value, 0, IndexType.Uint32); + } + } + + public Auto BindConvertedIndexBufferIndirect( + VulkanRenderer gd, + CommandBufferScoped cbs, + BufferRange indirectBuffer, + BufferRange drawCountBuffer, + IndexBufferPattern pattern, + bool hasDrawCount, + int maxDrawCount, + int indirectDataStride) + { + // Convert the index buffer using the given pattern. + int indexSize = GetIndexSize(); + + (var indexBufferAuto, var indirectBufferAuto) = gd.BufferManager.GetBufferTopologyConversionIndirect( + gd, + cbs, + new BufferRange(_handle, _offset, _size), + indirectBuffer, + drawCountBuffer, + pattern, + indexSize, + hasDrawCount, + maxDrawCount, + indirectDataStride); + + int convertedCount = pattern.GetConvertedCount(_size / indexSize); + int size = convertedCount * 4; + + _buffer = indexBufferAuto; + + if (indexBufferAuto != null) + { + gd.Api.CmdBindIndexBuffer(cbs.CommandBuffer, indexBufferAuto.Get(cbs, 0, size).Value, 0, IndexType.Uint32); + } + + return indirectBufferAuto; + } + + private readonly int GetIndexSize() + { + return _type switch + { + IndexType.Uint32 => 4, + IndexType.Uint16 => 2, + _ => 1, + }; + } + + public readonly bool BoundEquals(Auto buffer) + { + return _buffer == buffer; + } + + public void Swap(Auto from, Auto to) + { + if (_buffer == from) + { + _buffer = to; + } + } + + public readonly bool Overlaps(Auto buffer, int offset, int size) + { + return buffer == _buffer && offset < _offset + _size && offset + size > _offset; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocation.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocation.cs new file mode 100644 index 000000000..3a26af419 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocation.cs @@ -0,0 +1,59 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct MemoryAllocation : IDisposable + { + private readonly MemoryAllocatorBlockList _owner; + private readonly MemoryAllocatorBlockList.Block _block; + private readonly HostMemoryAllocator _hostMemory; + + public DeviceMemory Memory { get; } + public nint HostPointer { get; } + public ulong Offset { get; } + public ulong Size { get; } + + public MemoryAllocation( + MemoryAllocatorBlockList owner, + MemoryAllocatorBlockList.Block block, + DeviceMemory memory, + nint hostPointer, + ulong offset, + ulong size) + { + _owner = owner; + _block = block; + Memory = memory; + HostPointer = hostPointer; + Offset = offset; + Size = size; + } + + public MemoryAllocation( + HostMemoryAllocator hostMemory, + DeviceMemory memory, + nint hostPointer, + ulong offset, + ulong size) + { + _hostMemory = hostMemory; + Memory = memory; + HostPointer = hostPointer; + Offset = offset; + Size = size; + } + + public void Dispose() + { + if (_hostMemory != null) + { + _hostMemory.Free(Memory, Offset, Size); + } + else + { + _owner.Free(_block, Offset, Size); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocator.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocator.cs new file mode 100644 index 000000000..80404f24f --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocator.cs @@ -0,0 +1,118 @@ +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class MemoryAllocator : IDisposable + { + private const ulong MaxDeviceMemoryUsageEstimate = 16UL * 1024 * 1024 * 1024; + + private readonly Vk _api; + private readonly VulkanPhysicalDevice _physicalDevice; + private readonly Device _device; + private readonly List _blockLists; + private readonly int _blockAlignment; + private readonly ReaderWriterLockSlim _lock; + + public MemoryAllocator(Vk api, VulkanPhysicalDevice physicalDevice, Device device) + { + _api = api; + _physicalDevice = physicalDevice; + _device = device; + _blockLists = new List(); + _blockAlignment = (int)Math.Min(int.MaxValue, MaxDeviceMemoryUsageEstimate / _physicalDevice.PhysicalDeviceProperties.Limits.MaxMemoryAllocationCount); + _lock = new(LockRecursionPolicy.NoRecursion); + } + + public MemoryAllocation AllocateDeviceMemory( + MemoryRequirements requirements, + MemoryPropertyFlags flags = 0, + bool isBuffer = false) + { + int memoryTypeIndex = FindSuitableMemoryTypeIndex(requirements.MemoryTypeBits, flags); + if (memoryTypeIndex < 0) + { + return default; + } + + bool map = flags.HasFlag(MemoryPropertyFlags.HostVisibleBit); + return Allocate(memoryTypeIndex, requirements.Size, requirements.Alignment, map, isBuffer); + } + + private MemoryAllocation Allocate(int memoryTypeIndex, ulong size, ulong alignment, bool map, bool isBuffer) + { + _lock.EnterReadLock(); + + try + { + for (int i = 0; i < _blockLists.Count; i++) + { + var bl = _blockLists[i]; + if (bl.MemoryTypeIndex == memoryTypeIndex && bl.ForBuffer == isBuffer) + { + return bl.Allocate(size, alignment, map); + } + } + } + finally + { + _lock.ExitReadLock(); + } + + _lock.EnterWriteLock(); + + try + { + var newBl = new MemoryAllocatorBlockList(_api, _device, memoryTypeIndex, _blockAlignment, isBuffer); + _blockLists.Add(newBl); + + return newBl.Allocate(size, alignment, map); + } + finally + { + _lock.ExitWriteLock(); + } + } + + internal int FindSuitableMemoryTypeIndex(uint memoryTypeBits, MemoryPropertyFlags flags) + { + for (int i = 0; i < _physicalDevice.PhysicalDeviceMemoryProperties.MemoryTypeCount; i++) + { + var type = _physicalDevice.PhysicalDeviceMemoryProperties.MemoryTypes[i]; + + if ((memoryTypeBits & (1 << i)) != 0) + { + if (type.PropertyFlags.HasFlag(flags)) + { + return i; + } + } + } + + return -1; + } + + public static bool IsDeviceMemoryShared(VulkanPhysicalDevice physicalDevice) + { + for (int i = 0; i < physicalDevice.PhysicalDeviceMemoryProperties.MemoryHeapCount; i++) + { + if (!physicalDevice.PhysicalDeviceMemoryProperties.MemoryHeaps[i].Flags.HasFlag(MemoryHeapFlags.DeviceLocalBit)) + { + return false; + } + } + + return true; + } + + public void Dispose() + { + for (int i = 0; i < _blockLists.Count; i++) + { + _blockLists[i].Dispose(); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocatorBlockList.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocatorBlockList.cs new file mode 100644 index 000000000..b835b6222 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/MemoryAllocatorBlockList.cs @@ -0,0 +1,310 @@ +using Ryujinx.Common; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class MemoryAllocatorBlockList : IDisposable + { + private const ulong InvalidOffset = ulong.MaxValue; + + public class Block : IComparable + { + public DeviceMemory Memory { get; private set; } + public nint HostPointer { get; private set; } + public ulong Size { get; } + public bool Mapped => HostPointer != nint.Zero; + + private readonly struct Range : IComparable + { + public ulong Offset { get; } + public ulong Size { get; } + + public Range(ulong offset, ulong size) + { + Offset = offset; + Size = size; + } + + public int CompareTo(Range other) + { + return Offset.CompareTo(other.Offset); + } + } + + private readonly List _freeRanges; + + public Block(DeviceMemory memory, nint hostPointer, ulong size) + { + Memory = memory; + HostPointer = hostPointer; + Size = size; + _freeRanges = new List + { + new Range(0, size), + }; + } + + public ulong Allocate(ulong size, ulong alignment) + { + for (int i = 0; i < _freeRanges.Count; i++) + { + var range = _freeRanges[i]; + + ulong alignedOffset = BitUtils.AlignUp(range.Offset, alignment); + ulong sizeDelta = alignedOffset - range.Offset; + ulong usableSize = range.Size - sizeDelta; + + if (sizeDelta < range.Size && usableSize >= size) + { + _freeRanges.RemoveAt(i); + + if (sizeDelta != 0) + { + InsertFreeRange(range.Offset, sizeDelta); + } + + ulong endOffset = range.Offset + range.Size; + ulong remainingSize = endOffset - (alignedOffset + size); + if (remainingSize != 0) + { + InsertFreeRange(endOffset - remainingSize, remainingSize); + } + + return alignedOffset; + } + } + + return InvalidOffset; + } + + public void Free(ulong offset, ulong size) + { + InsertFreeRangeComingled(offset, size); + } + + private void InsertFreeRange(ulong offset, ulong size) + { + var range = new Range(offset, size); + int index = _freeRanges.BinarySearch(range); + if (index < 0) + { + index = ~index; + } + + _freeRanges.Insert(index, range); + } + + private void InsertFreeRangeComingled(ulong offset, ulong size) + { + ulong endOffset = offset + size; + var range = new Range(offset, size); + int index = _freeRanges.BinarySearch(range); + if (index < 0) + { + index = ~index; + } + + if (index < _freeRanges.Count && _freeRanges[index].Offset == endOffset) + { + endOffset = _freeRanges[index].Offset + _freeRanges[index].Size; + _freeRanges.RemoveAt(index); + } + + if (index > 0 && _freeRanges[index - 1].Offset + _freeRanges[index - 1].Size == offset) + { + offset = _freeRanges[index - 1].Offset; + _freeRanges.RemoveAt(--index); + } + + range = new Range(offset, endOffset - offset); + + _freeRanges.Insert(index, range); + } + + public bool IsTotallyFree() + { + if (_freeRanges.Count == 1 && _freeRanges[0].Size == Size) + { + Debug.Assert(_freeRanges[0].Offset == 0); + return true; + } + + return false; + } + + public int CompareTo(Block other) + { + return Size.CompareTo(other.Size); + } + + public unsafe void Destroy(Vk api, Device device) + { + if (Mapped) + { + api.UnmapMemory(device, Memory); + HostPointer = nint.Zero; + } + + if (Memory.Handle != 0) + { + api.FreeMemory(device, Memory, null); + Memory = default; + } + } + } + + private readonly List _blocks; + + private readonly Vk _api; + private readonly Device _device; + + public int MemoryTypeIndex { get; } + public bool ForBuffer { get; } + + private readonly int _blockAlignment; + + private readonly ReaderWriterLockSlim _lock; + + public MemoryAllocatorBlockList(Vk api, Device device, int memoryTypeIndex, int blockAlignment, bool forBuffer) + { + _blocks = new List(); + _api = api; + _device = device; + MemoryTypeIndex = memoryTypeIndex; + ForBuffer = forBuffer; + _blockAlignment = blockAlignment; + _lock = new(LockRecursionPolicy.NoRecursion); + } + + public unsafe MemoryAllocation Allocate(ulong size, ulong alignment, bool map) + { + // Ensure we have a sane alignment value. + if ((ulong)(int)alignment != alignment || (int)alignment <= 0) + { + throw new ArgumentOutOfRangeException(nameof(alignment), $"Invalid alignment 0x{alignment:X}."); + } + + _lock.EnterReadLock(); + + try + { + for (int i = 0; i < _blocks.Count; i++) + { + var block = _blocks[i]; + + if (block.Mapped == map && block.Size >= size) + { + ulong offset = block.Allocate(size, alignment); + if (offset != InvalidOffset) + { + return new MemoryAllocation(this, block, block.Memory, GetHostPointer(block, offset), offset, size); + } + } + } + } + finally + { + _lock.ExitReadLock(); + } + + ulong blockAlignedSize = BitUtils.AlignUp(size, (ulong)_blockAlignment); + + var memoryAllocateInfo = new MemoryAllocateInfo + { + SType = StructureType.MemoryAllocateInfo, + AllocationSize = blockAlignedSize, + MemoryTypeIndex = (uint)MemoryTypeIndex, + }; + + _api.AllocateMemory(_device, in memoryAllocateInfo, null, out var deviceMemory).ThrowOnError(); + + nint hostPointer = nint.Zero; + + if (map) + { + void* pointer = null; + _api.MapMemory(_device, deviceMemory, 0, blockAlignedSize, 0, ref pointer).ThrowOnError(); + hostPointer = (nint)pointer; + } + + var newBlock = new Block(deviceMemory, hostPointer, blockAlignedSize); + + InsertBlock(newBlock); + + ulong newBlockOffset = newBlock.Allocate(size, alignment); + Debug.Assert(newBlockOffset != InvalidOffset); + + return new MemoryAllocation(this, newBlock, deviceMemory, GetHostPointer(newBlock, newBlockOffset), newBlockOffset, size); + } + + private static nint GetHostPointer(Block block, ulong offset) + { + if (block.HostPointer == nint.Zero) + { + return nint.Zero; + } + + return (nint)((nuint)block.HostPointer + offset); + } + + public void Free(Block block, ulong offset, ulong size) + { + block.Free(offset, size); + + if (block.IsTotallyFree()) + { + _lock.EnterWriteLock(); + + try + { + for (int i = 0; i < _blocks.Count; i++) + { + if (_blocks[i] == block) + { + _blocks.RemoveAt(i); + break; + } + } + } + finally + { + _lock.ExitWriteLock(); + } + + block.Destroy(_api, _device); + } + } + + private void InsertBlock(Block block) + { + _lock.EnterWriteLock(); + + try + { + int index = _blocks.BinarySearch(block); + if (index < 0) + { + index = ~index; + } + + _blocks.Insert(index, block); + } + finally + { + _lock.ExitWriteLock(); + } + } + + public void Dispose() + { + for (int i = 0; i < _blocks.Count; i++) + { + _blocks[i].Destroy(_api, _device); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/MultiFenceHolder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/MultiFenceHolder.cs new file mode 100644 index 000000000..2ea4f43b0 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/MultiFenceHolder.cs @@ -0,0 +1,267 @@ +using Ryujinx.Common.Memory; +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + ///

+ /// Holder for multiple host GPU fences. + /// + class MultiFenceHolder + { + private const int BufferUsageTrackingGranularity = 4096; + + private readonly FenceHolder[] _fences; + private readonly BufferUsageBitmap _bufferUsageBitmap; + + /// + /// Creates a new instance of the multiple fence holder. + /// + public MultiFenceHolder() + { + _fences = new FenceHolder[CommandBufferPool.MaxCommandBuffers]; + } + + /// + /// Creates a new instance of the multiple fence holder, with a given buffer size in mind. + /// + /// Size of the buffer + public MultiFenceHolder(int size) + { + _fences = new FenceHolder[CommandBufferPool.MaxCommandBuffers]; + _bufferUsageBitmap = new BufferUsageBitmap(size, BufferUsageTrackingGranularity); + } + + /// + /// Adds read/write buffer usage information to the uses list. + /// + /// Index of the command buffer where the buffer is used + /// Offset of the buffer being used + /// Size of the buffer region being used, in bytes + /// Whether the access is a write or not + public void AddBufferUse(int cbIndex, int offset, int size, bool write) + { + _bufferUsageBitmap.Add(cbIndex, offset, size, false); + + if (write) + { + _bufferUsageBitmap.Add(cbIndex, offset, size, true); + } + } + + /// + /// Removes all buffer usage information for a given command buffer. + /// + /// Index of the command buffer where the buffer is used + public void RemoveBufferUses(int cbIndex) + { + _bufferUsageBitmap?.Clear(cbIndex); + } + + /// + /// Checks if a given range of a buffer is being used by a command buffer still being processed by the GPU. + /// + /// Index of the command buffer where the buffer is used + /// Offset of the buffer being used + /// Size of the buffer region being used, in bytes + /// True if in use, false otherwise + public bool IsBufferRangeInUse(int cbIndex, int offset, int size) + { + return _bufferUsageBitmap.OverlapsWith(cbIndex, offset, size); + } + + /// + /// Checks if a given range of a buffer is being used by any command buffer still being processed by the GPU. + /// + /// Offset of the buffer being used + /// Size of the buffer region being used, in bytes + /// True if only write usages should count + /// True if in use, false otherwise + public bool IsBufferRangeInUse(int offset, int size, bool write) + { + return _bufferUsageBitmap.OverlapsWith(offset, size, write); + } + + /// + /// Adds a fence to the holder. + /// + /// Command buffer index of the command buffer that owns the fence + /// Fence to be added + /// True if the command buffer's previous fence value was null + public bool AddFence(int cbIndex, FenceHolder fence) + { + ref FenceHolder fenceRef = ref _fences[cbIndex]; + + if (fenceRef == null) + { + fenceRef = fence; + return true; + } + + return false; + } + + /// + /// Removes a fence from the holder. + /// + /// Command buffer index of the command buffer that owns the fence + public void RemoveFence(int cbIndex) + { + _fences[cbIndex] = null; + } + + /// + /// Determines if a fence referenced on the given command buffer. + /// + /// Index of the command buffer to check if it's used + /// True if referenced, false otherwise + public bool HasFence(int cbIndex) + { + return _fences[cbIndex] != null; + } + + /// + /// Wait until all the fences on the holder are signaled. + /// + /// Vulkan API instance + /// GPU device that the fences belongs to + public void WaitForFences(Vk api, Device device) + { + WaitForFencesImpl(api, device, 0, 0, false, 0UL); + } + + /// + /// Wait until all the fences on the holder with buffer uses overlapping the specified range are signaled. + /// + /// Vulkan API instance + /// GPU device that the fences belongs to + /// Start offset of the buffer range + /// Size of the buffer range in bytes + public void WaitForFences(Vk api, Device device, int offset, int size) + { + WaitForFencesImpl(api, device, offset, size, false, 0UL); + } + + /// + /// Wait until all the fences on the holder are signaled, or the timeout expires. + /// + /// Vulkan API instance + /// GPU device that the fences belongs to + /// Timeout in nanoseconds + /// True if all fences were signaled, false otherwise + public bool WaitForFences(Vk api, Device device, ulong timeout) + { + return WaitForFencesImpl(api, device, 0, 0, true, timeout); + } + + /// + /// Wait until all the fences on the holder with buffer uses overlapping the specified range are signaled. + /// + /// Vulkan API instance + /// GPU device that the fences belongs to + /// Start offset of the buffer range + /// Size of the buffer range in bytes + /// Indicates if should be used + /// Timeout in nanoseconds + /// True if all fences were signaled before the timeout expired, false otherwise + private bool WaitForFencesImpl(Vk api, Device device, int offset, int size, bool hasTimeout, ulong timeout) + { + using SpanOwner fenceHoldersOwner = SpanOwner.Rent(CommandBufferPool.MaxCommandBuffers); + Span fenceHolders = fenceHoldersOwner.Span; + + int count = size != 0 ? GetOverlappingFences(fenceHolders, offset, size) : GetFences(fenceHolders); + Span fences = stackalloc Fence[count]; + + int fenceCount = 0; + + for (int i = 0; i < fences.Length; i++) + { + if (fenceHolders[i].TryGet(out Fence fence)) + { + fences[fenceCount] = fence; + + if (fenceCount < i) + { + fenceHolders[fenceCount] = fenceHolders[i]; + } + + fenceCount++; + } + } + + if (fenceCount == 0) + { + return true; + } + + bool signaled = true; + + try + { + if (hasTimeout) + { + signaled = FenceHelper.AllSignaled(api, device, fences[..fenceCount], timeout); + } + else + { + FenceHelper.WaitAllIndefinitely(api, device, fences[..fenceCount]); + } + } + finally + { + for (int i = 0; i < fenceCount; i++) + { + fenceHolders[i].PutLock(); + } + } + + return signaled; + } + + /// + /// Gets fences to wait for. + /// + /// Span to store fences in + /// Number of fences placed in storage + private int GetFences(Span storage) + { + int count = 0; + + for (int i = 0; i < _fences.Length; i++) + { + var fence = _fences[i]; + + if (fence != null) + { + storage[count++] = fence; + } + } + + return count; + } + + /// + /// Gets fences to wait for use of a given buffer region. + /// + /// Span to store overlapping fences in + /// Offset of the range + /// Size of the range in bytes + /// Number of fences for the specified region placed in storage + private int GetOverlappingFences(Span storage, int offset, int size) + { + int count = 0; + + for (int i = 0; i < _fences.Length; i++) + { + var fence = _fences[i]; + + if (fence != null && _bufferUsageBitmap.OverlapsWith(i, offset, size)) + { + storage[count++] = fence; + } + } + + return count; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/NativeArray.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/NativeArray.cs new file mode 100644 index 000000000..241fdce03 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/NativeArray.cs @@ -0,0 +1,48 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + unsafe class NativeArray : IDisposable where T : unmanaged + { + public T* Pointer { get; private set; } + public int Length { get; } + + public ref T this[int index] + { + get => ref Pointer[Checked(index)]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int Checked(int index) + { + if ((uint)index >= (uint)Length) + { + throw new IndexOutOfRangeException(); + } + + return index; + } + + public NativeArray(int length) + { + Pointer = (T*)Marshal.AllocHGlobal(checked(length * Unsafe.SizeOf())); + Length = length; + } + + public Span AsSpan() + { + return new Span(Pointer, Length); + } + + public void Dispose() + { + if (Pointer != null) + { + Marshal.FreeHGlobal((nint)Pointer); + Pointer = null; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PersistentFlushBuffer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PersistentFlushBuffer.cs new file mode 100644 index 000000000..f9e2e1600 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PersistentFlushBuffer.cs @@ -0,0 +1,97 @@ +using Ryujinx.Graphics.GAL; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class PersistentFlushBuffer : IDisposable + { + private readonly VulkanRenderer _gd; + + private BufferHolder _flushStorage; + + public PersistentFlushBuffer(VulkanRenderer gd) + { + _gd = gd; + } + + private BufferHolder ResizeIfNeeded(int size) + { + var flushStorage = _flushStorage; + + if (flushStorage == null || size > _flushStorage.Size) + { + flushStorage?.Dispose(); + + flushStorage = _gd.BufferManager.Create(_gd, size); + _flushStorage = flushStorage; + } + + return flushStorage; + } + + public Span GetBufferData(CommandBufferPool cbp, BufferHolder buffer, int offset, int size) + { + var flushStorage = ResizeIfNeeded(size); + Auto srcBuffer; + + using (var cbs = cbp.Rent()) + { + srcBuffer = buffer.GetBuffer(cbs.CommandBuffer); + var dstBuffer = flushStorage.GetBuffer(cbs.CommandBuffer); + + if (srcBuffer.TryIncrementReferenceCount()) + { + BufferHolder.Copy(_gd, cbs, srcBuffer, dstBuffer, offset, 0, size, registerSrcUsage: false); + } + else + { + // Source buffer is no longer alive, don't copy anything to flush storage. + srcBuffer = null; + } + } + + flushStorage.WaitForFences(); + srcBuffer?.DecrementReferenceCount(); + return flushStorage.GetDataStorage(0, size); + } + + public Span GetTextureData(CommandBufferPool cbp, TextureView view, int size) + { + TextureCreateInfo info = view.Info; + + var flushStorage = ResizeIfNeeded(size); + + using (var cbs = cbp.Rent()) + { + var buffer = flushStorage.GetBuffer(cbs.CommandBuffer).Get(cbs).Value; + var image = view.GetImage().Get(cbs).Value; + + view.CopyFromOrToBuffer(cbs.CommandBuffer, buffer, image, size, true, 0, 0, info.GetLayers(), info.Levels, singleSlice: false); + } + + flushStorage.WaitForFences(); + return flushStorage.GetDataStorage(0, size); + } + + public Span GetTextureData(CommandBufferPool cbp, TextureView view, int size, int layer, int level) + { + var flushStorage = ResizeIfNeeded(size); + + using (var cbs = cbp.Rent()) + { + var buffer = flushStorage.GetBuffer(cbs.CommandBuffer).Get(cbs).Value; + var image = view.GetImage().Get(cbs).Value; + + view.CopyFromOrToBuffer(cbs.CommandBuffer, buffer, image, size, true, layer, level, 1, 1, singleSlice: true); + } + + flushStorage.WaitForFences(); + return flushStorage.GetDataStorage(0, size); + } + + public void Dispose() + { + _flushStorage.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineBase.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineBase.cs new file mode 100644 index 000000000..9aff2aeac --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineBase.cs @@ -0,0 +1,1810 @@ +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using CompareOp = Ryujinx.Graphics.GAL.CompareOp; +using Format = Ryujinx.Graphics.GAL.Format; +using FrontFace = Ryujinx.Graphics.GAL.FrontFace; +using IndexType = Ryujinx.Graphics.GAL.IndexType; +using PolygonMode = Ryujinx.Graphics.GAL.PolygonMode; +using PrimitiveTopology = Ryujinx.Graphics.GAL.PrimitiveTopology; +using Viewport = Ryujinx.Graphics.GAL.Viewport; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class PipelineBase : IDisposable + { + public const int DescriptorSetLayouts = 4; + + public const int UniformSetIndex = 0; + public const int StorageSetIndex = 1; + public const int TextureSetIndex = 2; + public const int ImageSetIndex = 3; + + protected readonly VulkanRenderer Gd; + protected readonly Device Device; + public readonly PipelineCache PipelineCache; + + public readonly AutoFlushCounter AutoFlush; + public readonly Action EndRenderPassDelegate; + + protected PipelineDynamicState DynamicState; + protected bool IsMainPipeline; + private PipelineState _newState; + private bool _graphicsStateDirty; + private bool _computeStateDirty; + private bool _bindingBarriersDirty; + private PrimitiveTopology _topology; + + private ulong _currentPipelineHandle; + + protected Auto Pipeline; + + protected PipelineBindPoint Pbp; + + protected CommandBufferScoped Cbs; + protected CommandBufferScoped? PreloadCbs; + protected CommandBuffer CommandBuffer; + + public CommandBufferScoped CurrentCommandBuffer => Cbs; + + private ShaderCollection _program; + + protected FramebufferParams FramebufferParams; + private Auto _framebuffer; + private RenderPassHolder _rpHolder; + private Auto _renderPass; + private RenderPassHolder _nullRenderPass; + private int _writtenAttachmentCount; + + private bool _framebufferUsingColorWriteMask; + + private ITexture[] _preMaskColors; + private ITexture _preMaskDepthStencil; + + private readonly DescriptorSetUpdater _descriptorSetUpdater; + + private IndexBufferState _indexBuffer; + private IndexBufferPattern _indexBufferPattern; + private readonly BufferState[] _transformFeedbackBuffers; + private readonly VertexBufferState[] _vertexBuffers; + private ulong _vertexBuffersDirty; + protected Rectangle ClearScissor; + + private readonly VertexBufferUpdater _vertexBufferUpdater; + + public IndexBufferPattern QuadsToTrisPattern; + public IndexBufferPattern TriFanToTrisPattern; + + private bool _needsIndexBufferRebind; + private bool _needsTransformFeedbackBuffersRebind; + + private bool _tfEnabled; + private bool _tfActive; + + private FeedbackLoopAspects _feedbackLoop; + private bool _passWritesDepthStencil; + + private readonly PipelineColorBlendAttachmentState[] _storedBlend; + public ulong DrawCount { get; private set; } + public bool RenderPassActive { get; private set; } + + public unsafe PipelineBase(VulkanRenderer gd, Device device) + { + Gd = gd; + Device = device; + + AutoFlush = new AutoFlushCounter(gd); + EndRenderPassDelegate = EndRenderPass; + + var pipelineCacheCreateInfo = new PipelineCacheCreateInfo + { + SType = StructureType.PipelineCacheCreateInfo, + }; + + gd.Api.CreatePipelineCache(device, in pipelineCacheCreateInfo, null, out PipelineCache).ThrowOnError(); + + _descriptorSetUpdater = new DescriptorSetUpdater(gd, device); + _vertexBufferUpdater = new VertexBufferUpdater(gd); + + _transformFeedbackBuffers = new BufferState[Constants.MaxTransformFeedbackBuffers]; + _vertexBuffers = new VertexBufferState[Constants.MaxVertexBuffers + 1]; + + const int EmptyVbSize = 16; + + using var emptyVb = gd.BufferManager.Create(gd, EmptyVbSize); + emptyVb.SetData(0, new byte[EmptyVbSize]); + _vertexBuffers[0] = new VertexBufferState(emptyVb.GetBuffer(), 0, 0, EmptyVbSize); + _vertexBuffersDirty = ulong.MaxValue >> (64 - _vertexBuffers.Length); + + ClearScissor = new Rectangle(0, 0, 0xffff, 0xffff); + + _storedBlend = new PipelineColorBlendAttachmentState[Constants.MaxRenderTargets]; + + _newState.Initialize(); + } + + public void Initialize() + { + _descriptorSetUpdater.Initialize(IsMainPipeline); + + QuadsToTrisPattern = new IndexBufferPattern(Gd, 4, 6, 0, new[] { 0, 1, 2, 0, 2, 3 }, 4, false); + TriFanToTrisPattern = new IndexBufferPattern(Gd, 3, 3, 2, new[] { int.MinValue, -1, 0 }, 1, true); + } + + public unsafe void Barrier() + { + Gd.Barriers.QueueMemoryBarrier(); + } + + public void ComputeBarrier() + { + MemoryBarrier memoryBarrier = new() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit, + DstAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit, + }; + + Gd.Api.CmdPipelineBarrier( + CommandBuffer, + PipelineStageFlags.ComputeShaderBit, + PipelineStageFlags.AllCommandsBit, + 0, + 1, + new ReadOnlySpan(in memoryBarrier), + 0, + ReadOnlySpan.Empty, + 0, + ReadOnlySpan.Empty); + } + + public void BeginTransformFeedback(PrimitiveTopology topology) + { + Gd.Barriers.EnableTfbBarriers(true); + _tfEnabled = true; + } + + public void ClearBuffer(BufferHandle destination, int offset, int size, uint value) + { + EndRenderPass(); + + var dst = Gd.BufferManager.GetBuffer(CommandBuffer, destination, offset, size, true).Get(Cbs, offset, size, true).Value; + + BufferHolder.InsertBufferBarrier( + Gd, + Cbs.CommandBuffer, + dst, + BufferHolder.DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + offset, + size); + + Gd.Api.CmdFillBuffer(CommandBuffer, dst, (ulong)offset, (ulong)size, value); + + BufferHolder.InsertBufferBarrier( + Gd, + Cbs.CommandBuffer, + dst, + AccessFlags.TransferWriteBit, + BufferHolder.DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + offset, + size); + } + + public unsafe void ClearRenderTargetColor(int index, int layer, int layerCount, ColorF color) + { + if (FramebufferParams == null || !FramebufferParams.IsValidColorAttachment(index)) + { + return; + } + + if (_renderPass == null) + { + CreateRenderPass(); + } + + Gd.Barriers.Flush(Cbs, RenderPassActive, _rpHolder, EndRenderPassDelegate); + + BeginRenderPass(); + + var clearValue = new ClearValue(new ClearColorValue(color.Red, color.Green, color.Blue, color.Alpha)); + var attachment = new ClearAttachment(ImageAspectFlags.ColorBit, (uint)index, clearValue); + var clearRect = FramebufferParams.GetClearRect(ClearScissor, layer, layerCount); + + Gd.Api.CmdClearAttachments(CommandBuffer, 1, &attachment, 1, &clearRect); + } + + public unsafe void ClearRenderTargetDepthStencil(int layer, int layerCount, float depthValue, bool depthMask, int stencilValue, bool stencilMask) + { + if (FramebufferParams == null || !FramebufferParams.HasDepthStencil) + { + return; + } + + var clearValue = new ClearValue(null, new ClearDepthStencilValue(depthValue, (uint)stencilValue)); + var flags = depthMask ? ImageAspectFlags.DepthBit : 0; + + if (stencilMask) + { + flags |= ImageAspectFlags.StencilBit; + } + + flags &= FramebufferParams.GetDepthStencilAspectFlags(); + + if (flags == ImageAspectFlags.None) + { + return; + } + + if (_renderPass == null) + { + CreateRenderPass(); + } + + Gd.Barriers.Flush(Cbs, RenderPassActive, _rpHolder, EndRenderPassDelegate); + + BeginRenderPass(); + + var attachment = new ClearAttachment(flags, 0, clearValue); + var clearRect = FramebufferParams.GetClearRect(ClearScissor, layer, layerCount); + + Gd.Api.CmdClearAttachments(CommandBuffer, 1, &attachment, 1, &clearRect); + } + + public unsafe void CommandBufferBarrier() + { + Gd.Barriers.QueueCommandBufferBarrier(); + } + + public void CopyBuffer(BufferHandle source, BufferHandle destination, int srcOffset, int dstOffset, int size) + { + EndRenderPass(); + + var src = Gd.BufferManager.GetBuffer(CommandBuffer, source, srcOffset, size, false); + var dst = Gd.BufferManager.GetBuffer(CommandBuffer, destination, dstOffset, size, true); + + BufferHolder.Copy(Gd, Cbs, src, dst, srcOffset, dstOffset, size); + } + + public void DirtyVertexBuffer(Auto buffer) + { + for (int i = 0; i < _vertexBuffers.Length; i++) + { + if (_vertexBuffers[i].BoundEquals(buffer)) + { + _vertexBuffersDirty |= 1UL << i; + } + } + } + + public void DirtyIndexBuffer(Auto buffer) + { + if (_indexBuffer.BoundEquals(buffer)) + { + _needsIndexBufferRebind = true; + } + } + + public void DispatchCompute(int groupsX, int groupsY, int groupsZ) + { + if (!_program.IsLinked) + { + return; + } + + EndRenderPass(); + RecreateComputePipelineIfNeeded(); + + Gd.Api.CmdDispatch(CommandBuffer, (uint)groupsX, (uint)groupsY, (uint)groupsZ); + } + + public void DispatchComputeIndirect(Auto indirectBuffer, int indirectBufferOffset) + { + if (!_program.IsLinked) + { + return; + } + + EndRenderPass(); + RecreateComputePipelineIfNeeded(); + + Gd.Api.CmdDispatchIndirect(CommandBuffer, indirectBuffer.Get(Cbs, indirectBufferOffset, 12).Value, (ulong)indirectBufferOffset); + } + + public void Draw(int vertexCount, int instanceCount, int firstVertex, int firstInstance) + { + if (vertexCount == 0) + { + return; + } + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + DrawCount++; + + if (Gd.TopologyUnsupported(_topology)) + { + // Temporarily bind a conversion pattern as an index buffer. + _needsIndexBufferRebind = true; + + IndexBufferPattern pattern = _topology switch + { + PrimitiveTopology.Quads => QuadsToTrisPattern, + PrimitiveTopology.TriangleFan or + PrimitiveTopology.Polygon => TriFanToTrisPattern, + _ => throw new NotSupportedException($"Unsupported topology: {_topology}"), + }; + + BufferHandle handle = pattern.GetRepeatingBuffer(vertexCount, out int indexCount); + var buffer = Gd.BufferManager.GetBuffer(CommandBuffer, handle, false); + + Gd.Api.CmdBindIndexBuffer(CommandBuffer, buffer.Get(Cbs, 0, indexCount * sizeof(int)).Value, 0, Silk.NET.Vulkan.IndexType.Uint32); + + BeginRenderPass(); // May have been interrupted to set buffer data. + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDrawIndexed(CommandBuffer, (uint)indexCount, (uint)instanceCount, 0, firstVertex, (uint)firstInstance); + } + else + { + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDraw(CommandBuffer, (uint)vertexCount, (uint)instanceCount, (uint)firstVertex, (uint)firstInstance); + } + } + + private void UpdateIndexBufferPattern() + { + IndexBufferPattern pattern = null; + + if (Gd.TopologyUnsupported(_topology)) + { + pattern = _topology switch + { + PrimitiveTopology.Quads => QuadsToTrisPattern, + PrimitiveTopology.TriangleFan or + PrimitiveTopology.Polygon => TriFanToTrisPattern, + _ => throw new NotSupportedException($"Unsupported topology: {_topology}"), + }; + } + + if (_indexBufferPattern != pattern) + { + _indexBufferPattern = pattern; + _needsIndexBufferRebind = true; + } + } + + public void DrawIndexed(int indexCount, int instanceCount, int firstIndex, int firstVertex, int firstInstance) + { + if (indexCount == 0) + { + return; + } + + UpdateIndexBufferPattern(); + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + DrawCount++; + + if (_indexBufferPattern != null) + { + // Convert the index buffer into a supported topology. + IndexBufferPattern pattern = _indexBufferPattern; + + int convertedCount = pattern.GetConvertedCount(indexCount); + + if (_needsIndexBufferRebind) + { + _indexBuffer.BindConvertedIndexBuffer(Gd, Cbs, firstIndex, indexCount, convertedCount, pattern); + + _needsIndexBufferRebind = false; + } + + BeginRenderPass(); // May have been interrupted to set buffer data. + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDrawIndexed(CommandBuffer, (uint)convertedCount, (uint)instanceCount, 0, firstVertex, (uint)firstInstance); + } + else + { + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDrawIndexed(CommandBuffer, (uint)indexCount, (uint)instanceCount, (uint)firstIndex, firstVertex, (uint)firstInstance); + } + } + + public void DrawIndexedIndirect(BufferRange indirectBuffer) + { + var buffer = Gd.BufferManager + .GetBuffer(CommandBuffer, indirectBuffer.Handle, indirectBuffer.Offset, indirectBuffer.Size, false) + .Get(Cbs, indirectBuffer.Offset, indirectBuffer.Size).Value; + + UpdateIndexBufferPattern(); + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + DrawCount++; + + if (_indexBufferPattern != null) + { + // Convert the index buffer into a supported topology. + IndexBufferPattern pattern = _indexBufferPattern; + + Auto indirectBufferAuto = _indexBuffer.BindConvertedIndexBufferIndirect( + Gd, + Cbs, + indirectBuffer, + BufferRange.Empty, + pattern, + false, + 1, + indirectBuffer.Size); + + _needsIndexBufferRebind = false; + + BeginRenderPass(); // May have been interrupted to set buffer data. + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDrawIndexedIndirect(CommandBuffer, indirectBufferAuto.Get(Cbs, 0, indirectBuffer.Size).Value, 0, 1, (uint)indirectBuffer.Size); + } + else + { + ResumeTransformFeedbackInternal(); + + Gd.Api.CmdDrawIndexedIndirect(CommandBuffer, buffer, (ulong)indirectBuffer.Offset, 1, (uint)indirectBuffer.Size); + } + } + + public void DrawIndexedIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride) + { + var countBuffer = Gd.BufferManager + .GetBuffer(CommandBuffer, parameterBuffer.Handle, parameterBuffer.Offset, parameterBuffer.Size, false) + .Get(Cbs, parameterBuffer.Offset, parameterBuffer.Size).Value; + + var buffer = Gd.BufferManager + .GetBuffer(CommandBuffer, indirectBuffer.Handle, indirectBuffer.Offset, indirectBuffer.Size, false) + .Get(Cbs, indirectBuffer.Offset, indirectBuffer.Size).Value; + + UpdateIndexBufferPattern(); + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + DrawCount++; + + if (_indexBufferPattern != null) + { + // Convert the index buffer into a supported topology. + IndexBufferPattern pattern = _indexBufferPattern; + + Auto indirectBufferAuto = _indexBuffer.BindConvertedIndexBufferIndirect( + Gd, + Cbs, + indirectBuffer, + parameterBuffer, + pattern, + true, + maxDrawCount, + stride); + + _needsIndexBufferRebind = false; + + BeginRenderPass(); // May have been interrupted to set buffer data. + ResumeTransformFeedbackInternal(); + + if (Gd.Capabilities.SupportsIndirectParameters) + { + Gd.DrawIndirectCountApi.CmdDrawIndexedIndirectCount( + CommandBuffer, + indirectBufferAuto.Get(Cbs, 0, indirectBuffer.Size).Value, + 0, + countBuffer, + (ulong)parameterBuffer.Offset, + (uint)maxDrawCount, + (uint)stride); + } + else + { + // This is also fine because the indirect data conversion always zeros + // the entries that are past the current draw count. + + Gd.Api.CmdDrawIndexedIndirect( + CommandBuffer, + indirectBufferAuto.Get(Cbs, 0, indirectBuffer.Size).Value, + 0, + (uint)maxDrawCount, + (uint)stride); + } + } + else + { + ResumeTransformFeedbackInternal(); + + if (Gd.Capabilities.SupportsIndirectParameters) + { + Gd.DrawIndirectCountApi.CmdDrawIndexedIndirectCount( + CommandBuffer, + buffer, + (ulong)indirectBuffer.Offset, + countBuffer, + (ulong)parameterBuffer.Offset, + (uint)maxDrawCount, + (uint)stride); + } + else + { + // Not fully correct, but we can't do much better if the host does not support indirect count. + Gd.Api.CmdDrawIndexedIndirect( + CommandBuffer, + buffer, + (ulong)indirectBuffer.Offset, + (uint)maxDrawCount, + (uint)stride); + } + } + } + + public void DrawIndirect(BufferRange indirectBuffer) + { + // TODO: Support quads and other unsupported topologies. + + var buffer = Gd.BufferManager + .GetBuffer(CommandBuffer, indirectBuffer.Handle, indirectBuffer.Offset, indirectBuffer.Size, false) + .Get(Cbs, indirectBuffer.Offset, indirectBuffer.Size, false).Value; + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + ResumeTransformFeedbackInternal(); + DrawCount++; + + Gd.Api.CmdDrawIndirect(CommandBuffer, buffer, (ulong)indirectBuffer.Offset, 1, (uint)indirectBuffer.Size); + } + + public void DrawIndirectCount(BufferRange indirectBuffer, BufferRange parameterBuffer, int maxDrawCount, int stride) + { + if (!Gd.Capabilities.SupportsIndirectParameters) + { + // TODO: Fallback for when this is not supported. + throw new NotSupportedException(); + } + + var buffer = Gd.BufferManager + .GetBuffer(CommandBuffer, indirectBuffer.Handle, indirectBuffer.Offset, indirectBuffer.Size, false) + .Get(Cbs, indirectBuffer.Offset, indirectBuffer.Size, false).Value; + + var countBuffer = Gd.BufferManager + .GetBuffer(CommandBuffer, parameterBuffer.Handle, parameterBuffer.Offset, parameterBuffer.Size, false) + .Get(Cbs, parameterBuffer.Offset, parameterBuffer.Size, false).Value; + + // TODO: Support quads and other unsupported topologies. + + if (!RecreateGraphicsPipelineIfNeeded()) + { + return; + } + + BeginRenderPass(); + ResumeTransformFeedbackInternal(); + DrawCount++; + + Gd.DrawIndirectCountApi.CmdDrawIndirectCount( + CommandBuffer, + buffer, + (ulong)indirectBuffer.Offset, + countBuffer, + (ulong)parameterBuffer.Offset, + (uint)maxDrawCount, + (uint)stride); + } + + public void DrawTexture(ITexture texture, ISampler sampler, Extents2DF srcRegion, Extents2DF dstRegion) + { + if (texture is TextureView srcTexture) + { + var oldCullMode = _newState.CullMode; + var oldStencilTestEnable = _newState.StencilTestEnable; + var oldDepthTestEnable = _newState.DepthTestEnable; + var oldDepthWriteEnable = _newState.DepthWriteEnable; + var oldViewports = DynamicState.Viewports; + var oldViewportsCount = _newState.ViewportsCount; + var oldTopology = _topology; + + _newState.CullMode = CullModeFlags.None; + _newState.StencilTestEnable = false; + _newState.DepthTestEnable = false; + _newState.DepthWriteEnable = false; + SignalStateChange(); + + Gd.HelperShader.DrawTexture( + Gd, + this, + srcTexture, + sampler, + srcRegion, + dstRegion); + + _newState.CullMode = oldCullMode; + _newState.StencilTestEnable = oldStencilTestEnable; + _newState.DepthTestEnable = oldDepthTestEnable; + _newState.DepthWriteEnable = oldDepthWriteEnable; + SetPrimitiveTopology(oldTopology); + + DynamicState.SetViewports(ref oldViewports, oldViewportsCount); + + _newState.ViewportsCount = oldViewportsCount; + SignalStateChange(); + } + } + + public void EndTransformFeedback() + { + Gd.Barriers.EnableTfbBarriers(false); + PauseTransformFeedbackInternal(); + _tfEnabled = false; + } + + public bool IsCommandBufferActive(CommandBuffer cb) + { + return CommandBuffer.Handle == cb.Handle; + } + + internal void Rebind(Auto buffer, int offset, int size) + { + _descriptorSetUpdater.Rebind(buffer, offset, size); + + if (_indexBuffer.Overlaps(buffer, offset, size)) + { + _indexBuffer.BindIndexBuffer(Gd, Cbs); + } + + for (int i = 0; i < _vertexBuffers.Length; i++) + { + if (_vertexBuffers[i].Overlaps(buffer, offset, size)) + { + _vertexBuffers[i].BindVertexBuffer(Gd, Cbs, (uint)i, ref _newState, _vertexBufferUpdater); + } + } + + _vertexBufferUpdater.Commit(Cbs); + } + + public void SetAlphaTest(bool enable, float reference, CompareOp op) + { + // This is currently handled using shader specialization, as Vulkan does not support alpha test. + // In the future, we may want to use this to write the reference value into the support buffer, + // to avoid creating one version of the shader per reference value used. + } + + public void SetBlendState(AdvancedBlendDescriptor blend) + { + for (int index = 0; index < Constants.MaxRenderTargets; index++) + { + ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[index]; + + if (index == 0) + { + var blendOp = blend.Op.Convert(); + + vkBlend = new PipelineColorBlendAttachmentState( + blendEnable: true, + colorBlendOp: blendOp, + alphaBlendOp: blendOp, + colorWriteMask: vkBlend.ColorWriteMask); + + if (Gd.Capabilities.SupportsBlendEquationAdvancedNonPreMultipliedSrcColor) + { + _newState.AdvancedBlendSrcPreMultiplied = blend.SrcPreMultiplied; + } + + if (Gd.Capabilities.SupportsBlendEquationAdvancedCorrelatedOverlap) + { + _newState.AdvancedBlendOverlap = blend.Overlap.Convert(); + } + } + else + { + vkBlend = new PipelineColorBlendAttachmentState( + colorWriteMask: vkBlend.ColorWriteMask); + } + + if (vkBlend.ColorWriteMask == 0) + { + _storedBlend[index] = vkBlend; + + vkBlend = new PipelineColorBlendAttachmentState(); + } + } + + SignalStateChange(); + } + + public void SetBlendState(int index, BlendDescriptor blend) + { + ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[index]; + + if (blend.Enable) + { + vkBlend.BlendEnable = blend.Enable; + vkBlend.SrcColorBlendFactor = blend.ColorSrcFactor.Convert(); + vkBlend.DstColorBlendFactor = blend.ColorDstFactor.Convert(); + vkBlend.ColorBlendOp = blend.ColorOp.Convert(); + vkBlend.SrcAlphaBlendFactor = blend.AlphaSrcFactor.Convert(); + vkBlend.DstAlphaBlendFactor = blend.AlphaDstFactor.Convert(); + vkBlend.AlphaBlendOp = blend.AlphaOp.Convert(); + } + else + { + vkBlend = new PipelineColorBlendAttachmentState( + colorWriteMask: vkBlend.ColorWriteMask); + } + + if (vkBlend.ColorWriteMask == 0) + { + _storedBlend[index] = vkBlend; + + vkBlend = new PipelineColorBlendAttachmentState(); + } + + DynamicState.SetBlendConstants( + blend.BlendConstant.Red, + blend.BlendConstant.Green, + blend.BlendConstant.Blue, + blend.BlendConstant.Alpha); + + // Reset advanced blend state back defaults to the cache to help the pipeline cache. + _newState.AdvancedBlendSrcPreMultiplied = true; + _newState.AdvancedBlendDstPreMultiplied = true; + _newState.AdvancedBlendOverlap = BlendOverlapEXT.UncorrelatedExt; + + SignalStateChange(); + } + + public void SetDepthBias(PolygonModeMask enables, float factor, float units, float clamp) + { + DynamicState.SetDepthBias(factor, units, clamp); + + _newState.DepthBiasEnable = enables != 0; + SignalStateChange(); + } + + public void SetDepthClamp(bool clamp) + { + _newState.DepthClampEnable = clamp; + SignalStateChange(); + } + + public void SetDepthMode(DepthMode mode) + { + bool oldMode = _newState.DepthMode; + _newState.DepthMode = mode == DepthMode.MinusOneToOne; + if (_newState.DepthMode != oldMode) + { + SignalStateChange(); + } + } + + public void SetDepthTest(DepthTestDescriptor depthTest) + { + _newState.DepthTestEnable = depthTest.TestEnable; + _newState.DepthWriteEnable = depthTest.WriteEnable; + _newState.DepthCompareOp = depthTest.Func.Convert(); + + UpdatePassDepthStencil(); + SignalStateChange(); + } + + public void SetFaceCulling(bool enable, Face face) + { + _newState.CullMode = enable ? face.Convert() : CullModeFlags.None; + SignalStateChange(); + } + + public void SetFrontFace(FrontFace frontFace) + { + _newState.FrontFace = frontFace.Convert(); + SignalStateChange(); + } + + public void SetImage(ShaderStage stage, int binding, ITexture image) + { + _descriptorSetUpdater.SetImage(Cbs, stage, binding, image); + } + + public void SetImage(int binding, Auto image) + { + _descriptorSetUpdater.SetImage(binding, image); + } + + public void SetImageArray(ShaderStage stage, int binding, IImageArray array) + { + _descriptorSetUpdater.SetImageArray(Cbs, stage, binding, array); + } + + public void SetImageArraySeparate(ShaderStage stage, int setIndex, IImageArray array) + { + _descriptorSetUpdater.SetImageArraySeparate(Cbs, stage, setIndex, array); + } + + public void SetIndexBuffer(BufferRange buffer, IndexType type) + { + if (buffer.Handle != BufferHandle.Null) + { + _indexBuffer = new IndexBufferState(buffer.Handle, buffer.Offset, buffer.Size, type.Convert()); + } + else + { + _indexBuffer = IndexBufferState.Null; + } + + _needsIndexBufferRebind = true; + } + + public void SetLineParameters(float width, bool smooth) + { + _newState.LineWidth = width; + SignalStateChange(); + } + + public void SetLogicOpState(bool enable, LogicalOp op) + { + _newState.LogicOpEnable = enable; + _newState.LogicOp = op.Convert(); + SignalStateChange(); + } + + public void SetMultisampleState(MultisampleDescriptor multisample) + { + _newState.AlphaToCoverageEnable = multisample.AlphaToCoverageEnable; + _newState.AlphaToOneEnable = multisample.AlphaToOneEnable; + SignalStateChange(); + } + + public void SetPatchParameters(int vertices, ReadOnlySpan defaultOuterLevel, ReadOnlySpan defaultInnerLevel) + { + _newState.PatchControlPoints = (uint)vertices; + SignalStateChange(); + + // TODO: Default levels (likely needs emulation on shaders?) + } + + public void SetPointParameters(float size, bool isProgramPointSize, bool enablePointSprite, Origin origin) + { + // TODO. + } + + public void SetPolygonMode(PolygonMode frontMode, PolygonMode backMode) + { + // TODO. + } + + public void SetPrimitiveRestart(bool enable, int index) + { + _newState.PrimitiveRestartEnable = enable; + // TODO: What to do about the index? + SignalStateChange(); + } + + public void SetPrimitiveTopology(PrimitiveTopology topology) + { + _topology = topology; + + var vkTopology = Gd.TopologyRemap(topology).Convert(); + + _newState.Topology = vkTopology; + + SignalStateChange(); + } + + public void SetProgram(IProgram program) + { + var internalProgram = (ShaderCollection)program; + var stages = internalProgram.GetInfos(); + + _program = internalProgram; + + _descriptorSetUpdater.SetProgram(Cbs, internalProgram, _currentPipelineHandle != 0); + _bindingBarriersDirty = true; + + _newState.PipelineLayout = internalProgram.PipelineLayout; + _newState.HasTessellationControlShader = internalProgram.HasTessellationControlShader; + _newState.StagesCount = (uint)stages.Length; + + stages.CopyTo(_newState.Stages.AsSpan()[..stages.Length]); + + SignalStateChange(); + + if (internalProgram.IsCompute) + { + EndRenderPass(); + } + } + + public void Specialize(in T data) where T : unmanaged + { + var dataSpan = MemoryMarshal.AsBytes(MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(in data), 1)); + + if (!dataSpan.SequenceEqual(_newState.SpecializationData.Span)) + { + _newState.SpecializationData = new SpecData(dataSpan); + + SignalStateChange(); + } + } + + protected virtual void SignalAttachmentChange() + { + } + + public void SetRasterizerDiscard(bool discard) + { + _newState.RasterizerDiscardEnable = discard; + SignalStateChange(); + + if (!discard && Gd.IsQualcommProprietary) + { + // On Adreno, enabling rasterizer discard somehow corrupts the viewport state. + // Force it to be updated on next use to work around this bug. + DynamicState.ForceAllDirty(); + } + } + + public void SetRenderTargetColorMasks(ReadOnlySpan componentMask) + { + int count = Math.Min(Constants.MaxRenderTargets, componentMask.Length); + int writtenAttachments = 0; + + for (int i = 0; i < count; i++) + { + ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[i]; + var newMask = (ColorComponentFlags)componentMask[i]; + + // When color write mask is 0, remove all blend state to help the pipeline cache. + // Restore it when the mask becomes non-zero. + if (vkBlend.ColorWriteMask != newMask) + { + if (newMask == 0) + { + _storedBlend[i] = vkBlend; + + vkBlend = new PipelineColorBlendAttachmentState(); + } + else if (vkBlend.ColorWriteMask == 0) + { + vkBlend = _storedBlend[i]; + } + } + + vkBlend.ColorWriteMask = newMask; + + if (componentMask[i] != 0) + { + writtenAttachments++; + } + } + + if (_framebufferUsingColorWriteMask) + { + SetRenderTargetsInternal(_preMaskColors, _preMaskDepthStencil, true); + } + else + { + SignalStateChange(); + + if (writtenAttachments != _writtenAttachmentCount) + { + SignalAttachmentChange(); + _writtenAttachmentCount = writtenAttachments; + } + } + } + + private void SetRenderTargetsInternal(ITexture[] colors, ITexture depthStencil, bool filterWriteMasked) + { + CreateFramebuffer(colors, depthStencil, filterWriteMasked); + CreateRenderPass(); + SignalStateChange(); + SignalAttachmentChange(); + } + + public void SetRenderTargets(ITexture[] colors, ITexture depthStencil) + { + _framebufferUsingColorWriteMask = false; + SetRenderTargetsInternal(colors, depthStencil, Gd.IsTBDR); + } + + public void SetScissors(ReadOnlySpan> regions) + { + int maxScissors = Gd.Capabilities.SupportsMultiView ? Constants.MaxViewports : 1; + int count = Math.Min(maxScissors, regions.Length); + if (count > 0) + { + ClearScissor = regions[0]; + } + + for (int i = 0; i < count; i++) + { + var region = regions[i]; + var offset = new Offset2D(region.X, region.Y); + var extent = new Extent2D((uint)region.Width, (uint)region.Height); + + DynamicState.SetScissor(i, new Rect2D(offset, extent)); + } + + DynamicState.ScissorsCount = count; + + _newState.ScissorsCount = (uint)count; + SignalStateChange(); + } + + public void SetStencilTest(StencilTestDescriptor stencilTest) + { + DynamicState.SetStencilMasks( + (uint)stencilTest.BackFuncMask, + (uint)stencilTest.BackMask, + (uint)stencilTest.BackFuncRef, + (uint)stencilTest.FrontFuncMask, + (uint)stencilTest.FrontMask, + (uint)stencilTest.FrontFuncRef); + + _newState.StencilTestEnable = stencilTest.TestEnable; + _newState.StencilBackFailOp = stencilTest.BackSFail.Convert(); + _newState.StencilBackPassOp = stencilTest.BackDpPass.Convert(); + _newState.StencilBackDepthFailOp = stencilTest.BackDpFail.Convert(); + _newState.StencilBackCompareOp = stencilTest.BackFunc.Convert(); + _newState.StencilFrontFailOp = stencilTest.FrontSFail.Convert(); + _newState.StencilFrontPassOp = stencilTest.FrontDpPass.Convert(); + _newState.StencilFrontDepthFailOp = stencilTest.FrontDpFail.Convert(); + _newState.StencilFrontCompareOp = stencilTest.FrontFunc.Convert(); + + UpdatePassDepthStencil(); + SignalStateChange(); + } + + public void SetStorageBuffers(ReadOnlySpan buffers) + { + _descriptorSetUpdater.SetStorageBuffers(CommandBuffer, buffers); + } + + public void SetStorageBuffers(int first, ReadOnlySpan> buffers) + { + _descriptorSetUpdater.SetStorageBuffers(CommandBuffer, first, buffers); + } + + public void SetTextureAndSampler(ShaderStage stage, int binding, ITexture texture, ISampler sampler) + { + _descriptorSetUpdater.SetTextureAndSampler(Cbs, stage, binding, texture, sampler); + } + + public void SetTextureAndSamplerIdentitySwizzle(ShaderStage stage, int binding, ITexture texture, ISampler sampler) + { + _descriptorSetUpdater.SetTextureAndSamplerIdentitySwizzle(Cbs, stage, binding, texture, sampler); + } + + public void SetTextureArray(ShaderStage stage, int binding, ITextureArray array) + { + _descriptorSetUpdater.SetTextureArray(Cbs, stage, binding, array); + } + + public void SetTextureArraySeparate(ShaderStage stage, int setIndex, ITextureArray array) + { + _descriptorSetUpdater.SetTextureArraySeparate(Cbs, stage, setIndex, array); + } + + public void SetTransformFeedbackBuffers(ReadOnlySpan buffers) + { + PauseTransformFeedbackInternal(); + + int count = Math.Min(Constants.MaxTransformFeedbackBuffers, buffers.Length); + + for (int i = 0; i < count; i++) + { + var range = buffers[i]; + + _transformFeedbackBuffers[i].Dispose(); + + if (range.Handle != BufferHandle.Null) + { + _transformFeedbackBuffers[i] = + new BufferState(Gd.BufferManager.GetBuffer(CommandBuffer, range.Handle, range.Offset, range.Size, true), range.Offset, range.Size); + _transformFeedbackBuffers[i].BindTransformFeedbackBuffer(Gd, Cbs, (uint)i); + } + else + { + _transformFeedbackBuffers[i] = BufferState.Null; + } + } + } + + public void SetUniformBuffers(ReadOnlySpan buffers) + { + _descriptorSetUpdater.SetUniformBuffers(CommandBuffer, buffers); + } + + public void SetUserClipDistance(int index, bool enableClip) + { + // TODO. + } + + public void SetVertexAttribs(ReadOnlySpan vertexAttribs) + { + var formatCapabilities = Gd.FormatCapabilities; + + Span newVbScalarSizes = stackalloc int[Constants.MaxVertexBuffers]; + + int count = Math.Min(Constants.MaxVertexAttributes, vertexAttribs.Length); + uint dirtyVbSizes = 0; + + for (int i = 0; i < count; i++) + { + var attribute = vertexAttribs[i]; + var rawIndex = attribute.BufferIndex; + var bufferIndex = attribute.IsZero ? 0 : rawIndex + 1; + + if (!attribute.IsZero) + { + newVbScalarSizes[rawIndex] = Math.Max(newVbScalarSizes[rawIndex], attribute.Format.GetScalarSize()); + dirtyVbSizes |= 1u << rawIndex; + } + + _newState.Internal.VertexAttributeDescriptions[i] = new VertexInputAttributeDescription( + (uint)i, + (uint)bufferIndex, + formatCapabilities.ConvertToVertexVkFormat(attribute.Format), + (uint)attribute.Offset); + } + + while (dirtyVbSizes != 0) + { + int dirtyBit = BitOperations.TrailingZeroCount(dirtyVbSizes); + + ref var buffer = ref _vertexBuffers[dirtyBit + 1]; + + if (buffer.AttributeScalarAlignment != newVbScalarSizes[dirtyBit]) + { + _vertexBuffersDirty |= 1UL << (dirtyBit + 1); + buffer.AttributeScalarAlignment = newVbScalarSizes[dirtyBit]; + } + + dirtyVbSizes &= ~(1u << dirtyBit); + } + + _newState.VertexAttributeDescriptionsCount = (uint)count; + SignalStateChange(); + } + + public void SetVertexBuffers(ReadOnlySpan vertexBuffers) + { + int count = Math.Min(Constants.MaxVertexBuffers, vertexBuffers.Length); + + _newState.Internal.VertexBindingDescriptions[0] = new VertexInputBindingDescription(0, 0, VertexInputRate.Vertex); + + int validCount = 1; + + BufferHandle lastHandle = default; + Auto lastBuffer = default; + + for (int i = 0; i < count; i++) + { + var vertexBuffer = vertexBuffers[i]; + + // TODO: Support divisor > 1 + var inputRate = vertexBuffer.Divisor != 0 ? VertexInputRate.Instance : VertexInputRate.Vertex; + + if (vertexBuffer.Buffer.Handle != BufferHandle.Null) + { + Auto vb = (vertexBuffer.Buffer.Handle == lastHandle) ? lastBuffer : + Gd.BufferManager.GetBuffer(CommandBuffer, vertexBuffer.Buffer.Handle, false); + + lastHandle = vertexBuffer.Buffer.Handle; + lastBuffer = vb; + + if (vb != null) + { + int binding = i + 1; + int descriptorIndex = validCount++; + + _newState.Internal.VertexBindingDescriptions[descriptorIndex] = new VertexInputBindingDescription( + (uint)binding, + (uint)vertexBuffer.Stride, + inputRate); + + int vbSize = vertexBuffer.Buffer.Size; + + if (Gd.Vendor == Vendor.Amd && !Gd.IsMoltenVk && vertexBuffer.Stride > 0) + { + // AMD has a bug where if offset + stride * count is greater than + // the size, then the last attribute will have the wrong value. + // As a workaround, simply use the full buffer size. + int remainder = vbSize % vertexBuffer.Stride; + if (remainder != 0) + { + vbSize += vertexBuffer.Stride - remainder; + } + } + + ref var buffer = ref _vertexBuffers[binding]; + int oldScalarAlign = buffer.AttributeScalarAlignment; + + if (Gd.Capabilities.VertexBufferAlignment < 2 && + (vertexBuffer.Stride % FormatExtensions.MaxBufferFormatScalarSize) == 0) + { + if (!buffer.Matches(vb, descriptorIndex, vertexBuffer.Buffer.Offset, vbSize, vertexBuffer.Stride)) + { + buffer.Dispose(); + + buffer = new VertexBufferState( + vb, + descriptorIndex, + vertexBuffer.Buffer.Offset, + vbSize, + vertexBuffer.Stride); + + buffer.BindVertexBuffer(Gd, Cbs, (uint)binding, ref _newState, _vertexBufferUpdater); + } + } + else + { + // May need to be rewritten. Bind this buffer before draw. + + buffer.Dispose(); + + buffer = new VertexBufferState( + vertexBuffer.Buffer.Handle, + descriptorIndex, + vertexBuffer.Buffer.Offset, + vbSize, + vertexBuffer.Stride); + + _vertexBuffersDirty |= 1UL << binding; + } + + buffer.AttributeScalarAlignment = oldScalarAlign; + } + } + } + + _vertexBufferUpdater.Commit(Cbs); + + _newState.VertexBindingDescriptionsCount = (uint)validCount; + SignalStateChange(); + } + + public void SetViewports(ReadOnlySpan viewports) + { + int maxViewports = Gd.Capabilities.SupportsMultiView ? Constants.MaxViewports : 1; + int count = Math.Min(maxViewports, viewports.Length); + + static float Clamp(float value) + { + return Math.Clamp(value, 0f, 1f); + } + + DynamicState.ViewportsCount = (uint)count; + + for (int i = 0; i < count; i++) + { + var viewport = viewports[i]; + + DynamicState.SetViewport(i, new Silk.NET.Vulkan.Viewport( + viewport.Region.X, + viewport.Region.Y, + viewport.Region.Width == 0f ? 1f : viewport.Region.Width, + viewport.Region.Height == 0f ? 1f : viewport.Region.Height, + Clamp(viewport.DepthNear), + Clamp(viewport.DepthFar))); + } + + _newState.ViewportsCount = (uint)count; + SignalStateChange(); + } + + public void SwapBuffer(Auto from, Auto to) + { + _indexBuffer.Swap(from, to); + + for (int i = 0; i < _vertexBuffers.Length; i++) + { + _vertexBuffers[i].Swap(from, to); + } + + for (int i = 0; i < _transformFeedbackBuffers.Length; i++) + { + _transformFeedbackBuffers[i].Swap(from, to); + } + + _descriptorSetUpdater.SwapBuffer(from, to); + + SignalCommandBufferChange(); + } + + public void ForceTextureDirty() + { + _descriptorSetUpdater.ForceTextureDirty(); + } + + public void ForceImageDirty() + { + _descriptorSetUpdater.ForceImageDirty(); + } + + public unsafe void TextureBarrier() + { + Gd.Barriers.QueueTextureBarrier(); + } + + public void TextureBarrierTiled() + { + TextureBarrier(); + } + + protected void SignalCommandBufferChange() + { + _needsIndexBufferRebind = true; + _needsTransformFeedbackBuffersRebind = true; + _vertexBuffersDirty = ulong.MaxValue >> (64 - _vertexBuffers.Length); + + _descriptorSetUpdater.SignalCommandBufferChange(); + DynamicState.ForceAllDirty(); + _currentPipelineHandle = 0; + } + + private void CreateFramebuffer(ITexture[] colors, ITexture depthStencil, bool filterWriteMasked) + { + if (filterWriteMasked) + { + // TBDR GPUs don't work properly if the same attachment is bound to multiple targets, + // due to each attachment being a copy of the real attachment, rather than a direct write. + + // Just try to remove duplicate attachments. + // Save a copy of the array to rebind when mask changes. + + void MaskOut() + { + if (!_framebufferUsingColorWriteMask) + { + _preMaskColors = colors.ToArray(); + _preMaskDepthStencil = depthStencil; + } + + // If true, then the framebuffer must be recreated when the mask changes. + _framebufferUsingColorWriteMask = true; + } + + // Look for textures that are masked out. + + for (int i = 0; i < colors.Length; i++) + { + if (colors[i] == null) + { + continue; + } + + ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[i]; + + for (int j = 0; j < i; j++) + { + // Check each binding for a duplicate binding before it. + + if (colors[i] == colors[j]) + { + // Prefer the binding with no write mask. + ref var vkBlend2 = ref _newState.Internal.ColorBlendAttachmentState[j]; + if (vkBlend.ColorWriteMask == 0) + { + colors[i] = null; + MaskOut(); + } + else if (vkBlend2.ColorWriteMask == 0) + { + colors[j] = null; + MaskOut(); + } + } + } + } + } + + if (IsMainPipeline) + { + FramebufferParams?.ClearBindings(); + } + + FramebufferParams = new FramebufferParams(Device, colors, depthStencil); + + if (IsMainPipeline) + { + FramebufferParams.AddBindings(); + + _newState.FeedbackLoopAspects = FeedbackLoopAspects.None; + _bindingBarriersDirty = true; + } + + _passWritesDepthStencil = false; + UpdatePassDepthStencil(); + UpdatePipelineAttachmentFormats(); + } + + protected void UpdatePipelineAttachmentFormats() + { + var dstAttachmentFormats = _newState.Internal.AttachmentFormats.AsSpan(); + FramebufferParams.AttachmentFormats.CopyTo(dstAttachmentFormats); + _newState.Internal.AttachmentIntegerFormatMask = FramebufferParams.AttachmentIntegerFormatMask; + _newState.Internal.LogicOpsAllowed = FramebufferParams.LogicOpsAllowed; + + for (int i = FramebufferParams.AttachmentFormats.Length; i < dstAttachmentFormats.Length; i++) + { + dstAttachmentFormats[i] = 0; + } + + _newState.ColorBlendAttachmentStateCount = (uint)(FramebufferParams.MaxColorAttachmentIndex + 1); + _newState.HasDepthStencil = FramebufferParams.HasDepthStencil; + _newState.SamplesCount = FramebufferParams.AttachmentSamples.Length != 0 ? FramebufferParams.AttachmentSamples[0] : 1; + } + + protected unsafe void CreateRenderPass() + { + var hasFramebuffer = FramebufferParams != null; + + EndRenderPass(); + + if (!hasFramebuffer || FramebufferParams.AttachmentsCount == 0) + { + // Use the null framebuffer. + _nullRenderPass ??= new RenderPassHolder(Gd, Device, new RenderPassCacheKey(), FramebufferParams); + + _rpHolder = _nullRenderPass; + _renderPass = _nullRenderPass.GetRenderPass(); + _framebuffer = _nullRenderPass.GetFramebuffer(Gd, Cbs, FramebufferParams); + } + else + { + (_rpHolder, _framebuffer) = FramebufferParams.GetPassAndFramebuffer(Gd, Device, Cbs); + + _renderPass = _rpHolder.GetRenderPass(); + } + } + + protected void SignalStateChange() + { + _graphicsStateDirty = true; + _computeStateDirty = true; + } + + private void RecreateComputePipelineIfNeeded() + { + if (_computeStateDirty || Pbp != PipelineBindPoint.Compute) + { + CreatePipeline(PipelineBindPoint.Compute); + _computeStateDirty = false; + Pbp = PipelineBindPoint.Compute; + + if (_bindingBarriersDirty) + { + // Stale barriers may have been activated by switching program. Emit any that are relevant. + _descriptorSetUpdater.InsertBindingBarriers(Cbs); + + _bindingBarriersDirty = false; + } + } + + Gd.Barriers.Flush(Cbs, _program, _feedbackLoop != 0, RenderPassActive, _rpHolder, EndRenderPassDelegate); + + _descriptorSetUpdater.UpdateAndBindDescriptorSets(Cbs, PipelineBindPoint.Compute); + } + + private bool ChangeFeedbackLoop(FeedbackLoopAspects aspects) + { + if (_feedbackLoop != aspects) + { + if (Gd.Capabilities.SupportsDynamicAttachmentFeedbackLoop) + { + DynamicState.SetFeedbackLoop(aspects); + } + else + { + _newState.FeedbackLoopAspects = aspects; + } + + _feedbackLoop = aspects; + + return true; + } + + return false; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool UpdateFeedbackLoop() + { + List hazards = _descriptorSetUpdater.FeedbackLoopHazards; + + if ((hazards?.Count ?? 0) > 0) + { + FeedbackLoopAspects aspects = 0; + + foreach (TextureView view in hazards) + { + // May need to enforce feedback loop layout here in the future. + // Though technically, it should always work with the general layout. + + if (view.Info.Format.IsDepthOrStencil()) + { + if (_passWritesDepthStencil) + { + // If depth/stencil isn't written in the pass, it doesn't count as a feedback loop. + + aspects |= FeedbackLoopAspects.Depth; + } + } + else + { + aspects |= FeedbackLoopAspects.Color; + } + } + + return ChangeFeedbackLoop(aspects); + } + else if (_feedbackLoop != 0) + { + return ChangeFeedbackLoop(FeedbackLoopAspects.None); + } + + return false; + } + + private void UpdatePassDepthStencil() + { + if (!RenderPassActive) + { + _passWritesDepthStencil = false; + } + + // Stencil test being enabled doesn't necessarily mean a write, but it's not critical to check. + _passWritesDepthStencil |= (_newState.DepthTestEnable && _newState.DepthWriteEnable) || _newState.StencilTestEnable; + } + + private bool RecreateGraphicsPipelineIfNeeded() + { + if (AutoFlush.ShouldFlushDraw(DrawCount)) + { + Gd.FlushAllCommands(); + } + + DynamicState.ReplayIfDirty(Gd, CommandBuffer); + + if (_needsIndexBufferRebind && _indexBufferPattern == null) + { + _indexBuffer.BindIndexBuffer(Gd, Cbs); + _needsIndexBufferRebind = false; + } + + if (_needsTransformFeedbackBuffersRebind) + { + PauseTransformFeedbackInternal(); + + for (int i = 0; i < Constants.MaxTransformFeedbackBuffers; i++) + { + _transformFeedbackBuffers[i].BindTransformFeedbackBuffer(Gd, Cbs, (uint)i); + } + + _needsTransformFeedbackBuffersRebind = false; + } + + if (_vertexBuffersDirty != 0) + { + while (_vertexBuffersDirty != 0) + { + int i = BitOperations.TrailingZeroCount(_vertexBuffersDirty); + + _vertexBuffers[i].BindVertexBuffer(Gd, Cbs, (uint)i, ref _newState, _vertexBufferUpdater); + + _vertexBuffersDirty &= ~(1UL << i); + } + + _vertexBufferUpdater.Commit(Cbs); + } + + if (_bindingBarriersDirty) + { + // Stale barriers may have been activated by switching program. Emit any that are relevant. + _descriptorSetUpdater.InsertBindingBarriers(Cbs); + + _bindingBarriersDirty = false; + } + + if (UpdateFeedbackLoop() || _graphicsStateDirty || Pbp != PipelineBindPoint.Graphics) + { + if (!CreatePipeline(PipelineBindPoint.Graphics)) + { + return false; + } + + _graphicsStateDirty = false; + Pbp = PipelineBindPoint.Graphics; + } + + Gd.Barriers.Flush(Cbs, _program, _feedbackLoop != 0, RenderPassActive, _rpHolder, EndRenderPassDelegate); + + _descriptorSetUpdater.UpdateAndBindDescriptorSets(Cbs, PipelineBindPoint.Graphics); + + return true; + } + + private bool CreatePipeline(PipelineBindPoint pbp) + { + // We can only create a pipeline if the have the shader stages set. + if (_newState.Stages != null) + { + if (pbp == PipelineBindPoint.Graphics && _renderPass == null) + { + CreateRenderPass(); + } + + if (!_program.IsLinked) + { + // Background compile failed, we likely can't create the pipeline because the shader is broken + // or the driver failed to compile it. + + return false; + } + + var pipeline = pbp == PipelineBindPoint.Compute + ? _newState.CreateComputePipeline(Gd, Device, _program, PipelineCache) + : _newState.CreateGraphicsPipeline(Gd, Device, _program, PipelineCache, _renderPass.Get(Cbs).Value); + + if (pipeline == null) + { + // Host failed to create the pipeline, likely due to driver bugs. + + return false; + } + + ulong pipelineHandle = pipeline.GetUnsafe().Value.Handle; + + if (_currentPipelineHandle != pipelineHandle) + { + _currentPipelineHandle = pipelineHandle; + Pipeline = pipeline; + + PauseTransformFeedbackInternal(); + Gd.Api.CmdBindPipeline(CommandBuffer, pbp, Pipeline.Get(Cbs).Value); + } + } + + return true; + } + + private unsafe void BeginRenderPass() + { + if (!RenderPassActive) + { + FramebufferParams.InsertLoadOpBarriers(Gd, Cbs); + + var renderArea = new Rect2D(null, new Extent2D(FramebufferParams.Width, FramebufferParams.Height)); + var clearValue = new ClearValue(); + + var renderPassBeginInfo = new RenderPassBeginInfo + { + SType = StructureType.RenderPassBeginInfo, + RenderPass = _renderPass.Get(Cbs).Value, + Framebuffer = _framebuffer.Get(Cbs).Value, + RenderArea = renderArea, + PClearValues = &clearValue, + ClearValueCount = 1, + }; + + Gd.Api.CmdBeginRenderPass(CommandBuffer, in renderPassBeginInfo, SubpassContents.Inline); + RenderPassActive = true; + } + } + + public void EndRenderPass() + { + if (RenderPassActive) + { + FramebufferParams.AddStoreOpUsage(); + + PauseTransformFeedbackInternal(); + Gd.Api.CmdEndRenderPass(CommandBuffer); + SignalRenderPassEnd(); + RenderPassActive = false; + } + } + + protected virtual void SignalRenderPassEnd() + { + } + + private void PauseTransformFeedbackInternal() + { + if (_tfEnabled && _tfActive) + { + EndTransformFeedbackInternal(); + _tfActive = false; + } + } + + private void ResumeTransformFeedbackInternal() + { + if (_tfEnabled && !_tfActive) + { + BeginTransformFeedbackInternal(); + _tfActive = true; + } + } + + private unsafe void BeginTransformFeedbackInternal() + { + Gd.TransformFeedbackApi.CmdBeginTransformFeedback(CommandBuffer, 0, 0, null, null); + } + + private unsafe void EndTransformFeedbackInternal() + { + Gd.TransformFeedbackApi.CmdEndTransformFeedback(CommandBuffer, 0, 0, null, null); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + _nullRenderPass?.Dispose(); + _newState.Dispose(); + _descriptorSetUpdater.Dispose(); + _vertexBufferUpdater.Dispose(); + + for (int i = 0; i < _vertexBuffers.Length; i++) + { + _vertexBuffers[i].Dispose(); + } + + for (int i = 0; i < _transformFeedbackBuffers.Length; i++) + { + _transformFeedbackBuffers[i].Dispose(); + } + + Pipeline?.Dispose(); + + unsafe + { + Gd.Api.DestroyPipelineCache(Device, PipelineCache, null); + } + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineConverter.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineConverter.cs new file mode 100644 index 000000000..96c9bbdf6 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineConverter.cs @@ -0,0 +1,336 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using Format = Silk.NET.Vulkan.Format; +using PolygonMode = Silk.NET.Vulkan.PolygonMode; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class PipelineConverter + { + public static unsafe DisposableRenderPass ToRenderPass(this ProgramPipelineState state, VulkanRenderer gd, Device device) + { + const int MaxAttachments = Constants.MaxRenderTargets + 1; + + AttachmentDescription[] attachmentDescs = null; + + var subpass = new SubpassDescription + { + PipelineBindPoint = PipelineBindPoint.Graphics, + }; + + AttachmentReference* attachmentReferences = stackalloc AttachmentReference[MaxAttachments]; + + Span attachmentIndices = stackalloc int[MaxAttachments]; + Span attachmentFormats = stackalloc Format[MaxAttachments]; + + int attachmentCount = 0; + int colorCount = 0; + int maxColorAttachmentIndex = -1; + + bool isNotMsOrSupportsStorage = gd.Capabilities.SupportsShaderStorageImageMultisample || + !state.DepthStencilFormat.IsImageCompatible(); + + for (int i = 0; i < state.AttachmentEnable.Length; i++) + { + if (state.AttachmentEnable[i]) + { + bool isNotMsOrSupportsStorageAttachments = gd.Capabilities.SupportsShaderStorageImageMultisample || + !state.AttachmentFormats[i].IsImageCompatible(); + + attachmentFormats[attachmentCount] = gd.FormatCapabilities.ConvertToVkFormat(state.AttachmentFormats[i], isNotMsOrSupportsStorageAttachments); + + attachmentIndices[attachmentCount++] = i; + colorCount++; + maxColorAttachmentIndex = i; + } + } + + if (state.DepthStencilEnable) + { + attachmentFormats[attachmentCount++] = gd.FormatCapabilities.ConvertToVkFormat(state.DepthStencilFormat, isNotMsOrSupportsStorage); + } + + if (attachmentCount != 0) + { + attachmentDescs = new AttachmentDescription[attachmentCount]; + + for (int i = 0; i < attachmentCount; i++) + { + int bindIndex = attachmentIndices[i]; + + attachmentDescs[i] = new AttachmentDescription( + 0, + attachmentFormats[i], + TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)state.SamplesCount), + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + ImageLayout.General, + ImageLayout.General); + } + + int colorAttachmentsCount = colorCount; + + if (colorAttachmentsCount > MaxAttachments - 1) + { + colorAttachmentsCount = MaxAttachments - 1; + } + + if (colorAttachmentsCount != 0) + { + subpass.ColorAttachmentCount = (uint)maxColorAttachmentIndex + 1; + subpass.PColorAttachments = &attachmentReferences[0]; + + // Fill with VK_ATTACHMENT_UNUSED to cover any gaps. + for (int i = 0; i <= maxColorAttachmentIndex; i++) + { + subpass.PColorAttachments[i] = new AttachmentReference(Vk.AttachmentUnused, ImageLayout.Undefined); + } + + for (int i = 0; i < colorAttachmentsCount; i++) + { + int bindIndex = attachmentIndices[i]; + + subpass.PColorAttachments[bindIndex] = new AttachmentReference((uint)i, ImageLayout.General); + } + } + + if (state.DepthStencilEnable) + { + uint dsIndex = (uint)attachmentCount - 1; + + subpass.PDepthStencilAttachment = &attachmentReferences[MaxAttachments - 1]; + *subpass.PDepthStencilAttachment = new AttachmentReference(dsIndex, ImageLayout.General); + } + } + + var subpassDependency = CreateSubpassDependency(gd); + + fixed (AttachmentDescription* pAttachmentDescs = attachmentDescs) + { + var renderPassCreateInfo = new RenderPassCreateInfo + { + SType = StructureType.RenderPassCreateInfo, + PAttachments = pAttachmentDescs, + AttachmentCount = attachmentDescs != null ? (uint)attachmentDescs.Length : 0, + PSubpasses = &subpass, + SubpassCount = 1, + PDependencies = &subpassDependency, + DependencyCount = 1, + }; + + gd.Api.CreateRenderPass(device, in renderPassCreateInfo, null, out var renderPass).ThrowOnError(); + + return new DisposableRenderPass(gd.Api, device, renderPass); + } + } + + public static SubpassDependency CreateSubpassDependency(VulkanRenderer gd) + { + var (access, stages) = BarrierBatch.GetSubpassAccessSuperset(gd); + + return new SubpassDependency( + 0, + 0, + stages, + stages, + access, + access, + 0); + } + + public unsafe static SubpassDependency2 CreateSubpassDependency2(VulkanRenderer gd) + { + var (access, stages) = BarrierBatch.GetSubpassAccessSuperset(gd); + + return new SubpassDependency2( + StructureType.SubpassDependency2, + null, + 0, + 0, + stages, + stages, + access, + access, + 0); + } + + public static PipelineState ToVulkanPipelineState(this ProgramPipelineState state, VulkanRenderer gd) + { + PipelineState pipeline = new(); + pipeline.Initialize(); + + // It is assumed that Dynamic State is enabled when this conversion is used. + + pipeline.CullMode = state.CullEnable ? state.CullMode.Convert() : CullModeFlags.None; + + pipeline.DepthBoundsTestEnable = false; // Not implemented. + + pipeline.DepthClampEnable = state.DepthClampEnable; + + pipeline.DepthTestEnable = state.DepthTest.TestEnable; + pipeline.DepthWriteEnable = state.DepthTest.WriteEnable; + pipeline.DepthCompareOp = state.DepthTest.Func.Convert(); + pipeline.DepthMode = state.DepthMode == DepthMode.MinusOneToOne; + + pipeline.FrontFace = state.FrontFace.Convert(); + + pipeline.HasDepthStencil = state.DepthStencilEnable; + pipeline.LineWidth = state.LineWidth; + pipeline.LogicOpEnable = state.LogicOpEnable; + pipeline.LogicOp = state.LogicOp.Convert(); + + pipeline.PatchControlPoints = state.PatchControlPoints; + pipeline.PolygonMode = PolygonMode.Fill; // Not implemented. + pipeline.PrimitiveRestartEnable = state.PrimitiveRestartEnable; + pipeline.RasterizerDiscardEnable = state.RasterizerDiscard; + pipeline.SamplesCount = (uint)state.SamplesCount; + + if (gd.Capabilities.SupportsMultiView) + { + pipeline.ScissorsCount = Constants.MaxViewports; + pipeline.ViewportsCount = Constants.MaxViewports; + } + else + { + pipeline.ScissorsCount = 1; + pipeline.ViewportsCount = 1; + } + + pipeline.DepthBiasEnable = state.BiasEnable != 0; + + // Stencil masks and ref are dynamic, so are 0 in the Vulkan pipeline. + + pipeline.StencilFrontFailOp = state.StencilTest.FrontSFail.Convert(); + pipeline.StencilFrontPassOp = state.StencilTest.FrontDpPass.Convert(); + pipeline.StencilFrontDepthFailOp = state.StencilTest.FrontDpFail.Convert(); + pipeline.StencilFrontCompareOp = state.StencilTest.FrontFunc.Convert(); + + pipeline.StencilBackFailOp = state.StencilTest.BackSFail.Convert(); + pipeline.StencilBackPassOp = state.StencilTest.BackDpPass.Convert(); + pipeline.StencilBackDepthFailOp = state.StencilTest.BackDpFail.Convert(); + pipeline.StencilBackCompareOp = state.StencilTest.BackFunc.Convert(); + + pipeline.StencilTestEnable = state.StencilTest.TestEnable; + + pipeline.Topology = gd.TopologyRemap(state.Topology).Convert(); + + int vaCount = Math.Min(Constants.MaxVertexAttributes, state.VertexAttribCount); + int vbCount = Math.Min(Constants.MaxVertexBuffers, state.VertexBufferCount); + + Span vbScalarSizes = stackalloc int[vbCount]; + + for (int i = 0; i < vaCount; i++) + { + var attribute = state.VertexAttribs[i]; + var bufferIndex = attribute.IsZero ? 0 : attribute.BufferIndex + 1; + + pipeline.Internal.VertexAttributeDescriptions[i] = new VertexInputAttributeDescription( + (uint)i, + (uint)bufferIndex, + gd.FormatCapabilities.ConvertToVertexVkFormat(attribute.Format), + (uint)attribute.Offset); + + if (!attribute.IsZero && bufferIndex < vbCount) + { + vbScalarSizes[bufferIndex - 1] = Math.Max(attribute.Format.GetScalarSize(), vbScalarSizes[bufferIndex - 1]); + } + } + + int descriptorIndex = 1; + pipeline.Internal.VertexBindingDescriptions[0] = new VertexInputBindingDescription(0, 0, VertexInputRate.Vertex); + + for (int i = 0; i < vbCount; i++) + { + var vertexBuffer = state.VertexBuffers[i]; + + if (vertexBuffer.Enable) + { + var inputRate = vertexBuffer.Divisor != 0 ? VertexInputRate.Instance : VertexInputRate.Vertex; + + int alignedStride = vertexBuffer.Stride; + + if (gd.NeedsVertexBufferAlignment(vbScalarSizes[i], out int alignment)) + { + alignedStride = BitUtils.AlignUp(vertexBuffer.Stride, alignment); + } + + // TODO: Support divisor > 1 + pipeline.Internal.VertexBindingDescriptions[descriptorIndex++] = new VertexInputBindingDescription( + (uint)i + 1, + (uint)alignedStride, + inputRate); + } + } + + pipeline.VertexBindingDescriptionsCount = (uint)descriptorIndex; + + // NOTE: Viewports, Scissors are dynamic. + + for (int i = 0; i < Constants.MaxRenderTargets; i++) + { + var blend = state.BlendDescriptors[i]; + + if (blend.Enable && state.ColorWriteMask[i] != 0) + { + pipeline.Internal.ColorBlendAttachmentState[i] = new PipelineColorBlendAttachmentState( + blend.Enable, + blend.ColorSrcFactor.Convert(), + blend.ColorDstFactor.Convert(), + blend.ColorOp.Convert(), + blend.AlphaSrcFactor.Convert(), + blend.AlphaDstFactor.Convert(), + blend.AlphaOp.Convert(), + (ColorComponentFlags)state.ColorWriteMask[i]); + } + else + { + pipeline.Internal.ColorBlendAttachmentState[i] = new PipelineColorBlendAttachmentState( + colorWriteMask: (ColorComponentFlags)state.ColorWriteMask[i]); + } + } + + int attachmentCount = 0; + int maxColorAttachmentIndex = -1; + uint attachmentIntegerFormatMask = 0; + bool allFormatsFloatOrSrgb = true; + + for (int i = 0; i < Constants.MaxRenderTargets; i++) + { + if (state.AttachmentEnable[i]) + { + bool isNotMsOrSupportsStorage = gd.Capabilities.SupportsShaderStorageImageMultisample || + !state.AttachmentFormats[i].IsImageCompatible(); + + pipeline.Internal.AttachmentFormats[attachmentCount++] = gd.FormatCapabilities.ConvertToVkFormat(state.AttachmentFormats[i], isNotMsOrSupportsStorage); + maxColorAttachmentIndex = i; + + if (state.AttachmentFormats[i].IsInteger()) + { + attachmentIntegerFormatMask |= 1u << i; + } + + allFormatsFloatOrSrgb &= state.AttachmentFormats[i].IsFloatOrSrgb(); + } + } + + if (state.DepthStencilEnable) + { + bool isNotMsOrSupportsStorage = !state.DepthStencilFormat.IsImageCompatible() || + gd.Capabilities.SupportsShaderStorageImageMultisample; + + pipeline.Internal.AttachmentFormats[attachmentCount++] = gd.FormatCapabilities.ConvertToVkFormat(state.DepthStencilFormat, isNotMsOrSupportsStorage); + } + + pipeline.ColorBlendAttachmentStateCount = (uint)(maxColorAttachmentIndex + 1); + pipeline.VertexAttributeDescriptionsCount = (uint)Math.Min(Constants.MaxVertexAttributes, state.VertexAttribCount); + pipeline.Internal.AttachmentIntegerFormatMask = attachmentIntegerFormatMask; + pipeline.Internal.LogicOpsAllowed = attachmentCount == 0 || !allFormatsFloatOrSrgb; + + return pipeline; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineDynamicState.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineDynamicState.cs new file mode 100644 index 000000000..74c330a78 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineDynamicState.cs @@ -0,0 +1,203 @@ +using Ryujinx.Common.Memory; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.EXT; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + struct PipelineDynamicState + { + private float _depthBiasSlopeFactor; + private float _depthBiasConstantFactor; + private float _depthBiasClamp; + + public int ScissorsCount; + private Array16 _scissors; + + private uint _backCompareMask; + private uint _backWriteMask; + private uint _backReference; + private uint _frontCompareMask; + private uint _frontWriteMask; + private uint _frontReference; + + private Array4 _blendConstants; + + private FeedbackLoopAspects _feedbackLoopAspects; + + public uint ViewportsCount; + public Array16 Viewports; + + private enum DirtyFlags + { + None = 0, + Blend = 1 << 0, + DepthBias = 1 << 1, + Scissor = 1 << 2, + Stencil = 1 << 3, + Viewport = 1 << 4, + FeedbackLoop = 1 << 5, + All = Blend | DepthBias | Scissor | Stencil | Viewport | FeedbackLoop, + } + + private DirtyFlags _dirty; + + public void SetBlendConstants(float r, float g, float b, float a) + { + _blendConstants[0] = r; + _blendConstants[1] = g; + _blendConstants[2] = b; + _blendConstants[3] = a; + + _dirty |= DirtyFlags.Blend; + } + + public void SetDepthBias(float slopeFactor, float constantFactor, float clamp) + { + _depthBiasSlopeFactor = slopeFactor; + _depthBiasConstantFactor = constantFactor; + _depthBiasClamp = clamp; + + _dirty |= DirtyFlags.DepthBias; + } + + public void SetScissor(int index, Rect2D scissor) + { + _scissors[index] = scissor; + + _dirty |= DirtyFlags.Scissor; + } + + public void SetStencilMasks( + uint backCompareMask, + uint backWriteMask, + uint backReference, + uint frontCompareMask, + uint frontWriteMask, + uint frontReference) + { + _backCompareMask = backCompareMask; + _backWriteMask = backWriteMask; + _backReference = backReference; + _frontCompareMask = frontCompareMask; + _frontWriteMask = frontWriteMask; + _frontReference = frontReference; + + _dirty |= DirtyFlags.Stencil; + } + + public void SetViewport(int index, Viewport viewport) + { + Viewports[index] = viewport; + + _dirty |= DirtyFlags.Viewport; + } + + public void SetViewports(ref Array16 viewports, uint viewportsCount) + { + Viewports = viewports; + ViewportsCount = viewportsCount; + + if (ViewportsCount != 0) + { + _dirty |= DirtyFlags.Viewport; + } + } + + public void SetFeedbackLoop(FeedbackLoopAspects aspects) + { + _feedbackLoopAspects = aspects; + + _dirty |= DirtyFlags.FeedbackLoop; + } + + public void ForceAllDirty() + { + _dirty = DirtyFlags.All; + } + + public void ReplayIfDirty(VulkanRenderer gd, CommandBuffer commandBuffer) + { + Vk api = gd.Api; + + if (_dirty.HasFlag(DirtyFlags.Blend)) + { + RecordBlend(api, commandBuffer); + } + + if (_dirty.HasFlag(DirtyFlags.DepthBias)) + { + RecordDepthBias(api, commandBuffer); + } + + if (_dirty.HasFlag(DirtyFlags.Scissor)) + { + RecordScissor(api, commandBuffer); + } + + if (_dirty.HasFlag(DirtyFlags.Stencil)) + { + RecordStencilMasks(api, commandBuffer); + } + + if (_dirty.HasFlag(DirtyFlags.Viewport)) + { + RecordViewport(api, commandBuffer); + } + + if (_dirty.HasFlag(DirtyFlags.FeedbackLoop) && gd.Capabilities.SupportsDynamicAttachmentFeedbackLoop) + { + RecordFeedbackLoop(gd.DynamicFeedbackLoopApi, commandBuffer); + } + + _dirty = DirtyFlags.None; + } + + private void RecordBlend(Vk api, CommandBuffer commandBuffer) + { + api.CmdSetBlendConstants(commandBuffer, _blendConstants.AsSpan()); + } + + private readonly void RecordDepthBias(Vk api, CommandBuffer commandBuffer) + { + api.CmdSetDepthBias(commandBuffer, _depthBiasConstantFactor, _depthBiasClamp, _depthBiasSlopeFactor); + } + + private void RecordScissor(Vk api, CommandBuffer commandBuffer) + { + if (ScissorsCount != 0) + { + api.CmdSetScissor(commandBuffer, 0, (uint)ScissorsCount, _scissors.AsSpan()); + } + } + + private readonly void RecordStencilMasks(Vk api, CommandBuffer commandBuffer) + { + api.CmdSetStencilCompareMask(commandBuffer, StencilFaceFlags.FaceBackBit, _backCompareMask); + api.CmdSetStencilWriteMask(commandBuffer, StencilFaceFlags.FaceBackBit, _backWriteMask); + api.CmdSetStencilReference(commandBuffer, StencilFaceFlags.FaceBackBit, _backReference); + api.CmdSetStencilCompareMask(commandBuffer, StencilFaceFlags.FaceFrontBit, _frontCompareMask); + api.CmdSetStencilWriteMask(commandBuffer, StencilFaceFlags.FaceFrontBit, _frontWriteMask); + api.CmdSetStencilReference(commandBuffer, StencilFaceFlags.FaceFrontBit, _frontReference); + } + + private void RecordViewport(Vk api, CommandBuffer commandBuffer) + { + if (ViewportsCount != 0) + { + api.CmdSetViewport(commandBuffer, 0, ViewportsCount, Viewports.AsSpan()); + } + } + + private readonly void RecordFeedbackLoop(ExtAttachmentFeedbackLoopDynamicState api, CommandBuffer commandBuffer) + { + ImageAspectFlags aspects = (_feedbackLoopAspects & FeedbackLoopAspects.Color) != 0 ? ImageAspectFlags.ColorBit : 0; + + if ((_feedbackLoopAspects & FeedbackLoopAspects.Depth) != 0) + { + aspects |= ImageAspectFlags.DepthBit | ImageAspectFlags.StencilBit; + } + + api.CmdSetAttachmentFeedbackLoopEnable(commandBuffer, aspects); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineFull.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineFull.cs new file mode 100644 index 000000000..248a89c3a --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineFull.cs @@ -0,0 +1,351 @@ +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Rdna3Vulkan.Queries; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class PipelineFull : PipelineBase, IPipeline + { + private const ulong MinByteWeightForFlush = 256 * 1024 * 1024; // MiB + + private readonly List<(QueryPool, bool)> _activeQueries; + private CounterQueueEvent _activeConditionalRender; + + private readonly List _pendingQueryCopies; + private readonly List _activeBufferMirrors; + + private ulong _byteWeight; + + private readonly List _backingSwaps; + + public PipelineFull(VulkanRenderer gd, Device device) : base(gd, device) + { + _activeQueries = new List<(QueryPool, bool)>(); + _pendingQueryCopies = new(); + _backingSwaps = new(); + _activeBufferMirrors = new(); + + CommandBuffer = (Cbs = gd.CommandBufferPool.Rent()).CommandBuffer; + + IsMainPipeline = true; + } + + private void CopyPendingQuery() + { + foreach (var query in _pendingQueryCopies) + { + query.PoolCopy(Cbs); + } + + _pendingQueryCopies.Clear(); + } + + public void ClearRenderTargetColor(int index, int layer, int layerCount, uint componentMask, ColorF color) + { + if (FramebufferParams == null) + { + return; + } + + if (componentMask != 0xf || Gd.IsQualcommProprietary) + { + // We can't use CmdClearAttachments if not writing all components, + // because on Vulkan, the pipeline state does not affect clears. + // On proprietary Adreno drivers, CmdClearAttachments appears to execute out of order, so it's better to not use it at all. + var dstTexture = FramebufferParams.GetColorView(index); + if (dstTexture == null) + { + return; + } + + Span clearColor = stackalloc float[4]; + clearColor[0] = color.Red; + clearColor[1] = color.Green; + clearColor[2] = color.Blue; + clearColor[3] = color.Alpha; + + // TODO: Clear only the specified layer. + Gd.HelperShader.Clear( + Gd, + dstTexture, + clearColor, + componentMask, + (int)FramebufferParams.Width, + (int)FramebufferParams.Height, + FramebufferParams.GetAttachmentComponentType(index), + ClearScissor); + } + else + { + ClearRenderTargetColor(index, layer, layerCount, color); + } + } + + public void ClearRenderTargetDepthStencil(int layer, int layerCount, float depthValue, bool depthMask, int stencilValue, int stencilMask) + { + if (FramebufferParams == null) + { + return; + } + + if ((stencilMask != 0 && stencilMask != 0xff) || Gd.IsQualcommProprietary) + { + // We can't use CmdClearAttachments if not clearing all (mask is all ones, 0xFF) or none (mask is 0) of the stencil bits, + // because on Vulkan, the pipeline state does not affect clears. + // On proprietary Adreno drivers, CmdClearAttachments appears to execute out of order, so it's better to not use it at all. + var dstTexture = FramebufferParams.GetDepthStencilView(); + if (dstTexture == null) + { + return; + } + + // TODO: Clear only the specified layer. + Gd.HelperShader.Clear( + Gd, + dstTexture, + depthValue, + depthMask, + stencilValue, + stencilMask, + (int)FramebufferParams.Width, + (int)FramebufferParams.Height, + FramebufferParams.AttachmentFormats[FramebufferParams.AttachmentsCount - 1], + ClearScissor); + } + else + { + ClearRenderTargetDepthStencil(layer, layerCount, depthValue, depthMask, stencilValue, stencilMask != 0); + } + } + + public void EndHostConditionalRendering() + { + if (Gd.Capabilities.SupportsConditionalRendering) + { + // Gd.ConditionalRenderingApi.CmdEndConditionalRendering(CommandBuffer); + } + else + { + // throw new NotSupportedException(); + } + + _activeConditionalRender?.ReleaseHostAccess(); + _activeConditionalRender = null; + } + + public bool TryHostConditionalRendering(ICounterEvent value, ulong compare, bool isEqual) + { + // Compare an event and a constant value. + if (value is CounterQueueEvent evt) + { + // Easy host conditional rendering when the check matches what GL can do: + // - Event is of type samples passed. + // - Result is not a combination of multiple queries. + // - Comparing against 0. + // - Event has not already been flushed. + + if (compare == 0 && evt.Type == CounterType.SamplesPassed && evt.ClearCounter) + { + if (!value.ReserveForHostAccess()) + { + // If the event has been flushed, then just use the values on the CPU. + // The query object may already be repurposed for another draw (eg. begin + end). + return false; + } + + if (Gd.Capabilities.SupportsConditionalRendering) + { + // var buffer = evt.GetBuffer().Get(Cbs, 0, sizeof(long)).Value; + // var flags = isEqual ? ConditionalRenderingFlagsEXT.InvertedBitExt : 0; + + // var conditionalRenderingBeginInfo = new ConditionalRenderingBeginInfoEXT + // { + // SType = StructureType.ConditionalRenderingBeginInfoExt, + // Buffer = buffer, + // Flags = flags, + // }; + + // Gd.ConditionalRenderingApi.CmdBeginConditionalRendering(CommandBuffer, conditionalRenderingBeginInfo); + } + + _activeConditionalRender = evt; + return true; + } + } + + // The GPU will flush the queries to CPU and evaluate the condition there instead. + + FlushPendingQuery(); // The thread will be stalled manually flushing the counter, so flush commands now. + return false; + } + + public bool TryHostConditionalRendering(ICounterEvent value, ICounterEvent compare, bool isEqual) + { + FlushPendingQuery(); // The thread will be stalled manually flushing the counter, so flush commands now. + return false; + } + + private void FlushPendingQuery() + { + if (AutoFlush.ShouldFlushQuery()) + { + FlushCommandsImpl(); + } + } + + public CommandBufferScoped GetPreloadCommandBuffer() + { + PreloadCbs ??= Gd.CommandBufferPool.Rent(); + + return PreloadCbs.Value; + } + + public void FlushCommandsIfWeightExceeding(IAuto disposedResource, ulong byteWeight) + { + bool usedByCurrentCb = disposedResource.HasCommandBufferDependency(Cbs); + + if (PreloadCbs != null && !usedByCurrentCb) + { + usedByCurrentCb = disposedResource.HasCommandBufferDependency(PreloadCbs.Value); + } + + if (usedByCurrentCb) + { + // Since we can only free memory after the command buffer that uses a given resource was executed, + // keeping the command buffer might cause a high amount of memory to be in use. + // To prevent that, we force submit command buffers if the memory usage by resources + // in use by the current command buffer is above a given limit, and those resources were disposed. + _byteWeight += byteWeight; + + if (_byteWeight >= MinByteWeightForFlush) + { + FlushCommandsImpl(); + } + } + } + + public void Restore() + { + if (Pipeline != null) + { + Gd.Api.CmdBindPipeline(CommandBuffer, Pbp, Pipeline.Get(Cbs).Value); + } + + SignalCommandBufferChange(); + + if (Pipeline != null && Pbp == PipelineBindPoint.Graphics) + { + DynamicState.ReplayIfDirty(Gd, CommandBuffer); + } + } + + public void FlushCommandsImpl() + { + AutoFlush.RegisterFlush(DrawCount); + EndRenderPass(); + + foreach ((var queryPool, _) in _activeQueries) + { + Gd.Api.CmdEndQuery(CommandBuffer, queryPool, 0); + } + + _byteWeight = 0; + + if (PreloadCbs != null) + { + PreloadCbs.Value.Dispose(); + PreloadCbs = null; + } + + Gd.Barriers.Flush(Cbs, false, null, null); + CommandBuffer = (Cbs = Gd.CommandBufferPool.ReturnAndRent(Cbs)).CommandBuffer; + Gd.RegisterFlush(); + + // Restore per-command buffer state. + foreach (BufferHolder buffer in _activeBufferMirrors) + { + buffer.ClearMirrors(); + } + + _activeBufferMirrors.Clear(); + + foreach ((var queryPool, var isOcclusion) in _activeQueries) + { + bool isPrecise = Gd.Capabilities.SupportsPreciseOcclusionQueries && isOcclusion; + + Gd.Api.CmdResetQueryPool(CommandBuffer, queryPool, 0, 1); + Gd.Api.CmdBeginQuery(CommandBuffer, queryPool, 0, isPrecise ? QueryControlFlags.PreciseBit : 0); + } + + Gd.ResetCounterPool(); + + Restore(); + } + + public void RegisterActiveMirror(BufferHolder buffer) + { + _activeBufferMirrors.Add(buffer); + } + + public void BeginQuery(BufferedQuery query, QueryPool pool, bool needsReset, bool isOcclusion, bool fromSamplePool) + { + if (needsReset) + { + EndRenderPass(); + + Gd.Api.CmdResetQueryPool(CommandBuffer, pool, 0, 1); + + if (fromSamplePool) + { + // Try reset some additional queries in advance. + + Gd.ResetFutureCounters(CommandBuffer, AutoFlush.GetRemainingQueries()); + } + } + + bool isPrecise = Gd.Capabilities.SupportsPreciseOcclusionQueries && isOcclusion; + Gd.Api.CmdBeginQuery(CommandBuffer, pool, 0, isPrecise ? QueryControlFlags.PreciseBit : 0); + + _activeQueries.Add((pool, isOcclusion)); + } + + public void EndQuery(QueryPool pool) + { + Gd.Api.CmdEndQuery(CommandBuffer, pool, 0); + + for (int i = 0; i < _activeQueries.Count; i++) + { + if (_activeQueries[i].Item1.Handle == pool.Handle) + { + _activeQueries.RemoveAt(i); + break; + } + } + } + + public void CopyQueryResults(BufferedQuery query) + { + _pendingQueryCopies.Add(query); + + if (AutoFlush.RegisterPendingQuery()) + { + FlushCommandsImpl(); + } + } + + protected override void SignalAttachmentChange() + { + if (AutoFlush.ShouldFlushAttachmentChange(DrawCount)) + { + FlushCommandsImpl(); + } + } + + protected override void SignalRenderPassEnd() + { + CopyPendingQuery(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineHelperShader.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineHelperShader.cs new file mode 100644 index 000000000..f196c2f6e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineHelperShader.cs @@ -0,0 +1,54 @@ +using Silk.NET.Vulkan; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class PipelineHelperShader : PipelineBase + { + public PipelineHelperShader(VulkanRenderer gd, Device device) : base(gd, device) + { + } + + public void SetRenderTarget(TextureView view, uint width, uint height) + { + CreateFramebuffer(view, width, height); + CreateRenderPass(); + SignalStateChange(); + } + + private void CreateFramebuffer(TextureView view, uint width, uint height) + { + FramebufferParams = new FramebufferParams(Device, view, width, height); + UpdatePipelineAttachmentFormats(); + } + + public void SetCommandBuffer(CommandBufferScoped cbs) + { + CommandBuffer = (Cbs = cbs).CommandBuffer; + + // Restore per-command buffer state. + + if (Pipeline != null) + { + Gd.Api.CmdBindPipeline(CommandBuffer, Pbp, Pipeline.Get(CurrentCommandBuffer).Value); + } + + SignalCommandBufferChange(); + } + + public void Finish() + { + EndRenderPass(); + } + + public void Finish(VulkanRenderer gd, CommandBufferScoped cbs) + { + Finish(); + + if (gd.PipelineInternal.IsCommandBufferActive(cbs.CommandBuffer)) + { + gd.PipelineInternal.Restore(); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCache.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCache.cs new file mode 100644 index 000000000..754854d2b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCache.cs @@ -0,0 +1,107 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Concurrent; +using System.Collections.ObjectModel; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class PipelineLayoutCache + { + private readonly struct PlceKey : IEquatable + { + public readonly ReadOnlyCollection SetDescriptors; + public readonly bool UsePushDescriptors; + + public PlceKey(ReadOnlyCollection setDescriptors, bool usePushDescriptors) + { + SetDescriptors = setDescriptors; + UsePushDescriptors = usePushDescriptors; + } + + public override int GetHashCode() + { + HashCode hasher = new(); + + if (SetDescriptors != null) + { + foreach (var setDescriptor in SetDescriptors) + { + hasher.Add(setDescriptor); + } + } + + hasher.Add(UsePushDescriptors); + + return hasher.ToHashCode(); + } + + public override bool Equals(object obj) + { + return obj is PlceKey other && Equals(other); + } + + public bool Equals(PlceKey other) + { + if ((SetDescriptors == null) != (other.SetDescriptors == null)) + { + return false; + } + + if (SetDescriptors != null) + { + if (SetDescriptors.Count != other.SetDescriptors.Count) + { + return false; + } + + for (int index = 0; index < SetDescriptors.Count; index++) + { + if (!SetDescriptors[index].Equals(other.SetDescriptors[index])) + { + return false; + } + } + } + + return UsePushDescriptors == other.UsePushDescriptors; + } + } + + private readonly ConcurrentDictionary _plces; + + public PipelineLayoutCache() + { + _plces = new ConcurrentDictionary(); + } + + public PipelineLayoutCacheEntry GetOrCreate( + VulkanRenderer gd, + Device device, + ReadOnlyCollection setDescriptors, + bool usePushDescriptors) + { + var key = new PlceKey(setDescriptors, usePushDescriptors); + + return _plces.GetOrAdd(key, newKey => new PipelineLayoutCacheEntry(gd, device, setDescriptors, usePushDescriptors)); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + foreach (var plce in _plces.Values) + { + plce.Dispose(); + } + + _plces.Clear(); + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCacheEntry.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCacheEntry.cs new file mode 100644 index 000000000..537952da8 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutCacheEntry.cs @@ -0,0 +1,383 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class PipelineLayoutCacheEntry + { + private const int MaxPoolSizesPerSet = 8; + + private readonly VulkanRenderer _gd; + private readonly Device _device; + + public DescriptorSetLayout[] DescriptorSetLayouts { get; } + public bool[] DescriptorSetLayoutsUpdateAfterBind { get; } + public PipelineLayout PipelineLayout { get; } + + private readonly int[] _consumedDescriptorsPerSet; + private readonly DescriptorPoolSize[][] _poolSizes; + + private readonly DescriptorSetManager _descriptorSetManager; + + private readonly List>[][] _dsCache; + private List>[] _currentDsCache; + private readonly int[] _dsCacheCursor; + private int _dsLastCbIndex; + private int _dsLastSubmissionCount; + + private struct ManualDescriptorSetEntry + { + public Auto DescriptorSet; + public uint CbRefMask; + public bool InUse; + + public ManualDescriptorSetEntry(Auto descriptorSet, int cbIndex) + { + DescriptorSet = descriptorSet; + CbRefMask = 1u << cbIndex; + InUse = true; + } + } + + private readonly struct PendingManualDsConsumption + { + public FenceHolder Fence { get; } + public int CommandBufferIndex { get; } + public int SetIndex { get; } + public int CacheIndex { get; } + + public PendingManualDsConsumption(FenceHolder fence, int commandBufferIndex, int setIndex, int cacheIndex) + { + Fence = fence; + CommandBufferIndex = commandBufferIndex; + SetIndex = setIndex; + CacheIndex = cacheIndex; + fence.Get(); + } + } + + private readonly List[] _manualDsCache; + private readonly Queue _pendingManualDsConsumptions; + private readonly Queue[] _freeManualDsCacheEntries; + + private readonly Dictionary _pdTemplates; + private readonly ResourceDescriptorCollection _pdDescriptors; + private long _lastPdUsage; + private DescriptorSetTemplate _lastPdTemplate; + + private PipelineLayoutCacheEntry(VulkanRenderer gd, Device device, int setsCount) + { + _gd = gd; + _device = device; + + _dsCache = new List>[CommandBufferPool.MaxCommandBuffers][]; + + for (int i = 0; i < CommandBufferPool.MaxCommandBuffers; i++) + { + _dsCache[i] = new List>[setsCount]; + + for (int j = 0; j < _dsCache[i].Length; j++) + { + _dsCache[i][j] = new List>(); + } + } + + _dsCacheCursor = new int[setsCount]; + _manualDsCache = new List[setsCount]; + _pendingManualDsConsumptions = new Queue(); + _freeManualDsCacheEntries = new Queue[setsCount]; + } + + public PipelineLayoutCacheEntry( + VulkanRenderer gd, + Device device, + ReadOnlyCollection setDescriptors, + bool usePushDescriptors) : this(gd, device, setDescriptors.Count) + { + ResourceLayouts layouts = PipelineLayoutFactory.Create(gd, device, setDescriptors, usePushDescriptors); + + DescriptorSetLayouts = layouts.DescriptorSetLayouts; + DescriptorSetLayoutsUpdateAfterBind = layouts.DescriptorSetLayoutsUpdateAfterBind; + PipelineLayout = layouts.PipelineLayout; + + _consumedDescriptorsPerSet = new int[setDescriptors.Count]; + _poolSizes = new DescriptorPoolSize[setDescriptors.Count][]; + + Span poolSizes = stackalloc DescriptorPoolSize[MaxPoolSizesPerSet]; + + for (int setIndex = 0; setIndex < setDescriptors.Count; setIndex++) + { + int count = 0; + + foreach (var descriptor in setDescriptors[setIndex].Descriptors) + { + count += descriptor.Count; + } + + _consumedDescriptorsPerSet[setIndex] = count; + _poolSizes[setIndex] = GetDescriptorPoolSizes(poolSizes, setDescriptors[setIndex], DescriptorSetManager.MaxSets).ToArray(); + } + + if (usePushDescriptors) + { + _pdDescriptors = setDescriptors[0]; + _pdTemplates = new(); + } + + _descriptorSetManager = new DescriptorSetManager(_device, setDescriptors.Count); + } + + public void UpdateCommandBufferIndex(int commandBufferIndex) + { + int submissionCount = _gd.CommandBufferPool.GetSubmissionCount(commandBufferIndex); + + if (_dsLastCbIndex != commandBufferIndex || _dsLastSubmissionCount != submissionCount) + { + _dsLastCbIndex = commandBufferIndex; + _dsLastSubmissionCount = submissionCount; + Array.Clear(_dsCacheCursor); + } + + _currentDsCache = _dsCache[commandBufferIndex]; + } + + public Auto GetNewDescriptorSetCollection(int setIndex, out bool isNew) + { + var list = _currentDsCache[setIndex]; + int index = _dsCacheCursor[setIndex]++; + if (index == list.Count) + { + var dsc = _descriptorSetManager.AllocateDescriptorSet( + _gd.Api, + DescriptorSetLayouts[setIndex], + _poolSizes[setIndex], + setIndex, + _consumedDescriptorsPerSet[setIndex], + DescriptorSetLayoutsUpdateAfterBind[setIndex]); + + list.Add(dsc); + isNew = true; + return dsc; + } + + isNew = false; + return list[index]; + } + + public Auto GetNewManualDescriptorSetCollection(CommandBufferScoped cbs, int setIndex, out int cacheIndex) + { + FreeCompletedManualDescriptorSets(); + + var list = _manualDsCache[setIndex] ??= new(); + var span = CollectionsMarshal.AsSpan(list); + + Queue freeQueue = _freeManualDsCacheEntries[setIndex]; + + // Do we have at least one freed descriptor set? If so, just use that. + if (freeQueue != null && freeQueue.TryDequeue(out int freeIndex)) + { + ref ManualDescriptorSetEntry entry = ref span[freeIndex]; + + Debug.Assert(!entry.InUse && entry.CbRefMask == 0); + + entry.InUse = true; + entry.CbRefMask = 1u << cbs.CommandBufferIndex; + cacheIndex = freeIndex; + + _pendingManualDsConsumptions.Enqueue(new PendingManualDsConsumption(cbs.GetFence(), cbs.CommandBufferIndex, setIndex, freeIndex)); + + return entry.DescriptorSet; + } + + // Otherwise create a new descriptor set, and add to our pending queue for command buffer consumption tracking. + var dsc = _descriptorSetManager.AllocateDescriptorSet( + _gd.Api, + DescriptorSetLayouts[setIndex], + _poolSizes[setIndex], + setIndex, + _consumedDescriptorsPerSet[setIndex], + DescriptorSetLayoutsUpdateAfterBind[setIndex]); + + cacheIndex = list.Count; + list.Add(new ManualDescriptorSetEntry(dsc, cbs.CommandBufferIndex)); + _pendingManualDsConsumptions.Enqueue(new PendingManualDsConsumption(cbs.GetFence(), cbs.CommandBufferIndex, setIndex, cacheIndex)); + + return dsc; + } + + public void UpdateManualDescriptorSetCollectionOwnership(CommandBufferScoped cbs, int setIndex, int cacheIndex) + { + FreeCompletedManualDescriptorSets(); + + var list = _manualDsCache[setIndex]; + var span = CollectionsMarshal.AsSpan(list); + ref var entry = ref span[cacheIndex]; + + uint cbMask = 1u << cbs.CommandBufferIndex; + + if ((entry.CbRefMask & cbMask) == 0) + { + entry.CbRefMask |= cbMask; + + _pendingManualDsConsumptions.Enqueue(new PendingManualDsConsumption(cbs.GetFence(), cbs.CommandBufferIndex, setIndex, cacheIndex)); + } + } + + private void FreeCompletedManualDescriptorSets() + { + FenceHolder signalledFence = null; + while (_pendingManualDsConsumptions.TryPeek(out var pds) && (pds.Fence == signalledFence || pds.Fence.IsSignaled())) + { + signalledFence = pds.Fence; // Already checked - don't need to do it again. + var dequeued = _pendingManualDsConsumptions.Dequeue(); + Debug.Assert(dequeued.Fence == pds.Fence); + pds.Fence.Put(); + + var span = CollectionsMarshal.AsSpan(_manualDsCache[dequeued.SetIndex]); + ref var entry = ref span[dequeued.CacheIndex]; + entry.CbRefMask &= ~(1u << dequeued.CommandBufferIndex); + + if (!entry.InUse && entry.CbRefMask == 0) + { + // If not in use by any array, and not bound to any command buffer, the descriptor set can be re-used immediately. + (_freeManualDsCacheEntries[dequeued.SetIndex] ??= new()).Enqueue(dequeued.CacheIndex); + } + } + } + + public void ReleaseManualDescriptorSetCollection(int setIndex, int cacheIndex) + { + var list = _manualDsCache[setIndex]; + var span = CollectionsMarshal.AsSpan(list); + + span[cacheIndex].InUse = false; + + if (span[cacheIndex].CbRefMask == 0) + { + // This is no longer in use by any array, so if not bound to any command buffer, the descriptor set can be re-used immediately. + (_freeManualDsCacheEntries[setIndex] ??= new()).Enqueue(cacheIndex); + } + } + + private static Span GetDescriptorPoolSizes(Span output, ResourceDescriptorCollection setDescriptor, uint multiplier) + { + int count = 0; + + for (int index = 0; index < setDescriptor.Descriptors.Count; index++) + { + ResourceDescriptor descriptor = setDescriptor.Descriptors[index]; + DescriptorType descriptorType = descriptor.Type.Convert(); + + bool found = false; + + for (int poolSizeIndex = 0; poolSizeIndex < count; poolSizeIndex++) + { + if (output[poolSizeIndex].Type == descriptorType) + { + output[poolSizeIndex].DescriptorCount += (uint)descriptor.Count * multiplier; + found = true; + break; + } + } + + if (!found) + { + output[count++] = new DescriptorPoolSize() + { + Type = descriptorType, + DescriptorCount = (uint)descriptor.Count, + }; + } + } + + return output[..count]; + } + + public DescriptorSetTemplate GetPushDescriptorTemplate(PipelineBindPoint pbp, long updateMask) + { + if (_lastPdUsage == updateMask && _lastPdTemplate != null) + { + // Most likely result is that it asks to update the same buffers. + return _lastPdTemplate; + } + + if (!_pdTemplates.TryGetValue(updateMask, out DescriptorSetTemplate template)) + { + template = new DescriptorSetTemplate(_gd, _device, _pdDescriptors, updateMask, this, pbp, 0); + + _pdTemplates.Add(updateMask, template); + } + + _lastPdUsage = updateMask; + _lastPdTemplate = template; + + return template; + } + + protected virtual unsafe void Dispose(bool disposing) + { + if (disposing) + { + if (_pdTemplates != null) + { + foreach (DescriptorSetTemplate template in _pdTemplates.Values) + { + template.Dispose(); + } + } + + for (int i = 0; i < _dsCache.Length; i++) + { + for (int j = 0; j < _dsCache[i].Length; j++) + { + for (int k = 0; k < _dsCache[i][j].Count; k++) + { + _dsCache[i][j][k].Dispose(); + } + + _dsCache[i][j].Clear(); + } + } + + for (int i = 0; i < _manualDsCache.Length; i++) + { + if (_manualDsCache[i] == null) + { + continue; + } + + for (int j = 0; j < _manualDsCache[i].Count; j++) + { + _manualDsCache[i][j].DescriptorSet.Dispose(); + } + + _manualDsCache[i].Clear(); + } + + _gd.Api.DestroyPipelineLayout(_device, PipelineLayout, null); + + for (int i = 0; i < DescriptorSetLayouts.Length; i++) + { + _gd.Api.DestroyDescriptorSetLayout(_device, DescriptorSetLayouts[i], null); + } + + while (_pendingManualDsConsumptions.TryDequeue(out var pds)) + { + pds.Fence.Put(); + } + + _descriptorSetManager.Dispose(); + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutFactory.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutFactory.cs new file mode 100644 index 000000000..04411940e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineLayoutFactory.cs @@ -0,0 +1,115 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.ObjectModel; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + record struct ResourceLayouts(DescriptorSetLayout[] DescriptorSetLayouts, bool[] DescriptorSetLayoutsUpdateAfterBind, PipelineLayout PipelineLayout); + + static class PipelineLayoutFactory + { + public static unsafe ResourceLayouts Create( + VulkanRenderer gd, + Device device, + ReadOnlyCollection setDescriptors, + bool usePushDescriptors) + { + DescriptorSetLayout[] layouts = new DescriptorSetLayout[setDescriptors.Count]; + bool[] updateAfterBindFlags = new bool[setDescriptors.Count]; + + bool isMoltenVk = gd.IsMoltenVk; + + for (int setIndex = 0; setIndex < setDescriptors.Count; setIndex++) + { + ResourceDescriptorCollection rdc = setDescriptors[setIndex]; + + ResourceStages activeStages = ResourceStages.None; + + if (isMoltenVk) + { + for (int descIndex = 0; descIndex < rdc.Descriptors.Count; descIndex++) + { + activeStages |= rdc.Descriptors[descIndex].Stages; + } + } + + DescriptorSetLayoutBinding[] layoutBindings = new DescriptorSetLayoutBinding[rdc.Descriptors.Count]; + + bool hasArray = false; + + for (int descIndex = 0; descIndex < rdc.Descriptors.Count; descIndex++) + { + ResourceDescriptor descriptor = rdc.Descriptors[descIndex]; + ResourceStages stages = descriptor.Stages; + + if (descriptor.Type == ResourceType.StorageBuffer && isMoltenVk) + { + // There's a bug on MoltenVK where using the same buffer across different stages + // causes invalid resource errors, allow the binding on all active stages as workaround. + stages = activeStages; + } + + layoutBindings[descIndex] = new DescriptorSetLayoutBinding + { + Binding = (uint)descriptor.Binding, + DescriptorType = descriptor.Type.Convert(), + DescriptorCount = (uint)descriptor.Count, + StageFlags = stages.Convert(), + }; + + if (descriptor.Count > 1) + { + hasArray = true; + } + } + + fixed (DescriptorSetLayoutBinding* pLayoutBindings = layoutBindings) + { + DescriptorSetLayoutCreateFlags flags = DescriptorSetLayoutCreateFlags.None; + + if (usePushDescriptors && setIndex == 0) + { + flags = DescriptorSetLayoutCreateFlags.PushDescriptorBitKhr; + } + + if (gd.Vendor == Vendor.Intel && hasArray) + { + // Some vendors (like Intel) have low per-stage limits. + // We must set the flag if we exceed those limits. + flags |= DescriptorSetLayoutCreateFlags.UpdateAfterBindPoolBit; + + updateAfterBindFlags[setIndex] = true; + } + + var descriptorSetLayoutCreateInfo = new DescriptorSetLayoutCreateInfo + { + SType = StructureType.DescriptorSetLayoutCreateInfo, + PBindings = pLayoutBindings, + BindingCount = (uint)layoutBindings.Length, + Flags = flags, + }; + + gd.Api.CreateDescriptorSetLayout(device, in descriptorSetLayoutCreateInfo, null, out layouts[setIndex]).ThrowOnError(); + } + } + + PipelineLayout layout; + + fixed (DescriptorSetLayout* pLayouts = layouts) + { + var pipelineLayoutCreateInfo = new PipelineLayoutCreateInfo + { + SType = StructureType.PipelineLayoutCreateInfo, + PSetLayouts = pLayouts, + SetLayoutCount = (uint)layouts.Length, + }; + + gd.Api.CreatePipelineLayout(device, &pipelineLayoutCreateInfo, null, out layout).ThrowOnError(); + } + + return new ResourceLayouts(layouts, updateAfterBindFlags, layout); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineState.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineState.cs new file mode 100644 index 000000000..b683c6334 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineState.cs @@ -0,0 +1,732 @@ +using Ryujinx.Common.Memory; +using Silk.NET.Vulkan; +using System; +using System.Numerics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + struct PipelineState : IDisposable + { + private const int RequiredSubgroupSize = 32; + private const int MaxDynamicStatesCount = 9; + + public PipelineUid Internal; + + public float LineWidth + { + readonly get => BitConverter.Int32BitsToSingle((int)((Internal.Id0 >> 0) & 0xFFFFFFFF)); + set => Internal.Id0 = (Internal.Id0 & 0xFFFFFFFF00000000) | ((ulong)(uint)BitConverter.SingleToInt32Bits(value) << 0); + } + + public float DepthBiasClamp + { + readonly get => BitConverter.Int32BitsToSingle((int)((Internal.Id0 >> 32) & 0xFFFFFFFF)); + set => Internal.Id0 = (Internal.Id0 & 0xFFFFFFFF) | ((ulong)(uint)BitConverter.SingleToInt32Bits(value) << 32); + } + + public float DepthBiasConstantFactor + { + readonly get => BitConverter.Int32BitsToSingle((int)((Internal.Id1 >> 0) & 0xFFFFFFFF)); + set => Internal.Id1 = (Internal.Id1 & 0xFFFFFFFF00000000) | ((ulong)(uint)BitConverter.SingleToInt32Bits(value) << 0); + } + + public float DepthBiasSlopeFactor + { + readonly get => BitConverter.Int32BitsToSingle((int)((Internal.Id1 >> 32) & 0xFFFFFFFF)); + set => Internal.Id1 = (Internal.Id1 & 0xFFFFFFFF) | ((ulong)(uint)BitConverter.SingleToInt32Bits(value) << 32); + } + + public uint StencilFrontCompareMask + { + readonly get => (uint)((Internal.Id2 >> 0) & 0xFFFFFFFF); + set => Internal.Id2 = (Internal.Id2 & 0xFFFFFFFF00000000) | ((ulong)value << 0); + } + + public uint StencilFrontWriteMask + { + readonly get => (uint)((Internal.Id2 >> 32) & 0xFFFFFFFF); + set => Internal.Id2 = (Internal.Id2 & 0xFFFFFFFF) | ((ulong)value << 32); + } + + public uint StencilFrontReference + { + readonly get => (uint)((Internal.Id3 >> 0) & 0xFFFFFFFF); + set => Internal.Id3 = (Internal.Id3 & 0xFFFFFFFF00000000) | ((ulong)value << 0); + } + + public uint StencilBackCompareMask + { + readonly get => (uint)((Internal.Id3 >> 32) & 0xFFFFFFFF); + set => Internal.Id3 = (Internal.Id3 & 0xFFFFFFFF) | ((ulong)value << 32); + } + + public uint StencilBackWriteMask + { + readonly get => (uint)((Internal.Id4 >> 0) & 0xFFFFFFFF); + set => Internal.Id4 = (Internal.Id4 & 0xFFFFFFFF00000000) | ((ulong)value << 0); + } + + public uint StencilBackReference + { + readonly get => (uint)((Internal.Id4 >> 32) & 0xFFFFFFFF); + set => Internal.Id4 = (Internal.Id4 & 0xFFFFFFFF) | ((ulong)value << 32); + } + + public PolygonMode PolygonMode + { + readonly get => (PolygonMode)((Internal.Id5 >> 0) & 0x3FFFFFFF); + set => Internal.Id5 = (Internal.Id5 & 0xFFFFFFFFC0000000) | ((ulong)value << 0); + } + + public uint StagesCount + { + readonly get => (byte)((Internal.Id5 >> 30) & 0xFF); + set => Internal.Id5 = (Internal.Id5 & 0xFFFFFFC03FFFFFFF) | ((ulong)value << 30); + } + + public uint VertexAttributeDescriptionsCount + { + readonly get => (byte)((Internal.Id5 >> 38) & 0xFF); + set => Internal.Id5 = (Internal.Id5 & 0xFFFFC03FFFFFFFFF) | ((ulong)value << 38); + } + + public uint VertexBindingDescriptionsCount + { + readonly get => (byte)((Internal.Id5 >> 46) & 0xFF); + set => Internal.Id5 = (Internal.Id5 & 0xFFC03FFFFFFFFFFF) | ((ulong)value << 46); + } + + public uint ViewportsCount + { + readonly get => (byte)((Internal.Id5 >> 54) & 0xFF); + set => Internal.Id5 = (Internal.Id5 & 0xC03FFFFFFFFFFFFF) | ((ulong)value << 54); + } + + public uint ScissorsCount + { + readonly get => (byte)((Internal.Id6 >> 0) & 0xFF); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFFFFFFF00) | ((ulong)value << 0); + } + + public uint ColorBlendAttachmentStateCount + { + readonly get => (byte)((Internal.Id6 >> 8) & 0xFF); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFFFFF00FF) | ((ulong)value << 8); + } + + public PrimitiveTopology Topology + { + readonly get => (PrimitiveTopology)((Internal.Id6 >> 16) & 0xF); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFFFF0FFFF) | ((ulong)value << 16); + } + + public LogicOp LogicOp + { + readonly get => (LogicOp)((Internal.Id6 >> 20) & 0xF); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFFF0FFFFF) | ((ulong)value << 20); + } + + public CompareOp DepthCompareOp + { + readonly get => (CompareOp)((Internal.Id6 >> 24) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFF8FFFFFF) | ((ulong)value << 24); + } + + public StencilOp StencilFrontFailOp + { + readonly get => (StencilOp)((Internal.Id6 >> 27) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFFC7FFFFFF) | ((ulong)value << 27); + } + + public StencilOp StencilFrontPassOp + { + readonly get => (StencilOp)((Internal.Id6 >> 30) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFFE3FFFFFFF) | ((ulong)value << 30); + } + + public StencilOp StencilFrontDepthFailOp + { + readonly get => (StencilOp)((Internal.Id6 >> 33) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFFF1FFFFFFFF) | ((ulong)value << 33); + } + + public CompareOp StencilFrontCompareOp + { + readonly get => (CompareOp)((Internal.Id6 >> 36) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFF8FFFFFFFFF) | ((ulong)value << 36); + } + + public StencilOp StencilBackFailOp + { + readonly get => (StencilOp)((Internal.Id6 >> 39) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFFC7FFFFFFFFF) | ((ulong)value << 39); + } + + public StencilOp StencilBackPassOp + { + readonly get => (StencilOp)((Internal.Id6 >> 42) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFFE3FFFFFFFFFF) | ((ulong)value << 42); + } + + public StencilOp StencilBackDepthFailOp + { + readonly get => (StencilOp)((Internal.Id6 >> 45) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFFF1FFFFFFFFFFF) | ((ulong)value << 45); + } + + public CompareOp StencilBackCompareOp + { + readonly get => (CompareOp)((Internal.Id6 >> 48) & 0x7); + set => Internal.Id6 = (Internal.Id6 & 0xFFF8FFFFFFFFFFFF) | ((ulong)value << 48); + } + + public CullModeFlags CullMode + { + readonly get => (CullModeFlags)((Internal.Id6 >> 51) & 0x3); + set => Internal.Id6 = (Internal.Id6 & 0xFFE7FFFFFFFFFFFF) | ((ulong)value << 51); + } + + public bool PrimitiveRestartEnable + { + readonly get => ((Internal.Id6 >> 53) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xFFDFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 53); + } + + public bool DepthClampEnable + { + readonly get => ((Internal.Id6 >> 54) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xFFBFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 54); + } + + public bool RasterizerDiscardEnable + { + readonly get => ((Internal.Id6 >> 55) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xFF7FFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 55); + } + + public FrontFace FrontFace + { + readonly get => (FrontFace)((Internal.Id6 >> 56) & 0x1); + set => Internal.Id6 = (Internal.Id6 & 0xFEFFFFFFFFFFFFFF) | ((ulong)value << 56); + } + + public bool DepthBiasEnable + { + readonly get => ((Internal.Id6 >> 57) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xFDFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 57); + } + + public bool DepthTestEnable + { + readonly get => ((Internal.Id6 >> 58) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xFBFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 58); + } + + public bool DepthWriteEnable + { + readonly get => ((Internal.Id6 >> 59) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xF7FFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 59); + } + + public bool DepthBoundsTestEnable + { + readonly get => ((Internal.Id6 >> 60) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xEFFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 60); + } + + public bool StencilTestEnable + { + readonly get => ((Internal.Id6 >> 61) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xDFFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 61); + } + + public bool LogicOpEnable + { + readonly get => ((Internal.Id6 >> 62) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0xBFFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 62); + } + + public bool HasDepthStencil + { + readonly get => ((Internal.Id6 >> 63) & 0x1) != 0UL; + set => Internal.Id6 = (Internal.Id6 & 0x7FFFFFFFFFFFFFFF) | ((value ? 1UL : 0UL) << 63); + } + + public uint PatchControlPoints + { + readonly get => (uint)((Internal.Id7 >> 0) & 0xFFFFFFFF); + set => Internal.Id7 = (Internal.Id7 & 0xFFFFFFFF00000000) | ((ulong)value << 0); + } + + public uint SamplesCount + { + readonly get => (uint)((Internal.Id7 >> 32) & 0xFFFFFFFF); + set => Internal.Id7 = (Internal.Id7 & 0xFFFFFFFF) | ((ulong)value << 32); + } + + public bool AlphaToCoverageEnable + { + readonly get => ((Internal.Id8 >> 0) & 0x1) != 0UL; + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFFE) | ((value ? 1UL : 0UL) << 0); + } + + public bool AlphaToOneEnable + { + readonly get => ((Internal.Id8 >> 1) & 0x1) != 0UL; + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFFD) | ((value ? 1UL : 0UL) << 1); + } + + public bool AdvancedBlendSrcPreMultiplied + { + readonly get => ((Internal.Id8 >> 2) & 0x1) != 0UL; + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFFB) | ((value ? 1UL : 0UL) << 2); + } + + public bool AdvancedBlendDstPreMultiplied + { + readonly get => ((Internal.Id8 >> 3) & 0x1) != 0UL; + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFF7) | ((value ? 1UL : 0UL) << 3); + } + + public BlendOverlapEXT AdvancedBlendOverlap + { + readonly get => (BlendOverlapEXT)((Internal.Id8 >> 4) & 0x3); + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFCF) | ((ulong)value << 4); + } + + public bool DepthMode + { + readonly get => ((Internal.Id8 >> 6) & 0x1) != 0UL; + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFFBF) | ((value ? 1UL : 0UL) << 6); + } + + public FeedbackLoopAspects FeedbackLoopAspects + { + readonly get => (FeedbackLoopAspects)((Internal.Id8 >> 7) & 0x3); + set => Internal.Id8 = (Internal.Id8 & 0xFFFFFFFFFFFFFE7F) | (((ulong)value) << 7); + } + + public bool HasTessellationControlShader; + public NativeArray Stages; + public PipelineLayout PipelineLayout; + public SpecData SpecializationData; + + private Array32 _vertexAttributeDescriptions2; + + public void Initialize() + { + HasTessellationControlShader = false; + Stages = new NativeArray(Constants.MaxShaderStages); + + AdvancedBlendSrcPreMultiplied = true; + AdvancedBlendDstPreMultiplied = true; + AdvancedBlendOverlap = BlendOverlapEXT.UncorrelatedExt; + + LineWidth = 1f; + SamplesCount = 1; + DepthMode = true; + } + + public unsafe Auto CreateComputePipeline( + VulkanRenderer gd, + Device device, + ShaderCollection program, + PipelineCache cache) + { + if (program.TryGetComputePipeline(ref SpecializationData, out var pipeline)) + { + return pipeline; + } + + var pipelineCreateInfo = new ComputePipelineCreateInfo + { + SType = StructureType.ComputePipelineCreateInfo, + Stage = Stages[0], + BasePipelineIndex = -1, + Layout = PipelineLayout, + }; + + Pipeline pipelineHandle = default; + + bool hasSpec = program.SpecDescriptions != null; + + var desc = hasSpec ? program.SpecDescriptions[0] : SpecDescription.Empty; + + if (hasSpec && SpecializationData.Length < (int)desc.Info.DataSize) + { + throw new InvalidOperationException("Specialization data size does not match description"); + } + + fixed (SpecializationInfo* info = &desc.Info) + fixed (SpecializationMapEntry* map = desc.Map) + fixed (byte* data = SpecializationData.Span) + { + if (hasSpec) + { + info->PMapEntries = map; + info->PData = data; + pipelineCreateInfo.Stage.PSpecializationInfo = info; + } + + gd.Api.CreateComputePipelines(device, cache, 1, &pipelineCreateInfo, null, &pipelineHandle).ThrowOnError(); + } + + pipeline = new Auto(new DisposablePipeline(gd.Api, device, pipelineHandle)); + + program.AddComputePipeline(ref SpecializationData, pipeline); + + return pipeline; + } + + public unsafe Auto CreateGraphicsPipeline( + VulkanRenderer gd, + Device device, + ShaderCollection program, + PipelineCache cache, + RenderPass renderPass, + bool throwOnError = false) + { + if (program.TryGetGraphicsPipeline(ref Internal, out var pipeline)) + { + return pipeline; + } + + Pipeline pipelineHandle = default; + + bool isMoltenVk = gd.IsMoltenVk; + + if (isMoltenVk) + { + UpdateVertexAttributeDescriptions(gd); + } + + fixed (VertexInputAttributeDescription* pVertexAttributeDescriptions = &Internal.VertexAttributeDescriptions[0]) + fixed (VertexInputAttributeDescription* pVertexAttributeDescriptions2 = &_vertexAttributeDescriptions2[0]) + fixed (VertexInputBindingDescription* pVertexBindingDescriptions = &Internal.VertexBindingDescriptions[0]) + fixed (PipelineColorBlendAttachmentState* pColorBlendAttachmentState = &Internal.ColorBlendAttachmentState[0]) + { + var vertexInputState = new PipelineVertexInputStateCreateInfo + { + SType = StructureType.PipelineVertexInputStateCreateInfo, + VertexAttributeDescriptionCount = VertexAttributeDescriptionsCount, + PVertexAttributeDescriptions = isMoltenVk ? pVertexAttributeDescriptions2 : pVertexAttributeDescriptions, + VertexBindingDescriptionCount = VertexBindingDescriptionsCount, + PVertexBindingDescriptions = pVertexBindingDescriptions, + }; + + // Using patches topology without a tessellation shader is invalid. + // If we find such a case, return null pipeline to skip the draw. + if (Topology == PrimitiveTopology.PatchList && !HasTessellationControlShader) + { + program.AddGraphicsPipeline(ref Internal, null); + + return null; + } + + bool primitiveRestartEnable = PrimitiveRestartEnable; + + bool topologySupportsRestart; + + if (gd.Capabilities.SupportsPrimitiveTopologyListRestart) + { + topologySupportsRestart = gd.Capabilities.SupportsPrimitiveTopologyPatchListRestart || Topology != PrimitiveTopology.PatchList; + } + else + { + topologySupportsRestart = Topology == PrimitiveTopology.LineStrip || + Topology == PrimitiveTopology.TriangleStrip || + Topology == PrimitiveTopology.TriangleFan || + Topology == PrimitiveTopology.LineStripWithAdjacency || + Topology == PrimitiveTopology.TriangleStripWithAdjacency; + } + + primitiveRestartEnable &= topologySupportsRestart; + + var inputAssemblyState = new PipelineInputAssemblyStateCreateInfo + { + SType = StructureType.PipelineInputAssemblyStateCreateInfo, + PrimitiveRestartEnable = primitiveRestartEnable, + Topology = HasTessellationControlShader ? PrimitiveTopology.PatchList : Topology, + }; + + var tessellationState = new PipelineTessellationStateCreateInfo + { + SType = StructureType.PipelineTessellationStateCreateInfo, + PatchControlPoints = PatchControlPoints, + }; + + var rasterizationState = new PipelineRasterizationStateCreateInfo + { + SType = StructureType.PipelineRasterizationStateCreateInfo, + DepthClampEnable = DepthClampEnable, + RasterizerDiscardEnable = RasterizerDiscardEnable, + PolygonMode = PolygonMode, + LineWidth = LineWidth, + CullMode = CullMode, + FrontFace = FrontFace, + DepthBiasEnable = DepthBiasEnable, + }; + + var viewportState = new PipelineViewportStateCreateInfo + { + SType = StructureType.PipelineViewportStateCreateInfo, + ViewportCount = ViewportsCount, + ScissorCount = ScissorsCount, + }; + + if (gd.Capabilities.SupportsDepthClipControl) + { + var viewportDepthClipControlState = new PipelineViewportDepthClipControlCreateInfoEXT + { + SType = StructureType.PipelineViewportDepthClipControlCreateInfoExt, + NegativeOneToOne = DepthMode, + }; + + viewportState.PNext = &viewportDepthClipControlState; + } + + var multisampleState = new PipelineMultisampleStateCreateInfo + { + SType = StructureType.PipelineMultisampleStateCreateInfo, + SampleShadingEnable = false, + RasterizationSamples = TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, SamplesCount), + MinSampleShading = 1, + AlphaToCoverageEnable = AlphaToCoverageEnable, + AlphaToOneEnable = AlphaToOneEnable, + }; + + var stencilFront = new StencilOpState( + StencilFrontFailOp, + StencilFrontPassOp, + StencilFrontDepthFailOp, + StencilFrontCompareOp); + + var stencilBack = new StencilOpState( + StencilBackFailOp, + StencilBackPassOp, + StencilBackDepthFailOp, + StencilBackCompareOp); + + var depthStencilState = new PipelineDepthStencilStateCreateInfo + { + SType = StructureType.PipelineDepthStencilStateCreateInfo, + DepthTestEnable = DepthTestEnable, + DepthWriteEnable = DepthWriteEnable, + DepthCompareOp = DepthCompareOp, + DepthBoundsTestEnable = false, + StencilTestEnable = StencilTestEnable, + Front = stencilFront, + Back = stencilBack, + }; + + uint blendEnables = 0; + + if (gd.IsMoltenVk && Internal.AttachmentIntegerFormatMask != 0) + { + // Blend can't be enabled for integer formats, so let's make sure it is disabled. + uint attachmentIntegerFormatMask = Internal.AttachmentIntegerFormatMask; + + while (attachmentIntegerFormatMask != 0) + { + int i = BitOperations.TrailingZeroCount(attachmentIntegerFormatMask); + + if (Internal.ColorBlendAttachmentState[i].BlendEnable) + { + blendEnables |= 1u << i; + } + + Internal.ColorBlendAttachmentState[i].BlendEnable = false; + attachmentIntegerFormatMask &= ~(1u << i); + } + } + + // Vendors other than NVIDIA have a bug where it enables logical operations even for float formats, + // so we need to force disable them here. + bool logicOpEnable = LogicOpEnable && (gd.Vendor == Vendor.Nvidia || Internal.LogicOpsAllowed); + + var colorBlendState = new PipelineColorBlendStateCreateInfo + { + SType = StructureType.PipelineColorBlendStateCreateInfo, + LogicOpEnable = logicOpEnable, + LogicOp = LogicOp, + AttachmentCount = ColorBlendAttachmentStateCount, + PAttachments = pColorBlendAttachmentState, + }; + + PipelineColorBlendAdvancedStateCreateInfoEXT colorBlendAdvancedState; + + if (!AdvancedBlendSrcPreMultiplied || + !AdvancedBlendDstPreMultiplied || + AdvancedBlendOverlap != BlendOverlapEXT.UncorrelatedExt) + { + colorBlendAdvancedState = new PipelineColorBlendAdvancedStateCreateInfoEXT + { + SType = StructureType.PipelineColorBlendAdvancedStateCreateInfoExt, + SrcPremultiplied = AdvancedBlendSrcPreMultiplied, + DstPremultiplied = AdvancedBlendDstPreMultiplied, + BlendOverlap = AdvancedBlendOverlap, + }; + + colorBlendState.PNext = &colorBlendAdvancedState; + } + + bool supportsExtDynamicState = gd.Capabilities.SupportsExtendedDynamicState; + bool supportsFeedbackLoopDynamicState = gd.Capabilities.SupportsDynamicAttachmentFeedbackLoop; + + DynamicState* dynamicStates = stackalloc DynamicState[MaxDynamicStatesCount]; + + int dynamicStatesCount = 7; + + dynamicStates[0] = DynamicState.Viewport; + dynamicStates[1] = DynamicState.Scissor; + dynamicStates[2] = DynamicState.DepthBias; + dynamicStates[3] = DynamicState.StencilCompareMask; + dynamicStates[4] = DynamicState.StencilWriteMask; + dynamicStates[5] = DynamicState.StencilReference; + dynamicStates[6] = DynamicState.BlendConstants; + + if (supportsExtDynamicState) + { + dynamicStates[dynamicStatesCount++] = DynamicState.VertexInputBindingStrideExt; + } + + if (supportsFeedbackLoopDynamicState) + { + dynamicStates[dynamicStatesCount++] = DynamicState.AttachmentFeedbackLoopEnableExt; + } + + var pipelineDynamicStateCreateInfo = new PipelineDynamicStateCreateInfo + { + SType = StructureType.PipelineDynamicStateCreateInfo, + DynamicStateCount = (uint)dynamicStatesCount, + PDynamicStates = dynamicStates, + }; + + PipelineCreateFlags flags = 0; + + if (gd.Capabilities.SupportsAttachmentFeedbackLoop) + { + FeedbackLoopAspects aspects = FeedbackLoopAspects; + + if ((aspects & FeedbackLoopAspects.Color) != 0) + { + flags |= PipelineCreateFlags.CreateColorAttachmentFeedbackLoopBitExt; + } + + if ((aspects & FeedbackLoopAspects.Depth) != 0) + { + flags |= PipelineCreateFlags.CreateDepthStencilAttachmentFeedbackLoopBitExt; + } + } + + var pipelineCreateInfo = new GraphicsPipelineCreateInfo + { + SType = StructureType.GraphicsPipelineCreateInfo, + Flags = flags, + StageCount = StagesCount, + PStages = Stages.Pointer, + PVertexInputState = &vertexInputState, + PInputAssemblyState = &inputAssemblyState, + PTessellationState = &tessellationState, + PViewportState = &viewportState, + PRasterizationState = &rasterizationState, + PMultisampleState = &multisampleState, + PDepthStencilState = &depthStencilState, + PColorBlendState = &colorBlendState, + PDynamicState = &pipelineDynamicStateCreateInfo, + Layout = PipelineLayout, + RenderPass = renderPass, + }; + + Result result = gd.Api.CreateGraphicsPipelines(device, cache, 1, &pipelineCreateInfo, null, &pipelineHandle); + + if (throwOnError) + { + result.ThrowOnError(); + } + else if (result.IsError()) + { + program.AddGraphicsPipeline(ref Internal, null); + + return null; + } + + // Restore previous blend enable values if we changed it. + while (blendEnables != 0) + { + int i = BitOperations.TrailingZeroCount(blendEnables); + + Internal.ColorBlendAttachmentState[i].BlendEnable = true; + blendEnables &= ~(1u << i); + } + } + + pipeline = new Auto(new DisposablePipeline(gd.Api, device, pipelineHandle)); + + program.AddGraphicsPipeline(ref Internal, pipeline); + + return pipeline; + } + + private void UpdateVertexAttributeDescriptions(VulkanRenderer gd) + { + // Vertex attributes exceeding the stride are invalid. + // In metal, they cause glitches with the vertex shader fetching incorrect values. + // To work around this, we reduce the format to something that doesn't exceed the stride if possible. + // The assumption is that the exceeding components are not actually accessed on the shader. + + for (int index = 0; index < VertexAttributeDescriptionsCount; index++) + { + var attribute = Internal.VertexAttributeDescriptions[index]; + int vbIndex = GetVertexBufferIndex(attribute.Binding); + + if (vbIndex >= 0) + { + ref var vb = ref Internal.VertexBindingDescriptions[vbIndex]; + + Format format = attribute.Format; + + while (vb.Stride != 0 && attribute.Offset + FormatTable.GetAttributeFormatSize(format) > vb.Stride) + { + Format newFormat = FormatTable.DropLastComponent(format); + + if (newFormat == format) + { + // That case means we failed to find a format that fits within the stride, + // so just restore the original format and give up. + format = attribute.Format; + break; + } + + format = newFormat; + } + + if (attribute.Format != format && gd.FormatCapabilities.BufferFormatSupports(FormatFeatureFlags.VertexBufferBit, format)) + { + attribute.Format = format; + } + } + + _vertexAttributeDescriptions2[index] = attribute; + } + } + + private int GetVertexBufferIndex(uint binding) + { + for (int index = 0; index < VertexBindingDescriptionsCount; index++) + { + if (Internal.VertexBindingDescriptions[index].Binding == binding) + { + return index; + } + } + + return -1; + } + + public readonly void Dispose() + { + Stages.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineUid.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineUid.cs new file mode 100644 index 000000000..d064e2507 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/PipelineUid.cs @@ -0,0 +1,125 @@ +using Ryujinx.Common.Memory; +using Silk.NET.Vulkan; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + struct PipelineUid : IRefEquatable + { + public ulong Id0; + public ulong Id1; + public ulong Id2; + public ulong Id3; + + public ulong Id4; + public ulong Id5; + public ulong Id6; + + public ulong Id7; + public ulong Id8; + + private readonly uint VertexAttributeDescriptionsCount => (byte)((Id5 >> 38) & 0xFF); + private readonly uint VertexBindingDescriptionsCount => (byte)((Id5 >> 46) & 0xFF); + private readonly uint ColorBlendAttachmentStateCount => (byte)((Id6 >> 8) & 0xFF); + private readonly bool HasDepthStencil => ((Id6 >> 63) & 0x1) != 0UL; + + public Array32 VertexAttributeDescriptions; + public Array33 VertexBindingDescriptions; + public Array8 ColorBlendAttachmentState; + public Array9 AttachmentFormats; + public uint AttachmentIntegerFormatMask; + public bool LogicOpsAllowed; + + public readonly override bool Equals(object obj) + { + return obj is PipelineUid other && Equals(other); + } + + public bool Equals(ref PipelineUid other) + { + if (!Unsafe.As>(ref Id0).Equals(Unsafe.As>(ref other.Id0)) || + !Unsafe.As>(ref Id4).Equals(Unsafe.As>(ref other.Id4)) || + !Unsafe.As>(ref Id7).Equals(Unsafe.As>(ref other.Id7))) + { + return false; + } + + if (!SequenceEqual(VertexAttributeDescriptions.AsSpan(), other.VertexAttributeDescriptions.AsSpan(), VertexAttributeDescriptionsCount)) + { + return false; + } + + if (!SequenceEqual(VertexBindingDescriptions.AsSpan(), other.VertexBindingDescriptions.AsSpan(), VertexBindingDescriptionsCount)) + { + return false; + } + + if (!SequenceEqual(ColorBlendAttachmentState.AsSpan(), other.ColorBlendAttachmentState.AsSpan(), ColorBlendAttachmentStateCount)) + { + return false; + } + + if (!SequenceEqual(AttachmentFormats.AsSpan(), other.AttachmentFormats.AsSpan(), ColorBlendAttachmentStateCount + (HasDepthStencil ? 1u : 0u))) + { + return false; + } + + return true; + } + + private static bool SequenceEqual(ReadOnlySpan x, ReadOnlySpan y, uint count) where T : unmanaged + { + return MemoryMarshal.Cast(x[..(int)count]).SequenceEqual(MemoryMarshal.Cast(y[..(int)count])); + } + + public override int GetHashCode() + { + ulong hash64 = Id0 * 23 ^ + Id1 * 23 ^ + Id2 * 23 ^ + Id3 * 23 ^ + Id4 * 23 ^ + Id5 * 23 ^ + Id6 * 23 ^ + Id7 * 23 ^ + Id8 * 23; + + for (int i = 0; i < (int)VertexAttributeDescriptionsCount; i++) + { + hash64 ^= VertexAttributeDescriptions[i].Binding * 23; + hash64 ^= (uint)VertexAttributeDescriptions[i].Format * 23; + hash64 ^= VertexAttributeDescriptions[i].Location * 23; + hash64 ^= VertexAttributeDescriptions[i].Offset * 23; + } + + for (int i = 0; i < (int)VertexBindingDescriptionsCount; i++) + { + hash64 ^= VertexBindingDescriptions[i].Binding * 23; + hash64 ^= (uint)VertexBindingDescriptions[i].InputRate * 23; + hash64 ^= VertexBindingDescriptions[i].Stride * 23; + } + + for (int i = 0; i < (int)ColorBlendAttachmentStateCount; i++) + { + hash64 ^= ColorBlendAttachmentState[i].BlendEnable * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].SrcColorBlendFactor * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].DstColorBlendFactor * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].ColorBlendOp * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].SrcAlphaBlendFactor * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].DstAlphaBlendFactor * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].AlphaBlendOp * 23; + hash64 ^= (uint)ColorBlendAttachmentState[i].ColorWriteMask * 23; + } + + for (int i = 0; i < (int)ColorBlendAttachmentStateCount; i++) + { + hash64 ^= (uint)AttachmentFormats[i] * 23; + } + + return (int)hash64 ^ ((int)(hash64 >> 32) * 17); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/BufferedQuery.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/BufferedQuery.cs new file mode 100644 index 000000000..163c7266f --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/BufferedQuery.cs @@ -0,0 +1,216 @@ +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Runtime.InteropServices; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Queries +{ + class BufferedQuery : IDisposable + { + private const int MaxQueryRetries = 5000; + private const long DefaultValue = unchecked((long)0xFFFFFFFEFFFFFFFE); + private const long DefaultValueInt = 0xFFFFFFFE; + private const ulong HighMask = 0xFFFFFFFF00000000; + + private readonly Vk _api; + private readonly Device _device; + private readonly PipelineFull _pipeline; + + private QueryPool _queryPool; + + private readonly BufferHolder _buffer; + private readonly nint _bufferMap; + private readonly CounterType _type; + private readonly bool _result32Bit; + private readonly bool _isSupported; + + private readonly long _defaultValue; + private int? _resetSequence; + + public unsafe BufferedQuery(VulkanRenderer gd, Device device, PipelineFull pipeline, CounterType type, bool result32Bit) + { + _api = gd.Api; + _device = device; + _pipeline = pipeline; + _type = type; + _result32Bit = result32Bit; + + _isSupported = QueryTypeSupported(gd, type); + + if (_isSupported) + { + QueryPipelineStatisticFlags flags = type == CounterType.PrimitivesGenerated ? + QueryPipelineStatisticFlags.GeometryShaderPrimitivesBit : 0; + + var queryPoolCreateInfo = new QueryPoolCreateInfo + { + SType = StructureType.QueryPoolCreateInfo, + QueryCount = 1, + QueryType = GetQueryType(type), + PipelineStatistics = flags, + }; + + gd.Api.CreateQueryPool(device, in queryPoolCreateInfo, null, out _queryPool).ThrowOnError(); + } + + var buffer = gd.BufferManager.Create(gd, sizeof(long), forConditionalRendering: true); + + _bufferMap = buffer.Map(0, sizeof(long)); + _defaultValue = result32Bit ? DefaultValueInt : DefaultValue; + Marshal.WriteInt64(_bufferMap, _defaultValue); + _buffer = buffer; + } + + private static bool QueryTypeSupported(VulkanRenderer gd, CounterType type) + { + return type switch + { + CounterType.SamplesPassed => true, + CounterType.PrimitivesGenerated => gd.Capabilities.SupportsPipelineStatisticsQuery, + CounterType.TransformFeedbackPrimitivesWritten => gd.Capabilities.SupportsTransformFeedbackQueries, + _ => false, + }; + } + + private static QueryType GetQueryType(CounterType type) + { + return type switch + { + CounterType.SamplesPassed => QueryType.Occlusion, + CounterType.PrimitivesGenerated => QueryType.PipelineStatistics, + CounterType.TransformFeedbackPrimitivesWritten => QueryType.TransformFeedbackStreamExt, + _ => QueryType.Occlusion, + }; + } + + public Auto GetBuffer() + { + return _buffer.GetBuffer(); + } + + public void Reset() + { + End(false); + Begin(null); + } + + public void Begin(int? resetSequence) + { + if (_isSupported) + { + bool needsReset = resetSequence == null || _resetSequence == null || resetSequence.Value != _resetSequence.Value; + bool isOcclusion = _type == CounterType.SamplesPassed; + _pipeline.BeginQuery(this, _queryPool, needsReset, isOcclusion, isOcclusion && resetSequence != null); + } + _resetSequence = null; + } + + public void End(bool withResult) + { + if (_isSupported) + { + _pipeline.EndQuery(_queryPool); + } + + if (withResult && _isSupported) + { + Marshal.WriteInt64(_bufferMap, _defaultValue); + _pipeline.CopyQueryResults(this); + } + else + { + // Dummy result, just return 0. + Marshal.WriteInt64(_bufferMap, 0); + } + } + + private bool WaitingForValue(long data) + { + return data == _defaultValue || + (!_result32Bit && ((ulong)data & HighMask) == ((ulong)_defaultValue & HighMask)); + } + + public bool TryGetResult(out long result) + { + result = Marshal.ReadInt64(_bufferMap); + + return result != _defaultValue; + } + + public long AwaitResult(AutoResetEvent wakeSignal = null) + { + long data = _defaultValue; + + if (wakeSignal == null) + { + while (WaitingForValue(data)) + { + data = Marshal.ReadInt64(_bufferMap); + } + } + else + { + int iterations = 0; + while (WaitingForValue(data) && iterations++ < MaxQueryRetries) + { + data = Marshal.ReadInt64(_bufferMap); + if (WaitingForValue(data)) + { + wakeSignal.WaitOne(1); + } + } + + if (iterations >= MaxQueryRetries) + { + Logger.Error?.Print(LogClass.Gpu, $"Error: Query result {_type} timed out. Took more than {MaxQueryRetries} tries."); + } + } + + return data; + } + + public void PoolReset(CommandBuffer cmd, int resetSequence) + { + if (_isSupported) + { + _api.CmdResetQueryPool(cmd, _queryPool, 0, 1); + } + + _resetSequence = resetSequence; + } + + public void PoolCopy(CommandBufferScoped cbs) + { + var buffer = _buffer.GetBuffer(cbs.CommandBuffer, true).Get(cbs, 0, sizeof(long), true).Value; + + QueryResultFlags flags = QueryResultFlags.ResultWaitBit; + + if (!_result32Bit) + { + flags |= QueryResultFlags.Result64Bit; + } + + _api.CmdCopyQueryPoolResults( + cbs.CommandBuffer, + _queryPool, + 0, + 1, + buffer, + 0, + (ulong)(_result32Bit ? sizeof(int) : sizeof(long)), + flags); + } + + public unsafe void Dispose() + { + _buffer.Dispose(); + if (_isSupported) + { + _api.DestroyQueryPool(_device, _queryPool, null); + } + _queryPool = default; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueue.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueue.cs new file mode 100644 index 000000000..5a1865230 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueue.cs @@ -0,0 +1,252 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Queries +{ + class CounterQueue : IDisposable + { + private const int QueryPoolInitialSize = 100; + + private readonly VulkanRenderer _gd; + private readonly Device _device; + private readonly PipelineFull _pipeline; + + public CounterType Type { get; } + public bool Disposed { get; private set; } + + private readonly Queue _events = new(); + private CounterQueueEvent _current; + + private ulong _accumulatedCounter; + private int _waiterCount; + + private readonly Lock _lock = new(); + + private readonly Queue _queryPool; + private readonly AutoResetEvent _queuedEvent = new(false); + private readonly AutoResetEvent _wakeSignal = new(false); + private readonly AutoResetEvent _eventConsumed = new(false); + + private readonly Thread _consumerThread; + + public int ResetSequence { get; private set; } + + internal CounterQueue(VulkanRenderer gd, Device device, PipelineFull pipeline, CounterType type) + { + _gd = gd; + _device = device; + _pipeline = pipeline; + + Type = type; + + _queryPool = new Queue(QueryPoolInitialSize); + for (int i = 0; i < QueryPoolInitialSize; i++) + { + // AMD Polaris GPUs on Windows seem to have issues reporting 64-bit query results. + _queryPool.Enqueue(new BufferedQuery(_gd, _device, _pipeline, type, gd.IsAmdWindows)); + } + + _current = new CounterQueueEvent(this, type, 0); + + _consumerThread = new Thread(EventConsumer) { Name = "CPU.CounterQueue." + (int)type }; + _consumerThread.Start(); + } + + public void ResetCounterPool() + { + ResetSequence++; + } + + public void ResetFutureCounters(CommandBuffer cmd, int count) + { + // Pre-emptively reset queries to avoid render pass splitting. + lock (_queryPool) + { + count = Math.Min(count, _queryPool.Count); + + if (count > 0) + { + foreach (BufferedQuery query in _queryPool) + { + query.PoolReset(cmd, ResetSequence); + + if (--count == 0) + { + break; + } + } + } + } + } + + private void EventConsumer() + { + while (!Disposed) + { + CounterQueueEvent evt = null; + lock (_lock) + { + if (_events.Count > 0) + { + evt = _events.Dequeue(); + } + } + + if (evt == null) + { + _queuedEvent.WaitOne(); // No more events to go through, wait for more. + } + else + { + // Spin-wait rather than sleeping if there are any waiters, by passing null instead of the wake signal. + evt.TryConsume(ref _accumulatedCounter, true, _waiterCount == 0 ? _wakeSignal : null); + } + + if (_waiterCount > 0) + { + _eventConsumed.Set(); + } + } + } + + internal BufferedQuery GetQueryObject() + { + // Creating/disposing query objects on a context we're sharing with will cause issues. + // So instead, make a lot of query objects on the main thread and reuse them. + + lock (_lock) + { + if (_queryPool.Count > 0) + { + BufferedQuery result = _queryPool.Dequeue(); + return result; + } + + return new BufferedQuery(_gd, _device, _pipeline, Type, _gd.IsAmdWindows); + } + } + + internal void ReturnQueryObject(BufferedQuery query) + { + lock (_lock) + { + // The query will be reset when it dequeues. + _queryPool.Enqueue(query); + } + } + + public CounterQueueEvent QueueReport(EventHandler resultHandler, float divisor, ulong lastDrawIndex, bool hostReserved) + { + CounterQueueEvent result; + ulong draws = lastDrawIndex - _current.DrawIndex; + + lock (_lock) + { + // A query's result only matters if more than one draw was performed during it. + // Otherwise, dummy it out and return 0 immediately. + + if (hostReserved) + { + // This counter event is guaranteed to be available for host conditional rendering. + _current.ReserveForHostAccess(); + } + + _current.Complete(draws > 0 && Type != CounterType.TransformFeedbackPrimitivesWritten, divisor); + _events.Enqueue(_current); + + _current.OnResult += resultHandler; + + result = _current; + + _current = new CounterQueueEvent(this, Type, lastDrawIndex); + } + + _queuedEvent.Set(); + + return result; + } + + public void QueueReset(ulong lastDrawIndex) + { + ulong draws = lastDrawIndex - _current.DrawIndex; + + lock (_lock) + { + _current.Clear(draws != 0); + } + } + + public void Flush(bool blocking) + { + if (!blocking) + { + // Just wake the consumer thread - it will update the queries. + _wakeSignal.Set(); + return; + } + + lock (_lock) + { + // Tell the queue to process all events. + while (_events.Count > 0) + { + CounterQueueEvent flush = _events.Peek(); + if (!flush.TryConsume(ref _accumulatedCounter, true)) + { + return; // If not blocking, then return when we encounter an event that is not ready yet. + } + _events.Dequeue(); + } + } + } + + public void FlushTo(CounterQueueEvent evt) + { + // Flush the counter queue on the main thread. + Interlocked.Increment(ref _waiterCount); + + _wakeSignal.Set(); + + while (!evt.Disposed) + { + _eventConsumed.WaitOne(1); + } + + Interlocked.Decrement(ref _waiterCount); + } + + public void Dispose() + { + lock (_lock) + { + while (_events.Count > 0) + { + CounterQueueEvent evt = _events.Dequeue(); + + evt.Dispose(); + } + + Disposed = true; + } + + _queuedEvent.Set(); + + _consumerThread.Join(); + + _current?.Dispose(); + + foreach (BufferedQuery query in _queryPool) + { + query.Dispose(); + } + + _queuedEvent.Dispose(); + _wakeSignal.Dispose(); + _eventConsumed.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueueEvent.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueueEvent.cs new file mode 100644 index 000000000..12ed42f9c --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/CounterQueueEvent.cs @@ -0,0 +1,170 @@ +using Ryujinx.Graphics.GAL; +using System; +using System.Threading; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Queries +{ + class CounterQueueEvent : ICounterEvent + { + public event EventHandler OnResult; + + public CounterType Type { get; } + public bool ClearCounter { get; private set; } + + public bool Disposed { get; private set; } + public bool Invalid { get; set; } + + public ulong DrawIndex { get; } + + private readonly CounterQueue _queue; + private readonly BufferedQuery _counter; + + private bool _hostAccessReserved; + private int _refCount = 1; // Starts with a reference from the counter queue. + + private readonly Lock _lock = new(); + private ulong _result = ulong.MaxValue; + private double _divisor = 1f; + + public CounterQueueEvent(CounterQueue queue, CounterType type, ulong drawIndex) + { + _queue = queue; + + _counter = queue.GetQueryObject(); + Type = type; + + DrawIndex = drawIndex; + + _counter.Begin(_queue.ResetSequence); + } + + public Auto GetBuffer() + { + return _counter.GetBuffer(); + } + + internal void Clear(bool counterReset) + { + if (counterReset) + { + _counter.Reset(); + } + + ClearCounter = true; + } + + internal void Complete(bool withResult, double divisor) + { + _counter.End(withResult); + + _divisor = divisor; + } + + internal bool TryConsume(ref ulong result, bool block, AutoResetEvent wakeSignal = null) + { + lock (_lock) + { + if (Disposed) + { + return true; + } + + if (ClearCounter) + { + result = 0; + } + + long queryResult; + + if (block) + { + queryResult = _counter.AwaitResult(wakeSignal); + } + else + { + if (!_counter.TryGetResult(out queryResult)) + { + return false; + } + } + + result += _divisor == 1 ? (ulong)queryResult : (ulong)Math.Ceiling(queryResult / _divisor); + + _result = result; + + OnResult?.Invoke(this, result); + + Dispose(); // Return the our resources to the pool. + + return true; + } + } + + public void Flush() + { + if (Disposed) + { + return; + } + + // Tell the queue to process all events up to this one. + _queue.FlushTo(this); + } + + public void DecrementRefCount() + { + if (Interlocked.Decrement(ref _refCount) == 0) + { + DisposeInternal(); + } + } + + public bool ReserveForHostAccess() + { + if (_hostAccessReserved) + { + return true; + } + + if (IsValueAvailable()) + { + return false; + } + + if (Interlocked.Increment(ref _refCount) == 1) + { + Interlocked.Decrement(ref _refCount); + + return false; + } + + _hostAccessReserved = true; + + return true; + } + + public void ReleaseHostAccess() + { + _hostAccessReserved = false; + + DecrementRefCount(); + } + + private void DisposeInternal() + { + _queue.ReturnQueryObject(_counter); + } + + private bool IsValueAvailable() + { + return _result != ulong.MaxValue || _counter.TryGetResult(out _); + } + + public void Dispose() + { + Disposed = true; + + DecrementRefCount(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/Counters.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/Counters.cs new file mode 100644 index 000000000..3e221cc70 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Queries/Counters.cs @@ -0,0 +1,71 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan.Queries +{ + class Counters : IDisposable + { + private readonly CounterQueue[] _counterQueues; + private readonly PipelineFull _pipeline; + + public Counters(VulkanRenderer gd, Device device, PipelineFull pipeline) + { + _pipeline = pipeline; + + int count = Enum.GetNames().Length; + + _counterQueues = new CounterQueue[count]; + + for (int index = 0; index < _counterQueues.Length; index++) + { + CounterType type = (CounterType)index; + _counterQueues[index] = new CounterQueue(gd, device, pipeline, type); + } + } + + public void ResetCounterPool() + { + foreach (var queue in _counterQueues) + { + queue.ResetCounterPool(); + } + } + + public void ResetFutureCounters(CommandBuffer cmd, int count) + { + _counterQueues[(int)CounterType.SamplesPassed].ResetFutureCounters(cmd, count); + } + + public CounterQueueEvent QueueReport(CounterType type, EventHandler resultHandler, float divisor, bool hostReserved) + { + return _counterQueues[(int)type].QueueReport(resultHandler, divisor, _pipeline.DrawCount, hostReserved); + } + + public void QueueReset(CounterType type) + { + _counterQueues[(int)type].QueueReset(_pipeline.DrawCount); + } + + public void Update() + { + foreach (var queue in _counterQueues) + { + queue.Flush(false); + } + } + + public void Flush(CounterType type) + { + _counterQueues[(int)type].Flush(true); + } + + public void Dispose() + { + foreach (var queue in _counterQueues) + { + queue.Dispose(); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassCacheKey.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassCacheKey.cs new file mode 100644 index 000000000..ab490411d --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassCacheKey.cs @@ -0,0 +1,43 @@ +using System; +using System.Linq; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal readonly struct RenderPassCacheKey : IRefEquatable + { + private readonly TextureView _depthStencil; + private readonly TextureView[] _colors; + + public RenderPassCacheKey(TextureView depthStencil, TextureView[] colors) + { + _depthStencil = depthStencil; + _colors = colors; + } + + public override int GetHashCode() + { + HashCode hc = new(); + + hc.Add(_depthStencil); + + if (_colors != null) + { + foreach (var color in _colors) + { + hc.Add(color); + } + } + + return hc.ToHashCode(); + } + + public bool Equals(ref RenderPassCacheKey other) + { + bool colorsNull = _colors == null; + bool otherNull = other._colors == null; + return other._depthStencil == _depthStencil && + colorsNull == otherNull && + (colorsNull || other._colors.SequenceEqual(_colors)); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassHolder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassHolder.cs new file mode 100644 index 000000000..2f14002ff --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/RenderPassHolder.cs @@ -0,0 +1,221 @@ +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class RenderPassHolder + { + private readonly struct FramebufferCacheKey : IRefEquatable + { + private readonly uint _width; + private readonly uint _height; + private readonly uint _layers; + + public FramebufferCacheKey(uint width, uint height, uint layers) + { + _width = width; + _height = height; + _layers = layers; + } + + public override int GetHashCode() + { + return HashCode.Combine(_width, _height, _layers); + } + + public bool Equals(ref FramebufferCacheKey other) + { + return other._width == _width && other._height == _height && other._layers == _layers; + } + } + + private readonly record struct ForcedFence(TextureStorage Texture, PipelineStageFlags StageFlags); + + private readonly TextureView[] _textures; + private readonly Auto _renderPass; + private readonly HashTableSlim> _framebuffers; + private readonly RenderPassCacheKey _key; + private readonly List _forcedFences; + + public unsafe RenderPassHolder(VulkanRenderer gd, Device device, RenderPassCacheKey key, FramebufferParams fb) + { + // Create render pass using framebuffer params. + + const int MaxAttachments = Constants.MaxRenderTargets + 1; + + AttachmentDescription[] attachmentDescs = null; + + var subpass = new SubpassDescription + { + PipelineBindPoint = PipelineBindPoint.Graphics, + }; + + AttachmentReference* attachmentReferences = stackalloc AttachmentReference[MaxAttachments]; + + var hasFramebuffer = fb != null; + + if (hasFramebuffer && fb.AttachmentsCount != 0) + { + attachmentDescs = new AttachmentDescription[fb.AttachmentsCount]; + + for (int i = 0; i < fb.AttachmentsCount; i++) + { + attachmentDescs[i] = new AttachmentDescription( + 0, + fb.AttachmentFormats[i], + TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, fb.AttachmentSamples[i]), + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + ImageLayout.General, + ImageLayout.General); + } + + int colorAttachmentsCount = fb.ColorAttachmentsCount; + + if (colorAttachmentsCount > MaxAttachments - 1) + { + colorAttachmentsCount = MaxAttachments - 1; + } + + if (colorAttachmentsCount != 0) + { + int maxAttachmentIndex = fb.MaxColorAttachmentIndex; + subpass.ColorAttachmentCount = (uint)maxAttachmentIndex + 1; + subpass.PColorAttachments = &attachmentReferences[0]; + + // Fill with VK_ATTACHMENT_UNUSED to cover any gaps. + for (int i = 0; i <= maxAttachmentIndex; i++) + { + subpass.PColorAttachments[i] = new AttachmentReference(Vk.AttachmentUnused, ImageLayout.Undefined); + } + + for (int i = 0; i < colorAttachmentsCount; i++) + { + int bindIndex = fb.AttachmentIndices[i]; + + subpass.PColorAttachments[bindIndex] = new AttachmentReference((uint)i, ImageLayout.General); + } + } + + if (fb.HasDepthStencil) + { + uint dsIndex = (uint)fb.AttachmentsCount - 1; + + subpass.PDepthStencilAttachment = &attachmentReferences[MaxAttachments - 1]; + *subpass.PDepthStencilAttachment = new AttachmentReference(dsIndex, ImageLayout.General); + } + } + + var subpassDependency = PipelineConverter.CreateSubpassDependency(gd); + + fixed (AttachmentDescription* pAttachmentDescs = attachmentDescs) + { + var renderPassCreateInfo = new RenderPassCreateInfo + { + SType = StructureType.RenderPassCreateInfo, + PAttachments = pAttachmentDescs, + AttachmentCount = attachmentDescs != null ? (uint)attachmentDescs.Length : 0, + PSubpasses = &subpass, + SubpassCount = 1, + PDependencies = &subpassDependency, + DependencyCount = 1, + }; + + gd.Api.CreateRenderPass(device, in renderPassCreateInfo, null, out var renderPass).ThrowOnError(); + + _renderPass = new Auto(new DisposableRenderPass(gd.Api, device, renderPass)); + } + + _framebuffers = new HashTableSlim>(); + + // Register this render pass with all render target views. + + var textures = fb.GetAttachmentViews(); + + foreach (var texture in textures) + { + texture.AddRenderPass(key, this); + } + + _textures = textures; + _key = key; + + _forcedFences = new List(); + } + + public Auto GetFramebuffer(VulkanRenderer gd, CommandBufferScoped cbs, FramebufferParams fb) + { + var key = new FramebufferCacheKey(fb.Width, fb.Height, fb.Layers); + + if (!_framebuffers.TryGetValue(ref key, out Auto result)) + { + result = fb.Create(gd.Api, cbs, _renderPass); + + _framebuffers.Add(ref key, result); + } + + return result; + } + + public Auto GetRenderPass() + { + return _renderPass; + } + + public void AddForcedFence(TextureStorage storage, PipelineStageFlags stageFlags) + { + if (!_forcedFences.Any(fence => fence.Texture == storage)) + { + _forcedFences.Add(new ForcedFence(storage, stageFlags)); + } + } + + public void InsertForcedFences(CommandBufferScoped cbs) + { + if (_forcedFences.Count > 0) + { + _forcedFences.RemoveAll((entry) => + { + if (entry.Texture.Disposed) + { + return true; + } + + entry.Texture.QueueWriteToReadBarrier(cbs, AccessFlags.ShaderReadBit, entry.StageFlags); + + return false; + }); + } + } + + public bool ContainsAttachment(TextureStorage storage) + { + return _textures.Any(view => view.Storage == storage); + } + + public void Dispose() + { + // Dispose all framebuffers. + + foreach (var fb in _framebuffers.Values) + { + fb.Dispose(); + } + + // Notify all texture views that this render pass has been disposed. + + foreach (var texture in _textures) + { + texture.RemoveRenderPass(_key); + } + + // Dispose render pass. + + _renderPass.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceArray.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceArray.cs new file mode 100644 index 000000000..341c3c9b1 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceArray.cs @@ -0,0 +1,81 @@ +using Silk.NET.Vulkan; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class ResourceArray : IDisposable + { + private DescriptorSet[] _cachedDescriptorSets; + + private ShaderCollection _cachedDscProgram; + private int _cachedDscSetIndex; + private int _cachedDscIndex; + + private int _bindCount; + + protected void SetDirty(VulkanRenderer gd, bool isImage) + { + ReleaseDescriptorSet(); + + if (_bindCount != 0) + { + if (isImage) + { + gd.PipelineInternal.ForceImageDirty(); + } + else + { + gd.PipelineInternal.ForceTextureDirty(); + } + } + } + + public bool TryGetCachedDescriptorSets(CommandBufferScoped cbs, ShaderCollection program, int setIndex, out DescriptorSet[] sets) + { + if (_cachedDescriptorSets != null) + { + _cachedDscProgram.UpdateManualDescriptorSetCollectionOwnership(cbs, _cachedDscSetIndex, _cachedDscIndex); + + sets = _cachedDescriptorSets; + + return true; + } + + var dsc = program.GetNewManualDescriptorSetCollection(cbs, setIndex, out _cachedDscIndex).Get(cbs); + + sets = dsc.GetSets(); + + _cachedDescriptorSets = sets; + _cachedDscProgram = program; + _cachedDscSetIndex = setIndex; + + return false; + } + + public void IncrementBindCount() + { + _bindCount++; + } + + public void DecrementBindCount() + { + int newBindCount = --_bindCount; + Debug.Assert(newBindCount >= 0); + } + + private void ReleaseDescriptorSet() + { + if (_cachedDescriptorSets != null) + { + _cachedDscProgram.ReleaseManualDescriptorSetCollection(_cachedDscSetIndex, _cachedDscIndex); + _cachedDescriptorSets = null; + } + } + + public void Dispose() + { + ReleaseDescriptorSet(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceBindingSegment.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceBindingSegment.cs new file mode 100644 index 000000000..4d640f5aa --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceBindingSegment.cs @@ -0,0 +1,22 @@ +using Ryujinx.Graphics.GAL; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct ResourceBindingSegment + { + public readonly int Binding; + public readonly int Count; + public readonly ResourceType Type; + public readonly ResourceStages Stages; + public readonly bool IsArray; + + public ResourceBindingSegment(int binding, int count, ResourceType type, ResourceStages stages, bool isArray) + { + Binding = binding; + Count = count; + Type = type; + Stages = stages; + IsArray = isArray; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceLayoutBuilder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceLayoutBuilder.cs new file mode 100644 index 000000000..bdc20d95c --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/ResourceLayoutBuilder.cs @@ -0,0 +1,57 @@ +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class ResourceLayoutBuilder + { + private const int TotalSets = PipelineBase.DescriptorSetLayouts; + + private readonly List[] _resourceDescriptors; + private readonly List[] _resourceUsages; + + public ResourceLayoutBuilder() + { + _resourceDescriptors = new List[TotalSets]; + _resourceUsages = new List[TotalSets]; + + for (int index = 0; index < TotalSets; index++) + { + _resourceDescriptors[index] = new(); + _resourceUsages[index] = new(); + } + } + + public ResourceLayoutBuilder Add(ResourceStages stages, ResourceType type, int binding, bool write = false) + { + int setIndex = type switch + { + ResourceType.UniformBuffer => PipelineBase.UniformSetIndex, + ResourceType.StorageBuffer => PipelineBase.StorageSetIndex, + ResourceType.TextureAndSampler or ResourceType.BufferTexture => PipelineBase.TextureSetIndex, + ResourceType.Image or ResourceType.BufferImage => PipelineBase.ImageSetIndex, + _ => throw new ArgumentException($"Invalid resource type \"{type}\"."), + }; + + _resourceDescriptors[setIndex].Add(new ResourceDescriptor(binding, 1, type, stages)); + _resourceUsages[setIndex].Add(new ResourceUsage(binding, 1, type, stages, write)); + + return this; + } + + public ResourceLayout Build() + { + var descriptors = new ResourceDescriptorCollection[TotalSets]; + var usages = new ResourceUsageCollection[TotalSets]; + + for (int index = 0; index < TotalSets; index++) + { + descriptors[index] = new ResourceDescriptorCollection(_resourceDescriptors[index].ToArray().AsReadOnly()); + usages[index] = new ResourceUsageCollection(_resourceUsages[index].ToArray().AsReadOnly()); + } + + return new ResourceLayout(descriptors.AsReadOnly(), usages.AsReadOnly()); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Ryujinx.Graphics.Rdna3Vulkan.csproj b/src/Ryujinx.Graphics.Rdna3Vulkan/Ryujinx.Graphics.Rdna3Vulkan.csproj new file mode 100644 index 000000000..9753b0644 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Ryujinx.Graphics.Rdna3Vulkan.csproj @@ -0,0 +1,28 @@ + + + + $(DefaultItemExcludes);._* + + + + true + + + + true + + + + + + + + + + + + + + + + diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/SamplerHolder.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/SamplerHolder.cs new file mode 100644 index 000000000..cb2d55b1c --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/SamplerHolder.cs @@ -0,0 +1,120 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class SamplerHolder : ISampler + { + private readonly VulkanRenderer _gd; + private readonly Auto _sampler; + + public unsafe SamplerHolder(VulkanRenderer gd, Device device, SamplerCreateInfo info) + { + _gd = gd; + + gd.Samplers.Add(this); + + (Filter minFilter, SamplerMipmapMode mipFilter) = info.MinFilter.Convert(); + + float minLod = info.MinLod; + float maxLod = info.MaxLod; + + if (info.MinFilter == MinFilter.Nearest || info.MinFilter == MinFilter.Linear) + { + minLod = 0; + maxLod = 0.25f; + } + + var borderColor = GetConstrainedBorderColor(info.BorderColor, out var cantConstrain); + + var samplerCreateInfo = new Silk.NET.Vulkan.SamplerCreateInfo + { + SType = StructureType.SamplerCreateInfo, + MagFilter = info.MagFilter.Convert(), + MinFilter = minFilter, + MipmapMode = mipFilter, + AddressModeU = info.AddressU.Convert(), + AddressModeV = info.AddressV.Convert(), + AddressModeW = info.AddressP.Convert(), + MipLodBias = info.MipLodBias, + AnisotropyEnable = info.MaxAnisotropy != 1f, + MaxAnisotropy = info.MaxAnisotropy, + CompareEnable = info.CompareMode == CompareMode.CompareRToTexture, + CompareOp = info.CompareOp.Convert(), + MinLod = minLod, + MaxLod = maxLod, + BorderColor = borderColor, + UnnormalizedCoordinates = false, // TODO: Use unnormalized coordinates. + }; + + SamplerCustomBorderColorCreateInfoEXT customBorderColor; + + if (cantConstrain && gd.Capabilities.SupportsCustomBorderColor) + { + var color = new ClearColorValue( + info.BorderColor.Red, + info.BorderColor.Green, + info.BorderColor.Blue, + info.BorderColor.Alpha); + + customBorderColor = new SamplerCustomBorderColorCreateInfoEXT + { + SType = StructureType.SamplerCustomBorderColorCreateInfoExt, + CustomBorderColor = color, + }; + + samplerCreateInfo.PNext = &customBorderColor; + samplerCreateInfo.BorderColor = BorderColor.FloatCustomExt; + } + + gd.Api.CreateSampler(device, in samplerCreateInfo, null, out var sampler).ThrowOnError(); + + _sampler = new Auto(new DisposableSampler(gd.Api, device, sampler)); + } + + private static BorderColor GetConstrainedBorderColor(ColorF arbitraryBorderColor, out bool cantConstrain) + { + float r = arbitraryBorderColor.Red; + float g = arbitraryBorderColor.Green; + float b = arbitraryBorderColor.Blue; + float a = arbitraryBorderColor.Alpha; + + if (r == 0f && g == 0f && b == 0f) + { + if (a == 1f) + { + cantConstrain = false; + return BorderColor.FloatOpaqueBlack; + } + + if (a == 0f) + { + cantConstrain = false; + return BorderColor.FloatTransparentBlack; + } + } + else if (r == 1f && g == 1f && b == 1f && a == 1f) + { + cantConstrain = false; + return BorderColor.FloatOpaqueWhite; + } + + cantConstrain = true; + return BorderColor.FloatOpaqueBlack; + } + + public Auto GetSampler() + { + return _sampler; + } + + public void Dispose() + { + if (_gd.Samplers.Remove(this)) + { + _sampler.Dispose(); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Shader.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Shader.cs new file mode 100644 index 000000000..e73052ba8 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Shader.cs @@ -0,0 +1,161 @@ +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using shaderc; +using Silk.NET.Vulkan; +using System; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class Shader : IDisposable + { + // The shaderc.net dependency's Options constructor and dispose are not thread safe. + // Take this lock when using them. + private static readonly Lock _shaderOptionsLock = new(); + + private static readonly nint _ptrMainEntryPointName = Marshal.StringToHGlobalAnsi("main"); + + private readonly Vk _api; + private readonly Device _device; + private readonly ShaderStageFlags _stage; + + private bool _disposed; + private ShaderModule _module; + + public ShaderStageFlags StageFlags => _stage; + + public ProgramLinkStatus CompileStatus { private set; get; } + + public readonly Task CompileTask; + + public unsafe Shader(Vk api, Device device, ShaderSource shaderSource) + { + _api = api; + _device = device; + + CompileStatus = ProgramLinkStatus.Incomplete; + + _stage = shaderSource.Stage.Convert(); + + CompileTask = Task.Run(() => + { + byte[] spirv = shaderSource.BinaryCode; + + if (spirv == null) + { + spirv = GlslToSpirv(shaderSource.Code, shaderSource.Stage); + + if (spirv == null) + { + CompileStatus = ProgramLinkStatus.Failure; + + return; + } + } + + fixed (byte* pCode = spirv) + { + var shaderModuleCreateInfo = new ShaderModuleCreateInfo + { + SType = StructureType.ShaderModuleCreateInfo, + CodeSize = (uint)spirv.Length, + PCode = (uint*)pCode, + }; + + api.CreateShaderModule(device, in shaderModuleCreateInfo, null, out _module).ThrowOnError(); + } + + CompileStatus = ProgramLinkStatus.Success; + }); + } + + private unsafe static byte[] GlslToSpirv(string glsl, ShaderStage stage) + { + Options options; + + lock (_shaderOptionsLock) + { + options = new Options(false) + { + SourceLanguage = SourceLanguage.Glsl, + TargetSpirVVersion = new SpirVVersion(1, 5), + }; + } + + options.SetTargetEnvironment(TargetEnvironment.Vulkan, EnvironmentVersion.Vulkan_1_2); + Compiler compiler = new(options); + var scr = compiler.Compile(glsl, "Ryu", GetShaderCShaderStage(stage)); + + lock (_shaderOptionsLock) + { + options.Dispose(); + } + + if (scr.Status != Status.Success) + { + Logger.Error?.Print(LogClass.Gpu, $"Shader compilation error: {scr.Status} {scr.ErrorMessage}"); + + return null; + } + + var spirvBytes = new Span((void*)scr.CodePointer, (int)scr.CodeLength); + + byte[] code = new byte[(scr.CodeLength + 3) & ~3]; + + spirvBytes.CopyTo(code.AsSpan()[..(int)scr.CodeLength]); + + return code; + } + + private static ShaderKind GetShaderCShaderStage(ShaderStage stage) + { + switch (stage) + { + case ShaderStage.Vertex: + return ShaderKind.GlslVertexShader; + case ShaderStage.Geometry: + return ShaderKind.GlslGeometryShader; + case ShaderStage.TessellationControl: + return ShaderKind.GlslTessControlShader; + case ShaderStage.TessellationEvaluation: + return ShaderKind.GlslTessEvaluationShader; + case ShaderStage.Fragment: + return ShaderKind.GlslFragmentShader; + case ShaderStage.Compute: + return ShaderKind.GlslComputeShader; + } + + Logger.Debug?.Print(LogClass.Gpu, $"Invalid {nameof(ShaderStage)} enum value: {stage}."); + + return ShaderKind.GlslVertexShader; + } + + public unsafe PipelineShaderStageCreateInfo GetInfo() + { + return new PipelineShaderStageCreateInfo + { + SType = StructureType.PipelineShaderStageCreateInfo, + Stage = _stage, + Module = _module, + PName = (byte*)_ptrMainEntryPointName, + }; + } + + public void WaitForCompile() + { + CompileTask.Wait(); + } + + public unsafe void Dispose() + { + if (!_disposed) + { + _api.DestroyShaderModule(_device, _module, null); + _disposed = true; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/ShaderCollection.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/ShaderCollection.cs new file mode 100644 index 000000000..f103f5009 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/ShaderCollection.cs @@ -0,0 +1,767 @@ +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Linq; +using System.Threading.Tasks; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class ShaderCollection : IProgram + { + private readonly PipelineShaderStageCreateInfo[] _infos; + private readonly Shader[] _shaders; + + private readonly PipelineLayoutCacheEntry _plce; + + public PipelineLayout PipelineLayout => _plce.PipelineLayout; + + public bool HasMinimalLayout { get; } + public bool UsePushDescriptors { get; } + public bool IsCompute { get; } + public bool HasTessellationControlShader => (Stages & (1u << 3)) != 0; + + public bool UpdateTexturesWithoutTemplate { get; } + + public uint Stages { get; } + + public PipelineStageFlags IncoherentBufferWriteStages { get; } + public PipelineStageFlags IncoherentTextureWriteStages { get; } + + public ResourceBindingSegment[][] ClearSegments { get; } + public ResourceBindingSegment[][] BindingSegments { get; } + public DescriptorSetTemplate[] Templates { get; } + + public ProgramLinkStatus LinkStatus { get; private set; } + + public readonly SpecDescription[] SpecDescriptions; + + public bool IsLinked + { + get + { + if (LinkStatus == ProgramLinkStatus.Incomplete) + { + CheckProgramLink(true); + } + + return LinkStatus == ProgramLinkStatus.Success; + } + } + + private HashTableSlim> _graphicsPipelineCache; + private HashTableSlim> _computePipelineCache; + + private readonly VulkanRenderer _gd; + private Device _device; + private bool _initialized; + + private ProgramPipelineState _state; + private DisposableRenderPass _dummyRenderPass; + private readonly Task _compileTask; + private bool _firstBackgroundUse; + + public ShaderCollection( + VulkanRenderer gd, + Device device, + ShaderSource[] shaders, + ResourceLayout resourceLayout, + SpecDescription[] specDescription = null, + bool isMinimal = false) + { + _gd = gd; + _device = device; + + if (specDescription != null && specDescription.Length != shaders.Length) + { + throw new ArgumentException($"{nameof(specDescription)} array length must match {nameof(shaders)} array if provided"); + } + + gd.Shaders.Add(this); + + var internalShaders = new Shader[shaders.Length]; + + _infos = new PipelineShaderStageCreateInfo[shaders.Length]; + + SpecDescriptions = specDescription; + + LinkStatus = ProgramLinkStatus.Incomplete; + + uint stages = 0; + + for (int i = 0; i < shaders.Length; i++) + { + var shader = new Shader(gd.Api, device, shaders[i]); + + stages |= 1u << shader.StageFlags switch + { + ShaderStageFlags.FragmentBit => 1, + ShaderStageFlags.GeometryBit => 2, + ShaderStageFlags.TessellationControlBit => 3, + ShaderStageFlags.TessellationEvaluationBit => 4, + _ => 0, + }; + + if (shader.StageFlags == ShaderStageFlags.ComputeBit) + { + IsCompute = true; + } + + internalShaders[i] = shader; + } + + _shaders = internalShaders; + + bool usePushDescriptors = !isMinimal && + VulkanConfiguration.UsePushDescriptors && + _gd.Capabilities.SupportsPushDescriptors && + !IsCompute && + !HasPushDescriptorsBug(gd) && + CanUsePushDescriptors(gd, resourceLayout, IsCompute); + + ReadOnlyCollection sets = usePushDescriptors ? + BuildPushDescriptorSets(gd, resourceLayout.Sets) : resourceLayout.Sets; + + _plce = gd.PipelineLayoutCache.GetOrCreate(gd, device, sets, usePushDescriptors); + + HasMinimalLayout = isMinimal; + UsePushDescriptors = usePushDescriptors; + + Stages = stages; + + ClearSegments = BuildClearSegments(sets); + BindingSegments = BuildBindingSegments(resourceLayout.SetUsages, out bool usesBufferTextures); + Templates = BuildTemplates(usePushDescriptors); + (IncoherentBufferWriteStages, IncoherentTextureWriteStages) = BuildIncoherentStages(resourceLayout.SetUsages); + + // Updating buffer texture bindings using template updates crashes the Adreno driver on Windows. + UpdateTexturesWithoutTemplate = gd.IsQualcommProprietary && usesBufferTextures; + + _compileTask = Task.CompletedTask; + _firstBackgroundUse = false; + } + + public ShaderCollection( + VulkanRenderer gd, + Device device, + ShaderSource[] sources, + ResourceLayout resourceLayout, + ProgramPipelineState state, + bool fromCache) : this(gd, device, sources, resourceLayout) + { + _state = state; + + _compileTask = BackgroundCompilation(); + _firstBackgroundUse = !fromCache; + } + + private static bool HasPushDescriptorsBug(VulkanRenderer gd) + { + // Those GPUs/drivers do not work properly with push descriptors, so we must force disable them. + return gd.IsNvidiaPreTuring || (gd.IsIntelArc && gd.IsIntelWindows); + } + + private static bool CanUsePushDescriptors(VulkanRenderer gd, ResourceLayout layout, bool isCompute) + { + // If binding 3 is immediately used, use an alternate set of reserved bindings. + ReadOnlyCollection uniformUsage = layout.SetUsages[0].Usages; + bool hasBinding3 = uniformUsage.Any(x => x.Binding == 3); + int[] reserved = isCompute ? Array.Empty() : gd.GetPushDescriptorReservedBindings(hasBinding3); + + // Can't use any of the reserved usages. + for (int i = 0; i < uniformUsage.Count; i++) + { + var binding = uniformUsage[i].Binding; + + if (reserved.Contains(binding) || + binding >= Constants.MaxPushDescriptorBinding || + binding >= gd.Capabilities.MaxPushDescriptors + reserved.Count(id => id < binding)) + { + return false; + } + } + + //Prevent the sum of descriptors from exceeding MaxPushDescriptors + int totalDescriptors = 0; + foreach (ResourceDescriptor desc in layout.Sets.First().Descriptors) + { + if (!reserved.Contains(desc.Binding)) + totalDescriptors += desc.Count; + } + if (totalDescriptors > gd.Capabilities.MaxPushDescriptors) + return false; + + return true; + } + + private static ReadOnlyCollection BuildPushDescriptorSets( + VulkanRenderer gd, + ReadOnlyCollection sets) + { + // The reserved bindings were selected when determining if push descriptors could be used. + int[] reserved = gd.GetPushDescriptorReservedBindings(false); + + var result = new ResourceDescriptorCollection[sets.Count]; + + for (int i = 0; i < sets.Count; i++) + { + if (i == 0) + { + // Push descriptors apply here. Remove reserved bindings. + ResourceDescriptorCollection original = sets[i]; + + var pdUniforms = new ResourceDescriptor[original.Descriptors.Count]; + int j = 0; + + foreach (ResourceDescriptor descriptor in original.Descriptors) + { + if (reserved.Contains(descriptor.Binding)) + { + // If the binding is reserved, set its descriptor count to 0. + pdUniforms[j++] = new ResourceDescriptor( + descriptor.Binding, + 0, + descriptor.Type, + descriptor.Stages); + } + else + { + pdUniforms[j++] = descriptor; + } + } + + result[i] = new ResourceDescriptorCollection(new(pdUniforms)); + } + else + { + result[i] = sets[i]; + } + } + + return new(result); + } + + private static ResourceBindingSegment[][] BuildClearSegments(ReadOnlyCollection sets) + { + ResourceBindingSegment[][] segments = new ResourceBindingSegment[sets.Count][]; + + for (int setIndex = 0; setIndex < sets.Count; setIndex++) + { + List currentSegments = new(); + + ResourceDescriptor currentDescriptor = default; + int currentCount = 0; + + for (int index = 0; index < sets[setIndex].Descriptors.Count; index++) + { + ResourceDescriptor descriptor = sets[setIndex].Descriptors[index]; + + if (currentDescriptor.Binding + currentCount != descriptor.Binding || + currentDescriptor.Type != descriptor.Type || + currentDescriptor.Stages != descriptor.Stages || + currentDescriptor.Count > 1 || + descriptor.Count > 1) + { + if (currentCount != 0) + { + currentSegments.Add(new ResourceBindingSegment( + currentDescriptor.Binding, + currentCount, + currentDescriptor.Type, + currentDescriptor.Stages, + currentDescriptor.Count > 1)); + } + + currentDescriptor = descriptor; + currentCount = descriptor.Count; + } + else + { + currentCount += descriptor.Count; + } + } + + if (currentCount != 0) + { + currentSegments.Add(new ResourceBindingSegment( + currentDescriptor.Binding, + currentCount, + currentDescriptor.Type, + currentDescriptor.Stages, + currentDescriptor.Count > 1)); + } + + segments[setIndex] = currentSegments.ToArray(); + } + + return segments; + } + + private static ResourceBindingSegment[][] BuildBindingSegments(ReadOnlyCollection setUsages, out bool usesBufferTextures) + { + usesBufferTextures = false; + + ResourceBindingSegment[][] segments = new ResourceBindingSegment[setUsages.Count][]; + + for (int setIndex = 0; setIndex < setUsages.Count; setIndex++) + { + List currentSegments = new(); + + ResourceUsage currentUsage = default; + int currentCount = 0; + + for (int index = 0; index < setUsages[setIndex].Usages.Count; index++) + { + ResourceUsage usage = setUsages[setIndex].Usages[index]; + + if (usage.Type == ResourceType.BufferTexture) + { + usesBufferTextures = true; + } + + if (currentUsage.Binding + currentCount != usage.Binding || + currentUsage.Type != usage.Type || + currentUsage.Stages != usage.Stages || + currentUsage.ArrayLength > 1 || + usage.ArrayLength > 1) + { + if (currentCount != 0) + { + currentSegments.Add(new ResourceBindingSegment( + currentUsage.Binding, + currentCount, + currentUsage.Type, + currentUsage.Stages, + currentUsage.ArrayLength > 1)); + } + + currentUsage = usage; + currentCount = usage.ArrayLength; + } + else + { + currentCount++; + } + } + + if (currentCount != 0) + { + currentSegments.Add(new ResourceBindingSegment( + currentUsage.Binding, + currentCount, + currentUsage.Type, + currentUsage.Stages, + currentUsage.ArrayLength > 1)); + } + + segments[setIndex] = currentSegments.ToArray(); + } + + return segments; + } + + private DescriptorSetTemplate[] BuildTemplates(bool usePushDescriptors) + { + var templates = new DescriptorSetTemplate[BindingSegments.Length]; + + for (int setIndex = 0; setIndex < BindingSegments.Length; setIndex++) + { + if (usePushDescriptors && setIndex == 0) + { + // Push descriptors get updated using templates owned by the pipeline layout. + continue; + } + + ResourceBindingSegment[] segments = BindingSegments[setIndex]; + + if (segments != null && segments.Length > 0) + { + templates[setIndex] = new DescriptorSetTemplate( + _gd, + _device, + segments, + _plce, + IsCompute ? PipelineBindPoint.Compute : PipelineBindPoint.Graphics, + setIndex); + } + } + + return templates; + } + + private PipelineStageFlags GetPipelineStages(ResourceStages stages) + { + PipelineStageFlags result = 0; + + if ((stages & ResourceStages.Compute) != 0) + { + result |= PipelineStageFlags.ComputeShaderBit; + } + + if ((stages & ResourceStages.Vertex) != 0) + { + result |= PipelineStageFlags.VertexShaderBit; + } + + if ((stages & ResourceStages.Fragment) != 0) + { + result |= PipelineStageFlags.FragmentShaderBit; + } + + if ((stages & ResourceStages.Geometry) != 0) + { + result |= PipelineStageFlags.GeometryShaderBit; + } + + if ((stages & ResourceStages.TessellationControl) != 0) + { + result |= PipelineStageFlags.TessellationControlShaderBit; + } + + if ((stages & ResourceStages.TessellationEvaluation) != 0) + { + result |= PipelineStageFlags.TessellationEvaluationShaderBit; + } + + return result; + } + + private (PipelineStageFlags Buffer, PipelineStageFlags Texture) BuildIncoherentStages(ReadOnlyCollection setUsages) + { + PipelineStageFlags buffer = PipelineStageFlags.None; + PipelineStageFlags texture = PipelineStageFlags.None; + + foreach (var set in setUsages) + { + foreach (var range in set.Usages) + { + if (range.Write) + { + PipelineStageFlags stages = GetPipelineStages(range.Stages); + + switch (range.Type) + { + case ResourceType.Image: + texture |= stages; + break; + case ResourceType.StorageBuffer: + case ResourceType.BufferImage: + buffer |= stages; + break; + } + } + } + } + + return (buffer, texture); + } + + private async Task BackgroundCompilation() + { + await Task.WhenAll(_shaders.Select(shader => shader.CompileTask)); + + if (Array.Exists(_shaders, shader => shader.CompileStatus == ProgramLinkStatus.Failure)) + { + LinkStatus = ProgramLinkStatus.Failure; + + return; + } + + try + { + if (IsCompute) + { + CreateBackgroundComputePipeline(); + } + else + { + CreateBackgroundGraphicsPipeline(); + } + } + catch (VulkanException e) + { + Logger.Error?.PrintMsg(LogClass.Gpu, $"Background Compilation failed: {e.Message}"); + + LinkStatus = ProgramLinkStatus.Failure; + } + } + + private void EnsureShadersReady() + { + if (!_initialized) + { + CheckProgramLink(true); + + ProgramLinkStatus resultStatus = ProgramLinkStatus.Success; + + for (int i = 0; i < _shaders.Length; i++) + { + var shader = _shaders[i]; + + if (shader.CompileStatus != ProgramLinkStatus.Success) + { + resultStatus = ProgramLinkStatus.Failure; + } + + _infos[i] = shader.GetInfo(); + } + + // If the link status was already set as failure by background compilation, prefer that decision. + if (LinkStatus != ProgramLinkStatus.Failure) + { + LinkStatus = resultStatus; + } + + _initialized = true; + } + } + + public PipelineShaderStageCreateInfo[] GetInfos() + { + EnsureShadersReady(); + + return _infos; + } + + protected DisposableRenderPass CreateDummyRenderPass() + { + if (_dummyRenderPass.Value.Handle != 0) + { + return _dummyRenderPass; + } + + return _dummyRenderPass = _state.ToRenderPass(_gd, _device); + } + + public void CreateBackgroundComputePipeline() + { + PipelineState pipeline = new(); + pipeline.Initialize(); + + pipeline.Stages[0] = _shaders[0].GetInfo(); + pipeline.StagesCount = 1; + pipeline.PipelineLayout = PipelineLayout; + + pipeline.CreateComputePipeline(_gd, _device, this, (_gd.Pipeline as PipelineBase).PipelineCache); + pipeline.Dispose(); + } + + public void CreateBackgroundGraphicsPipeline() + { + // To compile shaders in the background in Vulkan, we need to create valid pipelines using the shader modules. + // The GPU provides pipeline state via the GAL that can be converted into our internal Vulkan pipeline state. + // This should match the pipeline state at the time of the first draw. If it doesn't, then it'll likely be + // close enough that the GPU driver will reuse the compiled shader for the different state. + + // First, we need to create a render pass object compatible with the one that will be used at runtime. + // The active attachment formats have been provided by the abstraction layer. + var renderPass = CreateDummyRenderPass(); + + PipelineState pipeline = _state.ToVulkanPipelineState(_gd); + + // Copy the shader stage info to the pipeline. + var stages = pipeline.Stages.AsSpan(); + + for (int i = 0; i < _shaders.Length; i++) + { + stages[i] = _shaders[i].GetInfo(); + } + + pipeline.HasTessellationControlShader = HasTessellationControlShader; + pipeline.StagesCount = (uint)_shaders.Length; + pipeline.PipelineLayout = PipelineLayout; + + pipeline.CreateGraphicsPipeline(_gd, _device, this, (_gd.Pipeline as PipelineBase).PipelineCache, renderPass.Value, throwOnError: true); + pipeline.Dispose(); + } + + public ProgramLinkStatus CheckProgramLink(bool blocking) + { + if (LinkStatus == ProgramLinkStatus.Incomplete) + { + ProgramLinkStatus resultStatus = ProgramLinkStatus.Success; + + foreach (Shader shader in _shaders) + { + if (shader.CompileStatus == ProgramLinkStatus.Incomplete) + { + if (blocking) + { + // Wait for this shader to finish compiling. + shader.WaitForCompile(); + + if (shader.CompileStatus != ProgramLinkStatus.Success) + { + resultStatus = ProgramLinkStatus.Failure; + } + } + else + { + return ProgramLinkStatus.Incomplete; + } + } + } + + if (!_compileTask.IsCompleted) + { + if (blocking) + { + _compileTask.Wait(); + + if (LinkStatus == ProgramLinkStatus.Failure) + { + return ProgramLinkStatus.Failure; + } + } + else + { + return ProgramLinkStatus.Incomplete; + } + } + + return resultStatus; + } + + return LinkStatus; + } + + public byte[] GetBinary() + { + return null; + } + + public DescriptorSetTemplate GetPushDescriptorTemplate(long updateMask) + { + return _plce.GetPushDescriptorTemplate(IsCompute ? PipelineBindPoint.Compute : PipelineBindPoint.Graphics, updateMask); + } + + public void AddComputePipeline(ref SpecData key, Auto pipeline) + { + (_computePipelineCache ??= new()).Add(ref key, pipeline); + } + + public void AddGraphicsPipeline(ref PipelineUid key, Auto pipeline) + { + (_graphicsPipelineCache ??= new()).Add(ref key, pipeline); + } + + public bool TryGetComputePipeline(ref SpecData key, out Auto pipeline) + { + if (_computePipelineCache == null) + { + pipeline = default; + return false; + } + + if (_computePipelineCache.TryGetValue(ref key, out pipeline)) + { + return true; + } + + return false; + } + + public bool TryGetGraphicsPipeline(ref PipelineUid key, out Auto pipeline) + { + if (_graphicsPipelineCache == null) + { + pipeline = default; + return false; + } + + if (!_graphicsPipelineCache.TryGetValue(ref key, out pipeline)) + { + if (_firstBackgroundUse) + { + Logger.Warning?.Print(LogClass.Gpu, "Background pipeline compile missed on draw - incorrect pipeline state?"); + _firstBackgroundUse = false; + } + + return false; + } + + _firstBackgroundUse = false; + + return true; + } + + public void UpdateDescriptorCacheCommandBufferIndex(int commandBufferIndex) + { + _plce.UpdateCommandBufferIndex(commandBufferIndex); + } + + public Auto GetNewDescriptorSetCollection(int setIndex, out bool isNew) + { + return _plce.GetNewDescriptorSetCollection(setIndex, out isNew); + } + + public Auto GetNewManualDescriptorSetCollection(CommandBufferScoped cbs, int setIndex, out int cacheIndex) + { + return _plce.GetNewManualDescriptorSetCollection(cbs, setIndex, out cacheIndex); + } + + public void UpdateManualDescriptorSetCollectionOwnership(CommandBufferScoped cbs, int setIndex, int cacheIndex) + { + _plce.UpdateManualDescriptorSetCollectionOwnership(cbs, setIndex, cacheIndex); + } + + public void ReleaseManualDescriptorSetCollection(int setIndex, int cacheIndex) + { + _plce.ReleaseManualDescriptorSetCollection(setIndex, cacheIndex); + } + + public bool HasSameLayout(ShaderCollection other) + { + return other != null && _plce == other._plce; + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + if (!_gd.Shaders.Remove(this)) + { + return; + } + + for (int i = 0; i < _shaders.Length; i++) + { + _shaders[i].Dispose(); + } + + if (_graphicsPipelineCache != null) + { + foreach (Auto pipeline in _graphicsPipelineCache.Values) + { + pipeline?.Dispose(); + } + } + + if (_computePipelineCache != null) + { + foreach (Auto pipeline in _computePipelineCache.Values) + { + pipeline.Dispose(); + } + } + + for (int i = 0; i < Templates.Length; i++) + { + Templates[i]?.Dispose(); + } + + if (_dummyRenderPass.Value.Handle != 0) + { + _dummyRenderPass.Dispose(); + } + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/SpecInfo.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/SpecInfo.cs new file mode 100644 index 000000000..f0a4ae3b6 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/SpecInfo.cs @@ -0,0 +1,100 @@ +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + public enum SpecConstType + { + Bool32, + Int16, + Int32, + Int64, + Float16, + Float32, + Float64, + } + + sealed class SpecDescription + { + public readonly SpecializationInfo Info; + public readonly SpecializationMapEntry[] Map; + + // For mapping a simple packed struct or single entry + public SpecDescription(params (uint Id, SpecConstType Type)[] description) + { + int count = description.Length; + Map = new SpecializationMapEntry[count]; + + uint structSize = 0; + + for (int i = 0; i < Map.Length; ++i) + { + var typeSize = SizeOf(description[i].Type); + Map[i] = new SpecializationMapEntry(description[i].Id, structSize, typeSize); + structSize += typeSize; + } + + Info = new SpecializationInfo + { + DataSize = structSize, + MapEntryCount = (uint)count, + }; + } + + // For advanced mapping with overlapping or staggered fields + public SpecDescription(SpecializationMapEntry[] map) + { + Map = map; + + uint structSize = 0; + for (int i = 0; i < map.Length; ++i) + { + structSize = Math.Max(structSize, map[i].Offset + (uint)map[i].Size); + } + + Info = new SpecializationInfo + { + DataSize = structSize, + MapEntryCount = (uint)map.Length, + }; + } + + private static uint SizeOf(SpecConstType type) => type switch + { + SpecConstType.Int16 or SpecConstType.Float16 => 2, + SpecConstType.Bool32 or SpecConstType.Int32 or SpecConstType.Float32 => 4, + SpecConstType.Int64 or SpecConstType.Float64 => 8, + _ => throw new ArgumentOutOfRangeException(nameof(type)), + }; + + private SpecDescription() + { + Info = new(); + } + + public static readonly SpecDescription Empty = new(); + } + + readonly struct SpecData : IRefEquatable + { + private readonly byte[] _data; + private readonly int _hash; + + public int Length => _data.Length; + public ReadOnlySpan Span => _data.AsSpan(); + public override int GetHashCode() => _hash; + + public SpecData(ReadOnlySpan data) + { + _data = new byte[data.Length]; + data.CopyTo(_data); + + var hc = new HashCode(); + hc.AddBytes(data); + _hash = hc.ToHashCode(); + } + + public override bool Equals(object obj) => obj is SpecData other && Equals(other); + public bool Equals(ref SpecData other) => _data.AsSpan().SequenceEqual(other._data); + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/StagingBuffer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/StagingBuffer.cs new file mode 100644 index 000000000..e6ab8b1ec --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/StagingBuffer.cs @@ -0,0 +1,297 @@ +using Ryujinx.Common; +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct StagingBufferReserved + { + public readonly BufferHolder Buffer; + public readonly int Offset; + public readonly int Size; + + public StagingBufferReserved(BufferHolder buffer, int offset, int size) + { + Buffer = buffer; + Offset = offset; + Size = size; + } + } + + class StagingBuffer : IDisposable + { + private const int BufferSize = 32 * 1024 * 1024; + + private int _freeOffset; + private int _freeSize; + + private readonly VulkanRenderer _gd; + private readonly BufferHolder _buffer; + private readonly int _resourceAlignment; + + public readonly BufferHandle Handle; + + private readonly struct PendingCopy + { + public FenceHolder Fence { get; } + public int Size { get; } + + public PendingCopy(FenceHolder fence, int size) + { + Fence = fence; + Size = size; + fence.Get(); + } + } + + private readonly Queue _pendingCopies; + + public StagingBuffer(VulkanRenderer gd, BufferManager bufferManager) + { + _gd = gd; + Handle = bufferManager.CreateWithHandle(gd, BufferSize, out _buffer); + _pendingCopies = new Queue(); + _freeSize = BufferSize; + _resourceAlignment = (int)gd.Capabilities.MinResourceAlignment; + } + + public void PushData(CommandBufferPool cbp, CommandBufferScoped? cbs, Action endRenderPass, BufferHolder dst, int dstOffset, ReadOnlySpan data) + { + bool isRender = cbs != null; + CommandBufferScoped scoped = cbs ?? cbp.Rent(); + + // Must push all data to the buffer. If it can't fit, split it up. + + endRenderPass?.Invoke(); + + while (data.Length > 0) + { + if (_freeSize < data.Length) + { + FreeCompleted(); + } + + while (_freeSize == 0) + { + if (!WaitFreeCompleted(cbp)) + { + if (isRender) + { + _gd.FlushAllCommands(); + scoped = cbp.Rent(); + isRender = false; + } + else + { + scoped = cbp.ReturnAndRent(scoped); + } + } + } + + int chunkSize = Math.Min(_freeSize, data.Length); + + PushDataImpl(scoped, dst, dstOffset, data[..chunkSize]); + + dstOffset += chunkSize; + data = data[chunkSize..]; + } + + if (!isRender) + { + scoped.Dispose(); + } + } + + private void PushDataImpl(CommandBufferScoped cbs, BufferHolder dst, int dstOffset, ReadOnlySpan data) + { + var srcBuffer = _buffer.GetBuffer(); + var dstBuffer = dst.GetBuffer(cbs.CommandBuffer, dstOffset, data.Length, true); + + int offset = _freeOffset; + int capacity = BufferSize - offset; + if (capacity < data.Length) + { + _buffer.SetDataUnchecked(offset, data[..capacity]); + _buffer.SetDataUnchecked(0, data[capacity..]); + + BufferHolder.Copy(_gd, cbs, srcBuffer, dstBuffer, offset, dstOffset, capacity); + BufferHolder.Copy(_gd, cbs, srcBuffer, dstBuffer, 0, dstOffset + capacity, data.Length - capacity); + } + else + { + _buffer.SetDataUnchecked(offset, data); + + BufferHolder.Copy(_gd, cbs, srcBuffer, dstBuffer, offset, dstOffset, data.Length); + } + + _freeOffset = (offset + data.Length) & (BufferSize - 1); + _freeSize -= data.Length; + Debug.Assert(_freeSize >= 0); + + _pendingCopies.Enqueue(new PendingCopy(cbs.GetFence(), data.Length)); + } + + public bool TryPushData(CommandBufferScoped cbs, Action endRenderPass, BufferHolder dst, int dstOffset, ReadOnlySpan data) + { + if (data.Length > BufferSize) + { + return false; + } + + if (_freeSize < data.Length) + { + FreeCompleted(); + + if (_freeSize < data.Length) + { + return false; + } + } + + endRenderPass?.Invoke(); + + PushDataImpl(cbs, dst, dstOffset, data); + + return true; + } + + private StagingBufferReserved ReserveDataImpl(CommandBufferScoped cbs, int size, int alignment) + { + // Assumes the caller has already determined that there is enough space. + int offset = BitUtils.AlignUp(_freeOffset, alignment); + int padding = offset - _freeOffset; + + int capacity = Math.Min(_freeSize, BufferSize - offset); + int reservedLength = size + padding; + if (capacity < size) + { + offset = 0; // Place at start. + reservedLength += capacity; + } + + _freeOffset = (_freeOffset + reservedLength) & (BufferSize - 1); + _freeSize -= reservedLength; + Debug.Assert(_freeSize >= 0); + + _pendingCopies.Enqueue(new PendingCopy(cbs.GetFence(), reservedLength)); + + return new StagingBufferReserved(_buffer, offset, size); + } + + private int GetContiguousFreeSize(int alignment) + { + int alignedFreeOffset = BitUtils.AlignUp(_freeOffset, alignment); + int padding = alignedFreeOffset - _freeOffset; + + // Free regions: + // - Aligned free offset to end (minimum free size - padding) + // - 0 to _freeOffset + freeSize wrapped (only if free area contains 0) + + int endOffset = (_freeOffset + _freeSize) & (BufferSize - 1); + + return Math.Max( + Math.Min(_freeSize - padding, BufferSize - alignedFreeOffset), + endOffset <= _freeOffset ? Math.Min(_freeSize, endOffset) : 0 + ); + } + + /// + /// Reserve a range on the staging buffer for the current command buffer and upload data to it. + /// + /// Command buffer to reserve the data on + /// The minimum size the reserved data requires + /// The required alignment for the buffer offset + /// The reserved range of the staging buffer + public unsafe StagingBufferReserved? TryReserveData(CommandBufferScoped cbs, int size, int alignment) + { + if (size > BufferSize) + { + return null; + } + + // Temporary reserved data cannot be fragmented. + + if (GetContiguousFreeSize(alignment) < size) + { + FreeCompleted(); + + if (GetContiguousFreeSize(alignment) < size) + { + Logger.Debug?.PrintMsg(LogClass.Gpu, $"Staging buffer out of space to reserve data of size {size}."); + return null; + } + } + + return ReserveDataImpl(cbs, size, alignment); + } + + /// + /// Reserve a range on the staging buffer for the current command buffer and upload data to it. + /// Uses the most permissive byte alignment. + /// + /// Command buffer to reserve the data on + /// The minimum size the reserved data requires + /// The reserved range of the staging buffer + public unsafe StagingBufferReserved? TryReserveData(CommandBufferScoped cbs, int size) + { + return TryReserveData(cbs, size, _resourceAlignment); + } + + private bool WaitFreeCompleted(CommandBufferPool cbp) + { + if (_pendingCopies.TryPeek(out var pc)) + { + if (!pc.Fence.IsSignaled()) + { + if (cbp.IsFenceOnRentedCommandBuffer(pc.Fence)) + { + return false; + } + + pc.Fence.Wait(); + } + + var dequeued = _pendingCopies.Dequeue(); + Debug.Assert(dequeued.Fence == pc.Fence); + _freeSize += pc.Size; + pc.Fence.Put(); + } + + return true; + } + + public void FreeCompleted() + { + FenceHolder signalledFence = null; + while (_pendingCopies.TryPeek(out var pc) && (pc.Fence == signalledFence || pc.Fence.IsSignaled())) + { + signalledFence = pc.Fence; // Already checked - don't need to do it again. + var dequeued = _pendingCopies.Dequeue(); + Debug.Assert(dequeued.Fence == pc.Fence); + _freeSize += pc.Size; + pc.Fence.Put(); + } + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + _gd.BufferManager.Delete(Handle); + + while (_pendingCopies.TryDequeue(out var pc)) + { + pc.Fence.Put(); + } + } + } + + public void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/SyncManager.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/SyncManager.cs new file mode 100644 index 000000000..42133651e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/SyncManager.cs @@ -0,0 +1,215 @@ +using Ryujinx.Common.Logging; +using Silk.NET.Vulkan; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class SyncManager + { + private class SyncHandle + { + public ulong ID; + public MultiFenceHolder Waitable; + public ulong FlushId; + public bool Signalled; + + public bool NeedsFlush(ulong currentFlushId) + { + return (long)(FlushId - currentFlushId) >= 0; + } + } + + private ulong _firstHandle; + + private readonly VulkanRenderer _gd; + private readonly Device _device; + private readonly List _handles; + private ulong _flushId; + private long _waitTicks; + + public SyncManager(VulkanRenderer gd, Device device) + { + _gd = gd; + _device = device; + _handles = new List(); + } + + public void RegisterFlush() + { + _flushId++; + } + + public void Create(ulong id, bool strict) + { + ulong flushId = _flushId; + MultiFenceHolder waitable = new(); + if (strict || _gd.InterruptAction == null) + { + _gd.FlushAllCommands(); + _gd.CommandBufferPool.AddWaitable(waitable); + } + else + { + // Don't flush commands, instead wait for the current command buffer to finish. + // If this sync is waited on before the command buffer is submitted, interrupt the gpu thread and flush it manually. + + _gd.CommandBufferPool.AddInUseWaitable(waitable); + } + + SyncHandle handle = new() + { + ID = id, + Waitable = waitable, + FlushId = flushId, + }; + + lock (_handles) + { + _handles.Add(handle); + } + } + + public ulong GetCurrent() + { + lock (_handles) + { + ulong lastHandle = _firstHandle; + + foreach (SyncHandle handle in _handles) + { + lock (handle) + { + if (handle.Waitable == null) + { + continue; + } + + if (handle.ID > lastHandle) + { + bool signaled = handle.Signalled || handle.Waitable.WaitForFences(_gd.Api, _device, 0); + if (signaled) + { + lastHandle = handle.ID; + handle.Signalled = true; + } + } + } + } + + return lastHandle; + } + } + + public void Wait(ulong id) + { + SyncHandle result = null; + + lock (_handles) + { + if ((long)(_firstHandle - id) > 0) + { + return; // The handle has already been signalled or deleted. + } + + foreach (SyncHandle handle in _handles) + { + if (handle.ID == id) + { + result = handle; + break; + } + } + } + + if (result != null) + { + if (result.Waitable == null) + { + return; + } + + long beforeTicks = Stopwatch.GetTimestamp(); + + if (result.NeedsFlush(_flushId)) + { + _gd.InterruptAction(() => + { + if (result.NeedsFlush(_flushId)) + { + _gd.FlushAllCommands(); + } + }); + } + + lock (result) + { + if (result.Waitable == null) + { + return; + } + + bool signaled = result.Signalled || result.Waitable.WaitForFences(_gd.Api, _device, 1000000000); + + if (!signaled) + { + Logger.Error?.PrintMsg(LogClass.Gpu, $"VK Sync Object {result.ID} failed to signal within 1000ms. Continuing..."); + } + else + { + _waitTicks += Stopwatch.GetTimestamp() - beforeTicks; + result.Signalled = true; + } + } + } + } + + public void Cleanup() + { + // Iterate through handles and remove any that have already been signalled. + + while (true) + { + SyncHandle first = null; + lock (_handles) + { + first = _handles.FirstOrDefault(); + } + + if (first == null || first.NeedsFlush(_flushId)) + { + break; + } + + bool signaled = first.Waitable.WaitForFences(_gd.Api, _device, 0); + if (signaled) + { + // Delete the sync object. + lock (_handles) + { + lock (first) + { + _firstHandle = first.ID + 1; + _handles.RemoveAt(0); + first.Waitable = null; + } + } + } + else + { + // This sync handle and any following have not been reached yet. + break; + } + } + } + + public long GetAndResetWaitTicks() + { + long result = _waitTicks; + _waitTicks = 0; + + return result; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/TextureArray.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureArray.cs new file mode 100644 index 000000000..79624dd8b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureArray.cs @@ -0,0 +1,234 @@ +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class TextureArray : ResourceArray, ITextureArray + { + private readonly VulkanRenderer _gd; + + private struct TextureRef + { + public TextureStorage Storage; + public Auto View; + public Auto Sampler; + } + + private readonly TextureRef[] _textureRefs; + private readonly TextureBuffer[] _bufferTextureRefs; + + private readonly DescriptorImageInfo[] _textures; + private readonly BufferView[] _bufferTextures; + + private HashSet _storages; + + private int _cachedCommandBufferIndex; + private int _cachedSubmissionCount; + + private readonly bool _isBuffer; + + public TextureArray(VulkanRenderer gd, int size, bool isBuffer) + { + _gd = gd; + + if (isBuffer) + { + _bufferTextureRefs = new TextureBuffer[size]; + _bufferTextures = new BufferView[size]; + } + else + { + _textureRefs = new TextureRef[size]; + _textures = new DescriptorImageInfo[size]; + } + + _storages = null; + + _cachedCommandBufferIndex = -1; + _cachedSubmissionCount = 0; + + _isBuffer = isBuffer; + } + + public void SetSamplers(int index, ISampler[] samplers) + { + for (int i = 0; i < samplers.Length; i++) + { + ISampler sampler = samplers[i]; + + if (sampler is SamplerHolder samplerHolder) + { + _textureRefs[index + i].Sampler = samplerHolder.GetSampler(); + } + else + { + _textureRefs[index + i].Sampler = default; + } + } + + SetDirty(); + } + + public void SetTextures(int index, ITexture[] textures) + { + for (int i = 0; i < textures.Length; i++) + { + ITexture texture = textures[i]; + + if (texture is TextureBuffer textureBuffer) + { + _bufferTextureRefs[index + i] = textureBuffer; + } + else if (texture is TextureView view) + { + _textureRefs[index + i].Storage = view.Storage; + _textureRefs[index + i].View = view.GetImageView(); + } + else if (!_isBuffer) + { + _textureRefs[index + i].Storage = null; + _textureRefs[index + i].View = default; + } + else + { + _bufferTextureRefs[index + i] = null; + } + } + + SetDirty(); + } + + private void SetDirty() + { + _cachedCommandBufferIndex = -1; + _storages = null; + SetDirty(_gd, isImage: false); + } + + public void QueueWriteToReadBarriers(CommandBufferScoped cbs, PipelineStageFlags stageFlags) + { + HashSet storages = _storages; + + if (storages == null) + { + storages = new HashSet(); + + for (int index = 0; index < _textureRefs.Length; index++) + { + if (_textureRefs[index].Storage != null) + { + storages.Add(_textureRefs[index].Storage); + } + } + + _storages = storages; + } + + foreach (TextureStorage storage in storages) + { + storage.QueueWriteToReadBarrier(cbs, AccessFlags.ShaderReadBit, stageFlags); + } + } + + public ReadOnlySpan GetImageInfos(VulkanRenderer gd, CommandBufferScoped cbs, TextureView dummyTexture, SamplerHolder dummySampler) + { + int submissionCount = gd.CommandBufferPool.GetSubmissionCount(cbs.CommandBufferIndex); + + Span textures = _textures; + + if (cbs.CommandBufferIndex == _cachedCommandBufferIndex && submissionCount == _cachedSubmissionCount) + { + return textures; + } + + _cachedCommandBufferIndex = cbs.CommandBufferIndex; + _cachedSubmissionCount = submissionCount; + + for (int i = 0; i < textures.Length; i++) + { + ref var texture = ref textures[i]; + ref var refs = ref _textureRefs[i]; + + if (i > 0 && _textureRefs[i - 1].View == refs.View && _textureRefs[i - 1].Sampler == refs.Sampler) + { + texture = textures[i - 1]; + + continue; + } + + texture.ImageLayout = ImageLayout.General; + texture.ImageView = refs.View?.Get(cbs).Value ?? default; + texture.Sampler = refs.Sampler?.Get(cbs).Value ?? default; + + if (texture.ImageView.Handle == 0) + { + texture.ImageView = dummyTexture.GetImageView().Get(cbs).Value; + } + + if (texture.Sampler.Handle == 0) + { + texture.Sampler = dummySampler.GetSampler().Get(cbs).Value; + } + } + + return textures; + } + + public ReadOnlySpan GetBufferViews(CommandBufferScoped cbs) + { + Span bufferTextures = _bufferTextures; + + for (int i = 0; i < bufferTextures.Length; i++) + { + bufferTextures[i] = _bufferTextureRefs[i]?.GetBufferView(cbs, false) ?? default; + } + + return bufferTextures; + } + + public DescriptorSet[] GetDescriptorSets( + Device device, + CommandBufferScoped cbs, + DescriptorSetTemplateUpdater templateUpdater, + ShaderCollection program, + int setIndex, + TextureView dummyTexture, + SamplerHolder dummySampler) + { + if (TryGetCachedDescriptorSets(cbs, program, setIndex, out DescriptorSet[] sets)) + { + // We still need to ensure the current command buffer holds a reference to all used textures. + + if (!_isBuffer) + { + GetImageInfos(_gd, cbs, dummyTexture, dummySampler); + } + else + { + GetBufferViews(cbs); + } + + return sets; + } + + DescriptorSetTemplate template = program.Templates[setIndex]; + + DescriptorSetTemplateWriter tu = templateUpdater.Begin(template); + + if (!_isBuffer) + { + tu.Push(GetImageInfos(_gd, cbs, dummyTexture, dummySampler)); + } + else + { + tu.Push(GetBufferViews(cbs)); + } + + templateUpdater.Commit(_gd, device, sets[0]); + + return sets; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/TextureBuffer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureBuffer.cs new file mode 100644 index 000000000..2217ad4d6 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureBuffer.cs @@ -0,0 +1,130 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using Format = Ryujinx.Graphics.GAL.Format; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class TextureBuffer : ITexture + { + private readonly VulkanRenderer _gd; + + private BufferHandle _bufferHandle; + private int _offset; + private int _size; + private Auto _bufferView; + + private int _bufferCount; + + public int Width { get; } + public int Height { get; } + + public VkFormat VkFormat { get; } + + public TextureBuffer(VulkanRenderer gd, TextureCreateInfo info) + { + _gd = gd; + Width = info.Width; + Height = info.Height; + VkFormat = FormatTable.GetFormat(info.Format); + + gd.Textures.Add(this); + } + + public void CopyTo(ITexture destination, int firstLayer, int firstLevel) + { + throw new NotSupportedException(); + } + + public void CopyTo(ITexture destination, int srcLayer, int dstLayer, int srcLevel, int dstLevel) + { + throw new NotSupportedException(); + } + + public void CopyTo(ITexture destination, Extents2D srcRegion, Extents2D dstRegion, bool linearFilter) + { + throw new NotSupportedException(); + } + + public ITexture CreateView(TextureCreateInfo info, int firstLayer, int firstLevel) + { + throw new NotSupportedException(); + } + + public PinnedSpan GetData() + { + return _gd.GetBufferData(_bufferHandle, _offset, _size); + } + + public PinnedSpan GetData(int layer, int level) + { + return GetData(); + } + + public void CopyTo(BufferRange range, int layer, int level, int stride) + { + throw new NotImplementedException(); + } + + public void Release() + { + if (_gd.Textures.Remove(this)) + { + ReleaseImpl(); + } + } + + private void ReleaseImpl() + { + _bufferView?.Dispose(); + _bufferView = null; + } + + /// + public void SetData(MemoryOwner data) + { + _gd.SetBufferData(_bufferHandle, _offset, data.Span); + data.Dispose(); + } + + /// + public void SetData(MemoryOwner data, int layer, int level) + { + throw new NotSupportedException(); + } + + /// + public void SetData(MemoryOwner data, int layer, int level, Rectangle region) + { + throw new NotSupportedException(); + } + + public void SetStorage(BufferRange buffer) + { + if (_bufferHandle == buffer.Handle && + _offset == buffer.Offset && + _size == buffer.Size && + _bufferCount == _gd.BufferManager.BufferCount) + { + return; + } + + _bufferHandle = buffer.Handle; + _offset = buffer.Offset; + _size = buffer.Size; + _bufferCount = _gd.BufferManager.BufferCount; + + ReleaseImpl(); + } + + public BufferView GetBufferView(CommandBufferScoped cbs, bool write) + { + _bufferView ??= _gd.BufferManager.CreateView(_bufferHandle, VkFormat, _offset, _size, ReleaseImpl); + + return _bufferView?.Get(cbs, _offset, _size, write).Value ?? default; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/TextureCopy.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureCopy.cs new file mode 100644 index 000000000..64936fca0 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureCopy.cs @@ -0,0 +1,473 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Numerics; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class TextureCopy + { + public static void Blit( + Vk api, + CommandBuffer commandBuffer, + Image srcImage, + Image dstImage, + TextureCreateInfo srcInfo, + TextureCreateInfo dstInfo, + Extents2D srcRegion, + Extents2D dstRegion, + int srcLayer, + int dstLayer, + int srcLevel, + int dstLevel, + int layers, + int levels, + bool linearFilter, + ImageAspectFlags srcAspectFlags = 0, + ImageAspectFlags dstAspectFlags = 0) + { + static (Offset3D, Offset3D) ExtentsToOffset3D(Extents2D extents, int width, int height, int level) + { + static int Clamp(int value, int max) + { + return Math.Clamp(value, 0, max); + } + + var xy1 = new Offset3D(Clamp(extents.X1, width) >> level, Clamp(extents.Y1, height) >> level, 0); + var xy2 = new Offset3D(Clamp(extents.X2, width) >> level, Clamp(extents.Y2, height) >> level, 1); + + return (xy1, xy2); + } + + if (srcAspectFlags == 0) + { + srcAspectFlags = srcInfo.Format.ConvertAspectFlags(); + } + + if (dstAspectFlags == 0) + { + dstAspectFlags = dstInfo.Format.ConvertAspectFlags(); + } + + var srcOffsets = new ImageBlit.SrcOffsetsBuffer(); + var dstOffsets = new ImageBlit.DstOffsetsBuffer(); + + var filter = linearFilter && !dstInfo.Format.IsDepthOrStencil() ? Filter.Linear : Filter.Nearest; + + TextureView.InsertImageBarrier( + api, + commandBuffer, + srcImage, + TextureStorage.DefaultAccessMask, + AccessFlags.TransferReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + srcAspectFlags, + srcLayer, + srcLevel, + layers, + levels); + + uint copySrcLevel = (uint)srcLevel; + uint copyDstLevel = (uint)dstLevel; + + for (int level = 0; level < levels; level++) + { + var srcSl = new ImageSubresourceLayers(srcAspectFlags, copySrcLevel, (uint)srcLayer, (uint)layers); + var dstSl = new ImageSubresourceLayers(dstAspectFlags, copyDstLevel, (uint)dstLayer, (uint)layers); + + (srcOffsets.Element0, srcOffsets.Element1) = ExtentsToOffset3D(srcRegion, srcInfo.Width, srcInfo.Height, level); + (dstOffsets.Element0, dstOffsets.Element1) = ExtentsToOffset3D(dstRegion, dstInfo.Width, dstInfo.Height, level); + + var region = new ImageBlit + { + SrcSubresource = srcSl, + SrcOffsets = srcOffsets, + DstSubresource = dstSl, + DstOffsets = dstOffsets, + }; + + api.CmdBlitImage(commandBuffer, srcImage, ImageLayout.General, dstImage, ImageLayout.General, 1, in region, filter); + + copySrcLevel++; + copyDstLevel++; + + if (srcInfo.Target == Target.Texture3D || dstInfo.Target == Target.Texture3D) + { + layers = Math.Max(1, layers >> 1); + } + } + + TextureView.InsertImageBarrier( + api, + commandBuffer, + dstImage, + AccessFlags.TransferWriteBit, + TextureStorage.DefaultAccessMask, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + dstAspectFlags, + dstLayer, + dstLevel, + layers, + levels); + } + + public static void Copy( + Vk api, + CommandBuffer commandBuffer, + Image srcImage, + Image dstImage, + TextureCreateInfo srcInfo, + TextureCreateInfo dstInfo, + int srcViewLayer, + int dstViewLayer, + int srcViewLevel, + int dstViewLevel, + int srcLayer, + int dstLayer, + int srcLevel, + int dstLevel) + { + int srcDepth = srcInfo.GetDepthOrLayers(); + int srcLevels = srcInfo.Levels; + + int dstDepth = dstInfo.GetDepthOrLayers(); + int dstLevels = dstInfo.Levels; + + if (dstInfo.Target == Target.Texture3D) + { + dstDepth = Math.Max(1, dstDepth >> dstLevel); + } + + int depth = Math.Min(srcDepth, dstDepth); + int levels = Math.Min(srcLevels, dstLevels); + + Copy( + api, + commandBuffer, + srcImage, + dstImage, + srcInfo, + dstInfo, + srcViewLayer, + dstViewLayer, + srcViewLevel, + dstViewLevel, + srcLayer, + dstLayer, + srcLevel, + dstLevel, + depth, + levels); + } + + private static int ClampLevels(TextureCreateInfo info, int levels) + { + int width = info.Width; + int height = info.Height; + int depth = info.Target == Target.Texture3D ? info.Depth : 1; + + int maxLevels = 1 + BitOperations.Log2((uint)Math.Max(Math.Max(width, height), depth)); + + if (levels > maxLevels) + { + levels = maxLevels; + } + + return levels; + } + + public static void Copy( + Vk api, + CommandBuffer commandBuffer, + Image srcImage, + Image dstImage, + TextureCreateInfo srcInfo, + TextureCreateInfo dstInfo, + int srcViewLayer, + int dstViewLayer, + int srcViewLevel, + int dstViewLevel, + int srcDepthOrLayer, + int dstDepthOrLayer, + int srcLevel, + int dstLevel, + int depthOrLayers, + int levels) + { + int srcZ; + int srcLayer; + int srcDepth; + int srcLayers; + + if (srcInfo.Target == Target.Texture3D) + { + srcZ = srcDepthOrLayer; + srcLayer = 0; + srcDepth = depthOrLayers; + srcLayers = 1; + } + else + { + srcZ = 0; + srcLayer = srcDepthOrLayer; + srcDepth = 1; + srcLayers = depthOrLayers; + } + + int dstZ; + int dstLayer; + int dstLayers; + + if (dstInfo.Target == Target.Texture3D) + { + dstZ = dstDepthOrLayer; + dstLayer = 0; + dstLayers = 1; + } + else + { + dstZ = 0; + dstLayer = dstDepthOrLayer; + dstLayers = depthOrLayers; + } + + int srcWidth = srcInfo.Width; + int srcHeight = srcInfo.Height; + + int dstWidth = dstInfo.Width; + int dstHeight = dstInfo.Height; + + srcWidth = Math.Max(1, srcWidth >> srcLevel); + srcHeight = Math.Max(1, srcHeight >> srcLevel); + + dstWidth = Math.Max(1, dstWidth >> dstLevel); + dstHeight = Math.Max(1, dstHeight >> dstLevel); + + int blockWidth = 1; + int blockHeight = 1; + bool sizeInBlocks = false; + + // When copying from a compressed to a non-compressed format, + // the non-compressed texture will have the size of the texture + // in blocks (not in texels), so we must adjust that size to + // match the size in texels of the compressed texture. + if (!srcInfo.IsCompressed && dstInfo.IsCompressed) + { + srcWidth *= dstInfo.BlockWidth; + srcHeight *= dstInfo.BlockHeight; + blockWidth = dstInfo.BlockWidth; + blockHeight = dstInfo.BlockHeight; + + sizeInBlocks = true; + } + else if (srcInfo.IsCompressed && !dstInfo.IsCompressed) + { + dstWidth *= srcInfo.BlockWidth; + dstHeight *= srcInfo.BlockHeight; + blockWidth = srcInfo.BlockWidth; + blockHeight = srcInfo.BlockHeight; + } + + int width = Math.Min(srcWidth, dstWidth); + int height = Math.Min(srcHeight, dstHeight); + + ImageAspectFlags srcAspect = srcInfo.Format.ConvertAspectFlags(); + ImageAspectFlags dstAspect = dstInfo.Format.ConvertAspectFlags(); + + TextureView.InsertImageBarrier( + api, + commandBuffer, + srcImage, + TextureStorage.DefaultAccessMask, + AccessFlags.TransferReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + srcAspect, + srcViewLayer + srcLayer, + srcViewLevel + srcLevel, + srcLayers, + levels); + + for (int level = 0; level < levels; level++) + { + // Stop copy if we are already out of the levels range. + if (level >= srcInfo.Levels || dstLevel + level >= dstInfo.Levels) + { + break; + } + + var srcSl = new ImageSubresourceLayers( + srcAspect, + (uint)(srcViewLevel + srcLevel + level), + (uint)(srcViewLayer + srcLayer), + (uint)srcLayers); + + var dstSl = new ImageSubresourceLayers( + dstAspect, + (uint)(dstViewLevel + dstLevel + level), + (uint)(dstViewLayer + dstLayer), + (uint)dstLayers); + + int copyWidth = sizeInBlocks ? BitUtils.DivRoundUp(width, blockWidth) : width; + int copyHeight = sizeInBlocks ? BitUtils.DivRoundUp(height, blockHeight) : height; + + var extent = new Extent3D((uint)copyWidth, (uint)copyHeight, (uint)srcDepth); + + if (srcInfo.Samples > 1 && srcInfo.Samples != dstInfo.Samples) + { + var region = new ImageResolve(srcSl, new Offset3D(0, 0, srcZ), dstSl, new Offset3D(0, 0, dstZ), extent); + + api.CmdResolveImage(commandBuffer, srcImage, ImageLayout.General, dstImage, ImageLayout.General, 1, in region); + } + else + { + var region = new ImageCopy(srcSl, new Offset3D(0, 0, srcZ), dstSl, new Offset3D(0, 0, dstZ), extent); + + api.CmdCopyImage(commandBuffer, srcImage, ImageLayout.General, dstImage, ImageLayout.General, 1, in region); + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + + if (srcInfo.Target == Target.Texture3D) + { + srcDepth = Math.Max(1, srcDepth >> 1); + } + } + + TextureView.InsertImageBarrier( + api, + commandBuffer, + dstImage, + AccessFlags.TransferWriteBit, + TextureStorage.DefaultAccessMask, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + dstAspect, + dstViewLayer + dstLayer, + dstViewLevel + dstLevel, + dstLayers, + levels); + } + + public unsafe static void ResolveDepthStencil( + VulkanRenderer gd, + Device device, + CommandBufferScoped cbs, + TextureView src, + TextureView dst) + { + var dsAttachmentReference = new AttachmentReference2(StructureType.AttachmentReference2, null, 0, ImageLayout.General); + var dsResolveAttachmentReference = new AttachmentReference2(StructureType.AttachmentReference2, null, 1, ImageLayout.General); + + var subpassDsResolve = new SubpassDescriptionDepthStencilResolve + { + SType = StructureType.SubpassDescriptionDepthStencilResolve, + PDepthStencilResolveAttachment = &dsResolveAttachmentReference, + DepthResolveMode = ResolveModeFlags.SampleZeroBit, + StencilResolveMode = ResolveModeFlags.SampleZeroBit, + }; + + var subpass = new SubpassDescription2 + { + SType = StructureType.SubpassDescription2, + PipelineBindPoint = PipelineBindPoint.Graphics, + PDepthStencilAttachment = &dsAttachmentReference, + PNext = &subpassDsResolve, + }; + + AttachmentDescription2[] attachmentDescs = new AttachmentDescription2[2]; + + attachmentDescs[0] = new AttachmentDescription2( + StructureType.AttachmentDescription2, + null, + 0, + src.VkFormat, + TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)src.Info.Samples), + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + ImageLayout.General, + ImageLayout.General); + + attachmentDescs[1] = new AttachmentDescription2( + StructureType.AttachmentDescription2, + null, + 0, + dst.VkFormat, + TextureStorage.ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)dst.Info.Samples), + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + AttachmentLoadOp.Load, + AttachmentStoreOp.Store, + ImageLayout.General, + ImageLayout.General); + + var subpassDependency = PipelineConverter.CreateSubpassDependency2(gd); + + fixed (AttachmentDescription2* pAttachmentDescs = attachmentDescs) + { + var renderPassCreateInfo = new RenderPassCreateInfo2 + { + SType = StructureType.RenderPassCreateInfo2, + PAttachments = pAttachmentDescs, + AttachmentCount = (uint)attachmentDescs.Length, + PSubpasses = &subpass, + SubpassCount = 1, + PDependencies = &subpassDependency, + DependencyCount = 1, + }; + + gd.Api.CreateRenderPass2(device, in renderPassCreateInfo, null, out var renderPass).ThrowOnError(); + + using var rp = new Auto(new DisposableRenderPass(gd.Api, device, renderPass)); + + ImageView* attachments = stackalloc ImageView[2]; + + var srcView = src.GetImageViewForAttachment(); + var dstView = dst.GetImageViewForAttachment(); + + attachments[0] = srcView.Get(cbs).Value; + attachments[1] = dstView.Get(cbs).Value; + + var framebufferCreateInfo = new FramebufferCreateInfo + { + SType = StructureType.FramebufferCreateInfo, + RenderPass = rp.Get(cbs).Value, + AttachmentCount = 2, + PAttachments = attachments, + Width = (uint)src.Width, + Height = (uint)src.Height, + Layers = (uint)src.Layers, + }; + + gd.Api.CreateFramebuffer(device, in framebufferCreateInfo, null, out var framebuffer).ThrowOnError(); + using var fb = new Auto(new DisposableFramebuffer(gd.Api, device, framebuffer), null, srcView, dstView); + + var renderArea = new Rect2D(null, new Extent2D((uint)src.Info.Width, (uint)src.Info.Height)); + var clearValue = new ClearValue(); + + var renderPassBeginInfo = new RenderPassBeginInfo + { + SType = StructureType.RenderPassBeginInfo, + RenderPass = rp.Get(cbs).Value, + Framebuffer = fb.Get(cbs).Value, + RenderArea = renderArea, + PClearValues = &clearValue, + ClearValueCount = 1, + }; + + // The resolve operation happens at the end of the subpass, so let's just do a begin/end + // to resolve the depth-stencil texture. + // TODO: Do speculative resolve and part of the same render pass as the draw to avoid + // ending the current render pass? + gd.Api.CmdBeginRenderPass(cbs.CommandBuffer, in renderPassBeginInfo, SubpassContents.Inline); + gd.Api.CmdEndRenderPass(cbs.CommandBuffer); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/TextureStorage.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureStorage.cs new file mode 100644 index 000000000..794190817 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureStorage.cs @@ -0,0 +1,618 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Numerics; +using System.Runtime.CompilerServices; +using Format = Ryujinx.Graphics.GAL.Format; +using VkBuffer = Silk.NET.Vulkan.Buffer; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class TextureStorage : IDisposable + { + private struct TextureSliceInfo + { + public int BindCount; + } + + private const MemoryPropertyFlags DefaultImageMemoryFlags = + MemoryPropertyFlags.DeviceLocalBit; + + private const ImageUsageFlags DefaultUsageFlags = + ImageUsageFlags.SampledBit | + ImageUsageFlags.TransferSrcBit | + ImageUsageFlags.TransferDstBit; + + public const AccessFlags DefaultAccessMask = + AccessFlags.ShaderReadBit | + AccessFlags.ShaderWriteBit | + AccessFlags.ColorAttachmentReadBit | + AccessFlags.ColorAttachmentWriteBit | + AccessFlags.DepthStencilAttachmentReadBit | + AccessFlags.DepthStencilAttachmentWriteBit | + AccessFlags.TransferReadBit | + AccessFlags.TransferWriteBit; + + private readonly VulkanRenderer _gd; + + private readonly Device _device; + + private TextureCreateInfo _info; + + public TextureCreateInfo Info => _info; + + public bool Disposed { get; private set; } + + private readonly Image _image; + private readonly Auto _imageAuto; + private readonly Auto _allocationAuto; + private readonly int _depthOrLayers; + private Auto _foreignAllocationAuto; + + private Dictionary _aliasedStorages; + + private AccessFlags _lastModificationAccess; + private PipelineStageFlags _lastModificationStage; + private AccessFlags _lastReadAccess; + private PipelineStageFlags _lastReadStage; + + private int _viewsCount; + private readonly ulong _size; + + private int _bindCount; + private readonly TextureSliceInfo[] _slices; + + public VkFormat VkFormat { get; } + + public unsafe TextureStorage( + VulkanRenderer gd, + Device device, + TextureCreateInfo info, + Auto foreignAllocation = null) + { + _gd = gd; + _device = device; + _info = info; + + bool isMsImageStorageSupported = gd.Capabilities.SupportsShaderStorageImageMultisample || !info.Target.IsMultisample(); + + var format = _gd.FormatCapabilities.ConvertToVkFormat(info.Format, isMsImageStorageSupported); + var levels = (uint)info.Levels; + var layers = (uint)info.GetLayers(); + var depth = (uint)(info.Target == Target.Texture3D ? info.Depth : 1); + + VkFormat = format; + _depthOrLayers = info.GetDepthOrLayers(); + + var type = info.Target.Convert(); + + var extent = new Extent3D((uint)info.Width, (uint)info.Height, depth); + + var sampleCountFlags = ConvertToSampleCountFlags(gd.Capabilities.SupportedSampleCounts, (uint)info.Samples); + + var usage = GetImageUsage(info.Format, gd.Capabilities, isMsImageStorageSupported, true); + + var flags = ImageCreateFlags.CreateMutableFormatBit | ImageCreateFlags.CreateExtendedUsageBit; + + // This flag causes mipmapped texture arrays to break on AMD GCN, so for that copy dependencies are forced for aliasing as cube. + bool isCube = info.Target == Target.Cubemap || info.Target == Target.CubemapArray; + bool cubeCompatible = gd.IsAmdGcn ? isCube : (info.Width == info.Height && layers >= 6); + + if (type == ImageType.Type2D && cubeCompatible) + { + flags |= ImageCreateFlags.CreateCubeCompatibleBit; + } + + if (type == ImageType.Type3D && !gd.Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.No3DImageView)) + { + flags |= ImageCreateFlags.Create2DArrayCompatibleBit; + } + + var imageCreateInfo = new ImageCreateInfo + { + SType = StructureType.ImageCreateInfo, + ImageType = type, + Format = format, + Extent = extent, + MipLevels = levels, + ArrayLayers = layers, + Samples = sampleCountFlags, + Tiling = ImageTiling.Optimal, + Usage = usage, + SharingMode = SharingMode.Exclusive, + InitialLayout = ImageLayout.Undefined, + Flags = flags, + }; + + gd.Api.CreateImage(device, in imageCreateInfo, null, out _image).ThrowOnError(); + + if (foreignAllocation == null) + { + gd.Api.GetImageMemoryRequirements(device, _image, out var requirements); + var allocation = gd.MemoryAllocator.AllocateDeviceMemory(requirements, DefaultImageMemoryFlags); + + if (allocation.Memory.Handle == 0UL) + { + gd.Api.DestroyImage(device, _image, null); + throw new Exception("Image initialization failed."); + } + + _size = requirements.Size; + + gd.Api.BindImageMemory(device, _image, allocation.Memory, allocation.Offset).ThrowOnError(); + + _allocationAuto = new Auto(allocation); + _imageAuto = new Auto(new DisposableImage(_gd.Api, device, _image), null, _allocationAuto); + + InitialTransition(ImageLayout.Undefined, ImageLayout.General); + } + else + { + _foreignAllocationAuto = foreignAllocation; + foreignAllocation.IncrementReferenceCount(); + var allocation = foreignAllocation.GetUnsafe(); + + gd.Api.BindImageMemory(device, _image, allocation.Memory, allocation.Offset).ThrowOnError(); + + _imageAuto = new Auto(new DisposableImage(_gd.Api, device, _image)); + + InitialTransition(ImageLayout.Preinitialized, ImageLayout.General); + } + + _slices = new TextureSliceInfo[levels * _depthOrLayers]; + } + + public TextureStorage CreateAliasedColorForDepthStorageUnsafe(Format format) + { + var colorFormat = format switch + { + Format.S8Uint => Format.R8Unorm, + Format.D16Unorm => Format.R16Unorm, + Format.D24UnormS8Uint or Format.S8UintD24Unorm or Format.X8UintD24Unorm => Format.R8G8B8A8Unorm, + Format.D32Float => Format.R32Float, + Format.D32FloatS8Uint => Format.R32G32Float, + _ => throw new ArgumentException($"\"{format}\" is not a supported depth or stencil format."), + }; + + return CreateAliasedStorageUnsafe(colorFormat); + } + + public TextureStorage CreateAliasedStorageUnsafe(Format format) + { + if (_aliasedStorages == null || !_aliasedStorages.TryGetValue(format, out var storage)) + { + _aliasedStorages ??= new Dictionary(); + + var info = NewCreateInfoWith(ref _info, format, _info.BytesPerPixel); + + storage = new TextureStorage(_gd, _device, info, _allocationAuto); + + _aliasedStorages.Add(format, storage); + } + + return storage; + } + + public static TextureCreateInfo NewCreateInfoWith(ref TextureCreateInfo info, Format format, int bytesPerPixel) + { + return NewCreateInfoWith(ref info, format, bytesPerPixel, info.Width, info.Height); + } + + public static TextureCreateInfo NewCreateInfoWith( + ref TextureCreateInfo info, + Format format, + int bytesPerPixel, + int width, + int height) + { + return new TextureCreateInfo( + width, + height, + info.Depth, + info.Levels, + info.Samples, + info.BlockWidth, + info.BlockHeight, + bytesPerPixel, + format, + info.DepthStencilMode, + info.Target, + info.SwizzleR, + info.SwizzleG, + info.SwizzleB, + info.SwizzleA); + } + + public Auto GetImage() + { + return _imageAuto; + } + + public Image GetImageForViewCreation() + { + return _image; + } + + public bool HasCommandBufferDependency(CommandBufferScoped cbs) + { + if (_foreignAllocationAuto != null) + { + return _foreignAllocationAuto.HasCommandBufferDependency(cbs); + } + else if (_allocationAuto != null) + { + return _allocationAuto.HasCommandBufferDependency(cbs); + } + + return false; + } + + private unsafe void InitialTransition(ImageLayout srcLayout, ImageLayout dstLayout) + { + CommandBufferScoped cbs; + bool useTempCbs = !_gd.CommandBufferPool.OwnedByCurrentThread; + + if (useTempCbs) + { + cbs = _gd.BackgroundResources.Get().GetPool().Rent(); + } + else + { + if (_gd.PipelineInternal != null) + { + cbs = _gd.PipelineInternal.GetPreloadCommandBuffer(); + } + else + { + cbs = _gd.CommandBufferPool.Rent(); + useTempCbs = true; + } + } + + var aspectFlags = _info.Format.ConvertAspectFlags(); + + var subresourceRange = new ImageSubresourceRange(aspectFlags, 0, (uint)_info.Levels, 0, (uint)_info.GetLayers()); + + var barrier = new ImageMemoryBarrier + { + SType = StructureType.ImageMemoryBarrier, + SrcAccessMask = 0, + DstAccessMask = DefaultAccessMask, + OldLayout = srcLayout, + NewLayout = dstLayout, + SrcQueueFamilyIndex = Vk.QueueFamilyIgnored, + DstQueueFamilyIndex = Vk.QueueFamilyIgnored, + Image = _imageAuto.Get(cbs).Value, + SubresourceRange = subresourceRange, + }; + + _gd.Api.CmdPipelineBarrier( + cbs.CommandBuffer, + PipelineStageFlags.TopOfPipeBit, + PipelineStageFlags.AllCommandsBit, + 0, + 0, + null, + 0, + null, + 1, + in barrier); + + if (useTempCbs) + { + cbs.Dispose(); + } + } + + public static ImageUsageFlags GetImageUsage(Format format, in HardwareCapabilities capabilities, bool isMsImageStorageSupported, bool extendedUsage) + { + var usage = DefaultUsageFlags; + + if (format.IsDepthOrStencil()) + { + usage |= ImageUsageFlags.DepthStencilAttachmentBit; + } + else if (format.IsRtColorCompatible()) + { + usage |= ImageUsageFlags.ColorAttachmentBit; + } + + if ((format.IsImageCompatible() && isMsImageStorageSupported) || extendedUsage) + { + usage |= ImageUsageFlags.StorageBit; + } + + if (capabilities.SupportsAttachmentFeedbackLoop && + (usage & (ImageUsageFlags.DepthStencilAttachmentBit | ImageUsageFlags.ColorAttachmentBit)) != 0) + { + usage |= ImageUsageFlags.AttachmentFeedbackLoopBitExt; + } + + return usage; + } + + public static SampleCountFlags ConvertToSampleCountFlags(SampleCountFlags supportedSampleCounts, uint samples) + { + if (samples == 0 || samples > (uint)SampleCountFlags.Count64Bit) + { + return SampleCountFlags.Count1Bit; + } + + // Round up to the nearest power of two. + SampleCountFlags converted = (SampleCountFlags)(1u << (31 - BitOperations.LeadingZeroCount(samples))); + + // Pick nearest sample count that the host actually supports. + while (converted != SampleCountFlags.Count1Bit && (converted & supportedSampleCounts) == 0) + { + converted = (SampleCountFlags)((uint)converted >> 1); + } + + return converted; + } + + public TextureView CreateView(TextureCreateInfo info, int firstLayer, int firstLevel) + { + return new TextureView(_gd, _device, info, this, firstLayer, firstLevel); + } + + public void CopyFromOrToBuffer( + CommandBuffer commandBuffer, + VkBuffer buffer, + Image image, + int size, + bool to, + int x, + int y, + int dstLayer, + int dstLevel, + int dstLayers, + int dstLevels, + bool singleSlice, + ImageAspectFlags aspectFlags, + bool forFlush) + { + bool is3D = Info.Target == Target.Texture3D; + int width = Info.Width; + int height = Info.Height; + int depth = is3D && !singleSlice ? Info.Depth : 1; + int layer = is3D ? 0 : dstLayer; + int layers = dstLayers; + int levels = dstLevels; + + int offset = 0; + + for (int level = 0; level < levels; level++) + { + int mipSize = Info.GetMipSize(level); + + if (forFlush) + { + mipSize = GetBufferDataLength(mipSize); + } + + int endOffset = offset + mipSize; + + if ((uint)endOffset > (uint)size) + { + break; + } + + int rowLength = (Info.GetMipStride(level) / Info.BytesPerPixel) * Info.BlockWidth; + + var sl = new ImageSubresourceLayers( + aspectFlags, + (uint)(dstLevel + level), + (uint)layer, + (uint)layers); + + var extent = new Extent3D((uint)width, (uint)height, (uint)depth); + + int z = is3D ? dstLayer : 0; + + var region = new BufferImageCopy( + (ulong)offset, + (uint)BitUtils.AlignUp(rowLength, Info.BlockWidth), + (uint)BitUtils.AlignUp(height, Info.BlockHeight), + sl, + new Offset3D(x, y, z), + extent); + + if (to) + { + _gd.Api.CmdCopyImageToBuffer(commandBuffer, image, ImageLayout.General, buffer, 1, in region); + } + else + { + _gd.Api.CmdCopyBufferToImage(commandBuffer, buffer, image, ImageLayout.General, 1, in region); + } + + offset += mipSize; + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + + if (Info.Target == Target.Texture3D) + { + depth = Math.Max(1, depth >> 1); + } + } + } + + private int GetBufferDataLength(int length) + { + if (NeedsD24S8Conversion()) + { + return length * 2; + } + + return length; + } + + private bool NeedsD24S8Conversion() + { + return FormatCapabilities.IsD24S8(Info.Format) && VkFormat == VkFormat.D32SfloatS8Uint; + } + + public void AddStoreOpUsage(bool depthStencil) + { + _lastModificationStage = depthStencil ? + PipelineStageFlags.LateFragmentTestsBit : + PipelineStageFlags.ColorAttachmentOutputBit; + + _lastModificationAccess = depthStencil ? + AccessFlags.DepthStencilAttachmentWriteBit : + AccessFlags.ColorAttachmentWriteBit; + } + + public void QueueLoadOpBarrier(CommandBufferScoped cbs, bool depthStencil) + { + PipelineStageFlags srcStageFlags = _lastReadStage | _lastModificationStage; + PipelineStageFlags dstStageFlags = depthStencil ? + PipelineStageFlags.EarlyFragmentTestsBit | PipelineStageFlags.LateFragmentTestsBit : + PipelineStageFlags.ColorAttachmentOutputBit; + + AccessFlags srcAccessFlags = _lastModificationAccess | _lastReadAccess; + AccessFlags dstAccessFlags = depthStencil ? + AccessFlags.DepthStencilAttachmentWriteBit | AccessFlags.DepthStencilAttachmentReadBit : + AccessFlags.ColorAttachmentWriteBit | AccessFlags.ColorAttachmentReadBit; + + if (srcAccessFlags != AccessFlags.None) + { + ImageAspectFlags aspectFlags = Info.Format.ConvertAspectFlags(); + ImageMemoryBarrier barrier = TextureView.GetImageBarrier( + _imageAuto.Get(cbs).Value, + srcAccessFlags, + dstAccessFlags, + aspectFlags, + 0, + 0, + _info.GetLayers(), + _info.Levels); + + _gd.Barriers.QueueBarrier(barrier, this, srcStageFlags, dstStageFlags); + + _lastReadStage = PipelineStageFlags.None; + _lastReadAccess = AccessFlags.None; + } + + _lastModificationStage = depthStencil ? + PipelineStageFlags.LateFragmentTestsBit : + PipelineStageFlags.ColorAttachmentOutputBit; + + _lastModificationAccess = depthStencil ? + AccessFlags.DepthStencilAttachmentWriteBit : + AccessFlags.ColorAttachmentWriteBit; + } + + public void QueueWriteToReadBarrier(CommandBufferScoped cbs, AccessFlags dstAccessFlags, PipelineStageFlags dstStageFlags) + { + _lastReadAccess |= dstAccessFlags; + _lastReadStage |= dstStageFlags; + + if (_lastModificationAccess != AccessFlags.None) + { + ImageAspectFlags aspectFlags = Info.Format.ConvertAspectFlags(); + ImageMemoryBarrier barrier = TextureView.GetImageBarrier( + _imageAuto.Get(cbs).Value, + _lastModificationAccess, + dstAccessFlags, + aspectFlags, + 0, + 0, + _info.GetLayers(), + _info.Levels); + + _gd.Barriers.QueueBarrier(barrier, this, _lastModificationStage, dstStageFlags); + + _lastModificationAccess = AccessFlags.None; + } + } + + public void AddBinding(TextureView view) + { + // Assumes a view only has a first level. + + int index = view.FirstLevel * _depthOrLayers + view.FirstLayer; + int layers = view.Layers; + + for (int i = 0; i < layers; i++) + { + ref TextureSliceInfo info = ref _slices[index++]; + + info.BindCount++; + } + + _bindCount++; + } + + public void ClearBindings() + { + if (_bindCount != 0) + { + Array.Clear(_slices, 0, _slices.Length); + + _bindCount = 0; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool IsBound(TextureView view) + { + if (_bindCount != 0) + { + int index = view.FirstLevel * _depthOrLayers + view.FirstLayer; + int layers = view.Layers; + + for (int i = 0; i < layers; i++) + { + ref TextureSliceInfo info = ref _slices[index++]; + + if (info.BindCount != 0) + { + return true; + } + } + } + + return false; + } + + public void IncrementViewsCount() + { + _viewsCount++; + } + + public void DecrementViewsCount() + { + if (--_viewsCount == 0) + { + _gd.PipelineInternal?.FlushCommandsIfWeightExceeding(_imageAuto, _size); + + Dispose(); + } + } + + public void Dispose() + { + Disposed = true; + + if (_aliasedStorages != null) + { + foreach (var storage in _aliasedStorages.Values) + { + storage.Dispose(); + } + + _aliasedStorages.Clear(); + } + + _imageAuto.Dispose(); + _allocationAuto?.Dispose(); + _foreignAllocationAuto?.DecrementReferenceCount(); + _foreignAllocationAuto = null; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/TextureView.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureView.cs new file mode 100644 index 000000000..d721d62b8 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/TextureView.cs @@ -0,0 +1,1155 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using Format = Ryujinx.Graphics.GAL.Format; +using VkBuffer = Silk.NET.Vulkan.Buffer; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class TextureView : ITexture, IDisposable + { + private readonly VulkanRenderer _gd; + + private readonly Device _device; + + private readonly Auto _imageView; + private readonly Auto _imageViewDraw; + private readonly Auto _imageViewIdentity; + private readonly Auto _imageView2dArray; + private Dictionary _selfManagedViews; + + private int _hazardUses; + + private readonly TextureCreateInfo _info; + + private HashTableSlim _renderPasses; + + public TextureCreateInfo Info => _info; + + public TextureStorage Storage { get; } + + public int Width => Info.Width; + public int Height => Info.Height; + public int Layers => Info.GetDepthOrLayers(); + public int FirstLayer { get; } + public int FirstLevel { get; } + public VkFormat VkFormat { get; } + private int _isValid; + public bool Valid => Volatile.Read(ref _isValid) != 0; + + public TextureView( + VulkanRenderer gd, + Device device, + TextureCreateInfo info, + TextureStorage storage, + int firstLayer, + int firstLevel) + { + _gd = gd; + _device = device; + _info = info; + Storage = storage; + FirstLayer = firstLayer; + FirstLevel = firstLevel; + + storage.IncrementViewsCount(); + + gd.Textures.Add(this); + + bool isMsImageStorageSupported = gd.Capabilities.SupportsShaderStorageImageMultisample || !info.Target.IsMultisample(); + + var format = _gd.FormatCapabilities.ConvertToVkFormat(info.Format, isMsImageStorageSupported); + var usage = TextureStorage.GetImageUsage(info.Format, gd.Capabilities, isMsImageStorageSupported, false); + + var levels = (uint)info.Levels; + var layers = (uint)info.GetLayers(); + + VkFormat = format; + + var type = info.Target.ConvertView(); + + var swizzleR = info.SwizzleR.Convert(); + var swizzleG = info.SwizzleG.Convert(); + var swizzleB = info.SwizzleB.Convert(); + var swizzleA = info.SwizzleA.Convert(); + + if (info.Format == Format.R5G5B5A1Unorm || + info.Format == Format.R5G5B5X1Unorm || + info.Format == Format.R5G6B5Unorm) + { + (swizzleB, swizzleR) = (swizzleR, swizzleB); + } + else if (VkFormat == VkFormat.R4G4B4A4UnormPack16 || info.Format == Format.A1B5G5R5Unorm) + { + var tempB = swizzleB; + var tempA = swizzleA; + + swizzleB = swizzleG; + swizzleA = swizzleR; + swizzleR = tempA; + swizzleG = tempB; + } + + var componentMapping = new ComponentMapping(swizzleR, swizzleG, swizzleB, swizzleA); + + var aspectFlags = info.Format.ConvertAspectFlags(info.DepthStencilMode); + var aspectFlagsDepth = info.Format.ConvertAspectFlags(); + + var subresourceRange = new ImageSubresourceRange(aspectFlags, (uint)firstLevel, levels, (uint)firstLayer, layers); + var subresourceRangeDepth = new ImageSubresourceRange(aspectFlagsDepth, (uint)firstLevel, levels, (uint)firstLayer, layers); + + unsafe Auto CreateImageView(ComponentMapping cm, ImageSubresourceRange sr, ImageViewType viewType, ImageUsageFlags usageFlags) + { + var imageViewUsage = new ImageViewUsageCreateInfo + { + SType = StructureType.ImageViewUsageCreateInfo, + Usage = usageFlags, + }; + + var imageCreateInfo = new ImageViewCreateInfo + { + SType = StructureType.ImageViewCreateInfo, + Image = storage.GetImageForViewCreation(), + ViewType = viewType, + Format = format, + Components = cm, + SubresourceRange = sr, + PNext = &imageViewUsage, + }; + + gd.Api.CreateImageView(device, in imageCreateInfo, null, out var imageView).ThrowOnError(); + return new Auto(new DisposableImageView(gd.Api, device, imageView), null, storage.GetImage()); + } + + ImageUsageFlags shaderUsage = ImageUsageFlags.SampledBit; + + if (info.Format.IsImageCompatible() && (_gd.Capabilities.SupportsShaderStorageImageMultisample || !info.Target.IsMultisample())) + { + shaderUsage |= ImageUsageFlags.StorageBit; + } + + _imageView = CreateImageView(componentMapping, subresourceRange, type, shaderUsage); + + // Framebuffer attachments and storage images requires a identity component mapping. + var identityComponentMapping = new ComponentMapping( + ComponentSwizzle.R, + ComponentSwizzle.G, + ComponentSwizzle.B, + ComponentSwizzle.A); + + _imageViewDraw = CreateImageView(identityComponentMapping, subresourceRangeDepth, type, usage); + _imageViewIdentity = aspectFlagsDepth == aspectFlags ? _imageViewDraw : CreateImageView(identityComponentMapping, subresourceRange, type, usage); + + // Framebuffer attachments also require 3D textures to be bound as 2D array. + if (info.Target == Target.Texture3D) + { + if (gd.Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.No3DImageView)) + { + if (levels == 1 && (info.Format.IsRtColorCompatible() || info.Format.IsDepthOrStencil())) + { + subresourceRange = new ImageSubresourceRange(aspectFlags, (uint)firstLevel, levels, (uint)firstLayer, 1); + + _imageView2dArray = CreateImageView(identityComponentMapping, subresourceRange, ImageViewType.Type2D, ImageUsageFlags.ColorAttachmentBit); + } + } + else + { + subresourceRange = new ImageSubresourceRange(aspectFlags, (uint)firstLevel, 1, (uint)firstLayer, (uint)info.Depth); + + _imageView2dArray = CreateImageView(identityComponentMapping, subresourceRange, ImageViewType.Type2DArray, usage); + } + } + + _isValid = 1; + } + + /// + /// Create a texture view for an existing swapchain image view. + /// Does not set storage, so only appropriate for swapchain use. + /// + /// Do not use this for normal textures, and make sure uses do not try to read storage. + public TextureView(VulkanRenderer gd, Device device, DisposableImageView view, TextureCreateInfo info, VkFormat format) + { + _gd = gd; + _device = device; + + _imageView = new Auto(view); + _imageViewDraw = _imageView; + _imageViewIdentity = _imageView; + _info = info; + + VkFormat = format; + + _isValid = 1; + } + + public Auto GetImage() + { + return Storage.GetImage(); + } + + public Auto GetImageView() + { + return _imageView; + } + + public Auto GetIdentityImageView() + { + return _imageViewIdentity; + } + + public Auto GetImageViewForAttachment() + { + return _imageView2dArray ?? _imageViewDraw; + } + + public void CopyTo(ITexture destination, int firstLayer, int firstLevel) + { + var src = this; + var dst = (TextureView)destination; + + if (!Valid || !dst.Valid) + { + return; + } + + _gd.PipelineInternal.EndRenderPass(); + + var cbs = _gd.PipelineInternal.CurrentCommandBuffer; + + var srcImage = src.GetImage().Get(cbs).Value; + var dstImage = dst.GetImage().Get(cbs).Value; + + if (!dst.Info.Target.IsMultisample() && Info.Target.IsMultisample()) + { + int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); + _gd.HelperShader.CopyMSToNonMS(_gd, cbs, src, dst, 0, firstLayer, layers); + } + else if (dst.Info.Target.IsMultisample() && !Info.Target.IsMultisample()) + { + int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); + _gd.HelperShader.CopyNonMSToMS(_gd, cbs, src, dst, 0, firstLayer, layers); + } + else if (dst.Info.BytesPerPixel != Info.BytesPerPixel) + { + int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); + int levels = Math.Min(Info.Levels, dst.Info.Levels - firstLevel); + _gd.HelperShader.CopyIncompatibleFormats(_gd, cbs, src, dst, 0, firstLayer, 0, firstLevel, layers, levels); + } + else if (src.Info.Format.IsDepthOrStencil() != dst.Info.Format.IsDepthOrStencil()) + { + int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); + int levels = Math.Min(Info.Levels, dst.Info.Levels - firstLevel); + + _gd.HelperShader.CopyColor(_gd, cbs, src, dst, 0, firstLayer, 0, FirstLevel, layers, levels); + } + else + { + TextureCopy.Copy( + _gd.Api, + cbs.CommandBuffer, + srcImage, + dstImage, + src.Info, + dst.Info, + src.FirstLayer, + dst.FirstLayer, + src.FirstLevel, + dst.FirstLevel, + 0, + firstLayer, + 0, + firstLevel); + } + } + + public void CopyTo(ITexture destination, int srcLayer, int dstLayer, int srcLevel, int dstLevel) + { + var src = this; + var dst = (TextureView)destination; + + if (!Valid || !dst.Valid) + { + return; + } + + _gd.PipelineInternal.EndRenderPass(); + + var cbs = _gd.PipelineInternal.CurrentCommandBuffer; + + var srcImage = src.GetImage().Get(cbs).Value; + var dstImage = dst.GetImage().Get(cbs).Value; + + if (!dst.Info.Target.IsMultisample() && Info.Target.IsMultisample()) + { + _gd.HelperShader.CopyMSToNonMS(_gd, cbs, src, dst, srcLayer, dstLayer, 1); + } + else if (dst.Info.Target.IsMultisample() && !Info.Target.IsMultisample()) + { + _gd.HelperShader.CopyNonMSToMS(_gd, cbs, src, dst, srcLayer, dstLayer, 1); + } + else if (dst.Info.BytesPerPixel != Info.BytesPerPixel) + { + _gd.HelperShader.CopyIncompatibleFormats(_gd, cbs, src, dst, srcLayer, dstLayer, srcLevel, dstLevel, 1, 1); + } + else if (src.Info.Format.IsDepthOrStencil() != dst.Info.Format.IsDepthOrStencil()) + { + _gd.HelperShader.CopyColor(_gd, cbs, src, dst, srcLayer, dstLayer, srcLevel, dstLevel, 1, 1); + } + else + { + TextureCopy.Copy( + _gd.Api, + cbs.CommandBuffer, + srcImage, + dstImage, + src.Info, + dst.Info, + src.FirstLayer, + dst.FirstLayer, + src.FirstLevel, + dst.FirstLevel, + srcLayer, + dstLayer, + srcLevel, + dstLevel, + 1, + 1); + } + } + + public void CopyTo(ITexture destination, Extents2D srcRegion, Extents2D dstRegion, bool linearFilter) + { + var dst = (TextureView)destination; + + if (_gd.CommandBufferPool.OwnedByCurrentThread) + { + _gd.PipelineInternal.EndRenderPass(); + + var cbs = _gd.PipelineInternal.CurrentCommandBuffer; + + CopyToImpl(cbs, dst, srcRegion, dstRegion, linearFilter); + } + else + { + var cbp = _gd.BackgroundResources.Get().GetPool(); + + using var cbs = cbp.Rent(); + + CopyToImpl(cbs, dst, srcRegion, dstRegion, linearFilter); + } + } + + private void CopyToImpl(CommandBufferScoped cbs, TextureView dst, Extents2D srcRegion, Extents2D dstRegion, bool linearFilter) + { + var src = this; + + var srcFormat = GetCompatibleGalFormat(src.Info.Format); + var dstFormat = GetCompatibleGalFormat(dst.Info.Format); + + bool srcUsesStorageFormat = src.VkFormat == src.Storage.VkFormat; + bool dstUsesStorageFormat = dst.VkFormat == dst.Storage.VkFormat; + + int layers = Math.Min(dst.Info.GetDepthOrLayers(), src.Info.GetDepthOrLayers()); + int levels = Math.Min(dst.Info.Levels, src.Info.Levels); + + if (srcUsesStorageFormat && dstUsesStorageFormat) + { + if ((srcRegion.X1 | dstRegion.X1) == 0 && + (srcRegion.Y1 | dstRegion.Y1) == 0 && + srcRegion.X2 == src.Width && + srcRegion.Y2 == src.Height && + dstRegion.X2 == dst.Width && + dstRegion.Y2 == dst.Height && + src.Width == dst.Width && + src.Height == dst.Height && + src.VkFormat == dst.VkFormat) + { + if (src.Info.Samples > 1 && src.Info.Samples != dst.Info.Samples && src.Info.Format.IsDepthOrStencil()) + { + // CmdResolveImage does not support depth-stencil resolve, so we need to use an alternative path + // for those textures. + TextureCopy.ResolveDepthStencil(_gd, _device, cbs, src, dst); + } + else + { + TextureCopy.Copy( + _gd.Api, + cbs.CommandBuffer, + src.GetImage().Get(cbs).Value, + dst.GetImage().Get(cbs).Value, + src.Info, + dst.Info, + src.FirstLayer, + dst.FirstLayer, + src.FirstLevel, + dst.FirstLevel, + 0, + 0, + 0, + 0, + layers, + levels); + } + + return; + } + + if (_gd.FormatCapabilities.OptimalFormatSupports(FormatFeatureFlags.BlitSrcBit, srcFormat) && + _gd.FormatCapabilities.OptimalFormatSupports(FormatFeatureFlags.BlitDstBit, dstFormat)) + { + TextureCopy.Blit( + _gd.Api, + cbs.CommandBuffer, + src.GetImage().Get(cbs).Value, + dst.GetImage().Get(cbs).Value, + src.Info, + dst.Info, + srcRegion, + dstRegion, + src.FirstLayer, + dst.FirstLayer, + src.FirstLevel, + dst.FirstLevel, + layers, + levels, + linearFilter); + + return; + } + } + + bool isDepthOrStencil = dst.Info.Format.IsDepthOrStencil(); + + if (!VulkanConfiguration.UseUnsafeBlit || (_gd.Vendor != Vendor.Nvidia && _gd.Vendor != Vendor.Intel)) + { + _gd.HelperShader.Blit( + _gd, + src, + dst, + srcRegion, + dstRegion, + layers, + levels, + isDepthOrStencil, + linearFilter); + + return; + } + + Auto srcImage; + Auto dstImage; + + if (isDepthOrStencil) + { + srcImage = src.Storage.CreateAliasedColorForDepthStorageUnsafe(srcFormat).GetImage(); + dstImage = dst.Storage.CreateAliasedColorForDepthStorageUnsafe(dstFormat).GetImage(); + } + else + { + srcImage = src.Storage.CreateAliasedStorageUnsafe(srcFormat).GetImage(); + dstImage = dst.Storage.CreateAliasedStorageUnsafe(dstFormat).GetImage(); + } + + TextureCopy.Blit( + _gd.Api, + cbs.CommandBuffer, + srcImage.Get(cbs).Value, + dstImage.Get(cbs).Value, + src.Info, + dst.Info, + srcRegion, + dstRegion, + src.FirstLayer, + dst.FirstLayer, + src.FirstLevel, + dst.FirstLevel, + layers, + levels, + linearFilter, + ImageAspectFlags.ColorBit, + ImageAspectFlags.ColorBit); + } + + public static unsafe void InsertMemoryBarrier( + Vk api, + CommandBuffer commandBuffer, + AccessFlags srcAccessMask, + AccessFlags dstAccessMask, + PipelineStageFlags srcStageMask, + PipelineStageFlags dstStageMask) + { + MemoryBarrier memoryBarrier = new() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = srcAccessMask, + DstAccessMask = dstAccessMask, + }; + + api.CmdPipelineBarrier( + commandBuffer, + srcStageMask, + dstStageMask, + DependencyFlags.None, + 1, + in memoryBarrier, + 0, + null, + 0, + null); + } + + public static ImageMemoryBarrier GetImageBarrier( + Image image, + AccessFlags srcAccessMask, + AccessFlags dstAccessMask, + ImageAspectFlags aspectFlags, + int firstLayer, + int firstLevel, + int layers, + int levels) + { + return new() + { + SType = StructureType.ImageMemoryBarrier, + SrcAccessMask = srcAccessMask, + DstAccessMask = dstAccessMask, + SrcQueueFamilyIndex = Vk.QueueFamilyIgnored, + DstQueueFamilyIndex = Vk.QueueFamilyIgnored, + Image = image, + OldLayout = ImageLayout.General, + NewLayout = ImageLayout.General, + SubresourceRange = new ImageSubresourceRange(aspectFlags, (uint)firstLevel, (uint)levels, (uint)firstLayer, (uint)layers), + }; + } + + public static unsafe void InsertImageBarrier( + Vk api, + CommandBuffer commandBuffer, + Image image, + AccessFlags srcAccessMask, + AccessFlags dstAccessMask, + PipelineStageFlags srcStageMask, + PipelineStageFlags dstStageMask, + ImageAspectFlags aspectFlags, + int firstLayer, + int firstLevel, + int layers, + int levels) + { + ImageMemoryBarrier memoryBarrier = GetImageBarrier( + image, + srcAccessMask, + dstAccessMask, + aspectFlags, + firstLayer, + firstLevel, + layers, + levels); + + api.CmdPipelineBarrier( + commandBuffer, + srcStageMask, + dstStageMask, + 0, + 0, + null, + 0, + null, + 1, + in memoryBarrier); + } + + public TextureView GetView(Format format) + { + if (format == Info.Format) + { + return this; + } + + if (_selfManagedViews != null && _selfManagedViews.TryGetValue(format, out var view)) + { + return view; + } + + view = CreateViewImpl(new TextureCreateInfo( + Info.Width, + Info.Height, + Info.Depth, + Info.Levels, + Info.Samples, + Info.BlockWidth, + Info.BlockHeight, + Info.BytesPerPixel, + format, + Info.DepthStencilMode, + Info.Target, + Info.SwizzleR, + Info.SwizzleG, + Info.SwizzleB, + Info.SwizzleA), 0, 0); + + (_selfManagedViews ??= new Dictionary()).Add(format, view); + + return view; + } + + public ITexture CreateView(TextureCreateInfo info, int firstLayer, int firstLevel) + { + return CreateViewImpl(info, firstLayer, firstLevel); + } + + public TextureView CreateViewImpl(TextureCreateInfo info, int firstLayer, int firstLevel) + { + return new TextureView(_gd, _device, info, Storage, FirstLayer + firstLayer, FirstLevel + firstLevel); + } + + public byte[] GetData(int x, int y, int width, int height) + { + int size = width * height * Info.BytesPerPixel; + using var bufferHolder = _gd.BufferManager.Create(_gd, size); + + using (var cbs = _gd.CommandBufferPool.Rent()) + { + var buffer = bufferHolder.GetBuffer(cbs.CommandBuffer).Get(cbs).Value; + var image = GetImage().Get(cbs).Value; + + CopyFromOrToBuffer(cbs.CommandBuffer, buffer, image, size, true, 0, 0, x, y, width, height); + } + + bufferHolder.WaitForFences(); + byte[] bitmap = new byte[size]; + GetDataFromBuffer(bufferHolder.GetDataStorage(0, size), size, Span.Empty).CopyTo(bitmap); + return bitmap; + } + + public PinnedSpan GetData() + { + BackgroundResource resources = _gd.BackgroundResources.Get(); + + if (_gd.CommandBufferPool.OwnedByCurrentThread) + { + _gd.FlushAllCommands(); + + return PinnedSpan.UnsafeFromSpan(GetData(_gd.CommandBufferPool, resources.GetFlushBuffer())); + } + + return PinnedSpan.UnsafeFromSpan(GetData(resources.GetPool(), resources.GetFlushBuffer())); + } + + public PinnedSpan GetData(int layer, int level) + { + BackgroundResource resources = _gd.BackgroundResources.Get(); + + if (_gd.CommandBufferPool.OwnedByCurrentThread) + { + _gd.FlushAllCommands(); + + return PinnedSpan.UnsafeFromSpan(GetData(_gd.CommandBufferPool, resources.GetFlushBuffer(), layer, level)); + } + + return PinnedSpan.UnsafeFromSpan(GetData(resources.GetPool(), resources.GetFlushBuffer(), layer, level)); + } + + public void CopyTo(BufferRange range, int layer, int level, int stride) + { + _gd.PipelineInternal.EndRenderPass(); + var cbs = _gd.PipelineInternal.CurrentCommandBuffer; + + int outSize = Info.GetMipSize(level); + int hostSize = GetBufferDataLength(outSize); + + var image = GetImage().Get(cbs).Value; + int offset = range.Offset; + + Auto autoBuffer = _gd.BufferManager.GetBuffer(cbs.CommandBuffer, range.Handle, true); + VkBuffer buffer = autoBuffer.Get(cbs, range.Offset, outSize).Value; + + if (PrepareOutputBuffer(cbs, hostSize, buffer, out VkBuffer copyToBuffer, out BufferHolder tempCopyHolder)) + { + // No barrier necessary, as this is a temporary copy buffer. + offset = 0; + } + else + { + BufferHolder.InsertBufferBarrier( + _gd, + cbs.CommandBuffer, + copyToBuffer, + BufferHolder.DefaultAccessFlags, + AccessFlags.TransferWriteBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + offset, + outSize); + } + + InsertImageBarrier( + _gd.Api, + cbs.CommandBuffer, + image, + TextureStorage.DefaultAccessMask, + AccessFlags.TransferReadBit, + PipelineStageFlags.AllCommandsBit, + PipelineStageFlags.TransferBit, + Info.Format.ConvertAspectFlags(), + FirstLayer + layer, + FirstLevel + level, + 1, + 1); + + CopyFromOrToBuffer(cbs.CommandBuffer, copyToBuffer, image, hostSize, true, layer, level, 1, 1, singleSlice: true, offset, stride); + + if (tempCopyHolder != null) + { + CopyDataToOutputBuffer(cbs, tempCopyHolder, autoBuffer, hostSize, range.Offset); + tempCopyHolder.Dispose(); + } + else + { + BufferHolder.InsertBufferBarrier( + _gd, + cbs.CommandBuffer, + copyToBuffer, + AccessFlags.TransferWriteBit, + BufferHolder.DefaultAccessFlags, + PipelineStageFlags.TransferBit, + PipelineStageFlags.AllCommandsBit, + offset, + outSize); + } + } + + private ReadOnlySpan GetData(CommandBufferPool cbp, PersistentFlushBuffer flushBuffer) + { + int size = 0; + + for (int level = 0; level < Info.Levels; level++) + { + size += Info.GetMipSize(level); + } + + size = GetBufferDataLength(size); + + Span result = flushBuffer.GetTextureData(cbp, this, size); + return GetDataFromBuffer(result, size, result); + } + + private ReadOnlySpan GetData(CommandBufferPool cbp, PersistentFlushBuffer flushBuffer, int layer, int level) + { + int size = GetBufferDataLength(Info.GetMipSize(level)); + + Span result = flushBuffer.GetTextureData(cbp, this, size, layer, level); + return GetDataFromBuffer(result, size, result); + } + + /// + public void SetData(MemoryOwner data) + { + SetData(data.Span, 0, 0, Info.GetLayers(), Info.Levels, singleSlice: false); + data.Dispose(); + } + + /// + public void SetData(MemoryOwner data, int layer, int level) + { + SetData(data.Span, layer, level, 1, 1, singleSlice: true); + data.Dispose(); + } + + /// + public void SetData(MemoryOwner data, int layer, int level, Rectangle region) + { + SetData(data.Span, layer, level, 1, 1, singleSlice: true, region); + data.Dispose(); + } + + private void SetData(ReadOnlySpan data, int layer, int level, int layers, int levels, bool singleSlice, Rectangle? region = null) + { + int bufferDataLength = GetBufferDataLength(data.Length); + + using var bufferHolder = _gd.BufferManager.Create(_gd, bufferDataLength); + + Auto imageAuto = GetImage(); + + // Load texture data inline if the texture has been used on the current command buffer. + + bool loadInline = Storage.HasCommandBufferDependency(_gd.PipelineInternal.CurrentCommandBuffer); + + var cbs = loadInline ? _gd.PipelineInternal.CurrentCommandBuffer : _gd.PipelineInternal.GetPreloadCommandBuffer(); + + if (loadInline) + { + _gd.PipelineInternal.EndRenderPass(); + } + + CopyDataToBuffer(bufferHolder.GetDataStorage(0, bufferDataLength), data); + + var buffer = bufferHolder.GetBuffer(cbs.CommandBuffer).Get(cbs).Value; + var image = imageAuto.Get(cbs).Value; + + if (region.HasValue) + { + CopyFromOrToBuffer( + cbs.CommandBuffer, + buffer, + image, + bufferDataLength, + false, + layer, + level, + region.Value.X, + region.Value.Y, + region.Value.Width, + region.Value.Height); + } + else + { + CopyFromOrToBuffer(cbs.CommandBuffer, buffer, image, bufferDataLength, false, layer, level, layers, levels, singleSlice); + } + } + + private int GetBufferDataLength(int length) + { + if (NeedsD24S8Conversion()) + { + return length * 2; + } + + return length; + } + + private Format GetCompatibleGalFormat(Format format) + { + if (NeedsD24S8Conversion()) + { + return Format.D32FloatS8Uint; + } + + return format; + } + + private void CopyDataToBuffer(Span storage, ReadOnlySpan input) + { + if (NeedsD24S8Conversion()) + { + FormatConverter.ConvertD24S8ToD32FS8(storage, input); + return; + } + + input.CopyTo(storage); + } + + private ReadOnlySpan GetDataFromBuffer(ReadOnlySpan storage, int size, Span output) + { + if (NeedsD24S8Conversion()) + { + if (output.IsEmpty) + { + output = new byte[GetBufferDataLength(size)]; + } + + FormatConverter.ConvertD32FS8ToD24S8(output, storage); + return output; + } + + return storage; + } + + private bool PrepareOutputBuffer(CommandBufferScoped cbs, int hostSize, VkBuffer target, out VkBuffer copyTarget, out BufferHolder copyTargetHolder) + { + if (NeedsD24S8Conversion()) + { + copyTargetHolder = _gd.BufferManager.Create(_gd, hostSize); + copyTarget = copyTargetHolder.GetBuffer().Get(cbs, 0, hostSize).Value; + + return true; + } + + copyTarget = target; + copyTargetHolder = null; + + return false; + } + + private void CopyDataToOutputBuffer(CommandBufferScoped cbs, BufferHolder hostData, Auto copyTarget, int hostSize, int dstOffset) + { + if (NeedsD24S8Conversion()) + { + _gd.HelperShader.ConvertD32S8ToD24S8(_gd, cbs, hostData, copyTarget, hostSize / (2 * sizeof(int)), dstOffset); + } + } + + private bool NeedsD24S8Conversion() + { + return FormatCapabilities.IsD24S8(Info.Format) && VkFormat == VkFormat.D32SfloatS8Uint; + } + + public void CopyFromOrToBuffer( + CommandBuffer commandBuffer, + VkBuffer buffer, + Image image, + int size, + bool to, + int dstLayer, + int dstLevel, + int dstLayers, + int dstLevels, + bool singleSlice, + int offset = 0, + int stride = 0) + { + bool is3D = Info.Target == Target.Texture3D; + int width = Math.Max(1, Info.Width >> dstLevel); + int height = Math.Max(1, Info.Height >> dstLevel); + int depth = is3D && !singleSlice ? Math.Max(1, Info.Depth >> dstLevel) : 1; + int layer = is3D ? 0 : dstLayer; + int layers = dstLayers; + int levels = dstLevels; + + for (int level = 0; level < levels; level++) + { + int mipSize = GetBufferDataLength(is3D && !singleSlice + ? Info.GetMipSize(dstLevel + level) + : Info.GetMipSize2D(dstLevel + level) * dstLayers); + + int endOffset = offset + mipSize; + + if ((uint)endOffset > (uint)size) + { + break; + } + + int rowLength = ((stride == 0 ? Info.GetMipStride(dstLevel + level) : stride) / Info.BytesPerPixel) * Info.BlockWidth; + + var aspectFlags = Info.Format.ConvertAspectFlags(); + + if (aspectFlags == (ImageAspectFlags.DepthBit | ImageAspectFlags.StencilBit)) + { + aspectFlags = ImageAspectFlags.DepthBit; + } + + var sl = new ImageSubresourceLayers( + aspectFlags, + (uint)(FirstLevel + dstLevel + level), + (uint)(FirstLayer + layer), + (uint)layers); + + var extent = new Extent3D((uint)width, (uint)height, (uint)depth); + + int z = is3D ? dstLayer : 0; + + var region = new BufferImageCopy( + (ulong)offset, + (uint)AlignUpNpot(rowLength, Info.BlockWidth), + (uint)AlignUpNpot(height, Info.BlockHeight), + sl, + new Offset3D(0, 0, z), + extent); + + if (to) + { + _gd.Api.CmdCopyImageToBuffer(commandBuffer, image, ImageLayout.General, buffer, 1, in region); + } + else + { + _gd.Api.CmdCopyBufferToImage(commandBuffer, buffer, image, ImageLayout.General, 1, in region); + } + + offset += mipSize; + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + + if (Info.Target == Target.Texture3D) + { + depth = Math.Max(1, depth >> 1); + } + } + } + + private void CopyFromOrToBuffer( + CommandBuffer commandBuffer, + VkBuffer buffer, + Image image, + int size, + bool to, + int dstLayer, + int dstLevel, + int x, + int y, + int width, + int height) + { + var aspectFlags = Info.Format.ConvertAspectFlags(); + + if (aspectFlags == (ImageAspectFlags.DepthBit | ImageAspectFlags.StencilBit)) + { + aspectFlags = ImageAspectFlags.DepthBit; + } + + var sl = new ImageSubresourceLayers(aspectFlags, (uint)(FirstLevel + dstLevel), (uint)(FirstLayer + dstLayer), 1); + + var extent = new Extent3D((uint)width, (uint)height, 1); + + int rowLengthAlignment = Info.BlockWidth; + + // We expect all data being written into the texture to have a stride aligned by 4. + if (!to && Info.BytesPerPixel < 4) + { + rowLengthAlignment = 4 / Info.BytesPerPixel; + } + + var region = new BufferImageCopy( + 0, + (uint)AlignUpNpot(width, rowLengthAlignment), + (uint)AlignUpNpot(height, Info.BlockHeight), + sl, + new Offset3D(x, y, 0), + extent); + + if (to) + { + _gd.Api.CmdCopyImageToBuffer(commandBuffer, image, ImageLayout.General, buffer, 1, in region); + } + else + { + _gd.Api.CmdCopyBufferToImage(commandBuffer, buffer, image, ImageLayout.General, 1, in region); + } + } + + private static int AlignUpNpot(int size, int alignment) + { + int remainder = size % alignment; + if (remainder == 0) + { + return size; + } + + return size + (alignment - remainder); + } + + public void SetStorage(BufferRange buffer) + { + throw new NotImplementedException(); + } + + public void PrepareForUsage(CommandBufferScoped cbs, PipelineStageFlags flags, List feedbackLoopHazards) + { + Storage.QueueWriteToReadBarrier(cbs, AccessFlags.ShaderReadBit, flags); + + if (feedbackLoopHazards != null && Storage.IsBound(this)) + { + feedbackLoopHazards.Add(this); + _hazardUses++; + } + } + + public void ClearUsage(List feedbackLoopHazards) + { + if (_hazardUses != 0 && feedbackLoopHazards != null) + { + feedbackLoopHazards.Remove(this); + _hazardUses--; + } + } + + public void DecrementHazardUses() + { + if (_hazardUses != 0) + { + _hazardUses--; + } + } + + public (RenderPassHolder rpHolder, Auto framebuffer) GetPassAndFramebuffer( + VulkanRenderer gd, + Device device, + CommandBufferScoped cbs, + FramebufferParams fb) + { + var key = fb.GetRenderPassCacheKey(); + + if (_renderPasses == null || !_renderPasses.TryGetValue(ref key, out RenderPassHolder rpHolder)) + { + rpHolder = new RenderPassHolder(gd, device, key, fb); + } + + return (rpHolder, rpHolder.GetFramebuffer(gd, cbs, fb)); + } + + public void AddRenderPass(RenderPassCacheKey key, RenderPassHolder renderPass) + { + _renderPasses ??= new HashTableSlim(); + + _renderPasses.Add(ref key, renderPass); + } + + public void RemoveRenderPass(RenderPassCacheKey key) + { + _renderPasses.Remove(ref key); + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + bool wasValid = Interlocked.Exchange(ref _isValid, 0) != 0; + if (wasValid) + { + _gd.Textures.Remove(this); + + _imageView.Dispose(); + _imageView2dArray?.Dispose(); + + if (_imageViewIdentity != _imageView) + { + _imageViewIdentity.Dispose(); + } + + if (_imageViewDraw != _imageViewIdentity) + { + _imageViewDraw.Dispose(); + } + + Storage?.DecrementViewsCount(); + + if (_renderPasses != null) + { + var renderPasses = _renderPasses.Values.ToArray(); + + foreach (var pass in renderPasses) + { + pass.Dispose(); + } + } + + if (_selfManagedViews != null) + { + foreach (var view in _selfManagedViews.Values) + { + view.Dispose(); + } + + _selfManagedViews = null; + } + } + } + } + + public void Dispose() + { + Dispose(true); + } + + public void Release() + { + Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Vendor.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Vendor.cs new file mode 100644 index 000000000..e618d0f9e --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Vendor.cs @@ -0,0 +1,100 @@ +using Silk.NET.Vulkan; +using System.Text.RegularExpressions; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + enum Vendor + { + Amd, + ImgTec, + Intel, + Nvidia, + ARM, + Broadcom, + Qualcomm, + Apple, + Unknown, + } + + static partial class VendorUtils + { + [GeneratedRegex("Radeon (((HD|R(5|7|9|X)) )?((M?[2-6]\\d{2}(\\D|$))|([7-8]\\d{3}(\\D|$))|Fury|Nano))|(Pro Duo)")] + public static partial Regex AmdGcnRegex(); + + [GeneratedRegex("NVIDIA GeForce (R|G)?TX? (\\d{3}\\d?)M?")] + public static partial Regex NvidiaConsumerClassRegex(); + + public static Vendor FromId(uint id) + { + return id switch + { + 0x1002 => Vendor.Amd, + 0x1010 => Vendor.ImgTec, + 0x106B => Vendor.Apple, + 0x10DE => Vendor.Nvidia, + 0x13B5 => Vendor.ARM, + 0x14E4 => Vendor.Broadcom, + 0x8086 => Vendor.Intel, + 0x5143 => Vendor.Qualcomm, + _ => Vendor.Unknown, + }; + } + + public static string GetNameFromId(uint id) + { + return id switch + { + 0x1002 => "AMD", + 0x1010 => "ImgTec", + 0x106B => "Apple", + 0x10DE => "NVIDIA", + 0x13B5 => "ARM", + 0x14E4 => "Broadcom", + 0x1AE0 => "Google", + 0x5143 => "Qualcomm", + 0x8086 => "Intel", + 0x10001 => "Vivante", + 0x10002 => "VeriSilicon", + 0x10003 => "Kazan", + 0x10004 => "Codeplay Software Ltd.", + 0x10005 => "Mesa", + 0x10006 => "PoCL", + _ => $"0x{id:X}", + }; + } + + public static string GetFriendlyDriverName(DriverId id) + { + return id switch + { + DriverId.AmdProprietary => "AMD", + DriverId.AmdOpenSource => "AMD (Open)", + DriverId.MesaRadv => "RADV", + DriverId.NvidiaProprietary => "NVIDIA", + DriverId.IntelProprietaryWindows => "Intel", + DriverId.IntelOpenSourceMesa => "Intel (Open)", + DriverId.ImaginationProprietary => "Imagination", + DriverId.QualcommProprietary => "Qualcomm", + DriverId.ArmProprietary => "ARM", + DriverId.GoogleSwiftshader => "SwiftShader", + DriverId.GgpProprietary => "GGP", + DriverId.BroadcomProprietary => "Broadcom", + DriverId.MesaLlvmpipe => "LLVMpipe", + DriverId.Moltenvk => "MoltenVK", + DriverId.CoreaviProprietary => "CoreAVI", + DriverId.JuiceProprietary => "Juice", + DriverId.VerisiliconProprietary => "Verisilicon", + DriverId.MesaTurnip => "Turnip", + DriverId.MesaV3DV => "V3DV", + DriverId.MesaPanvk => "PanVK", + DriverId.SamsungProprietary => "Samsung", + DriverId.MesaVenus => "Venus", + DriverId.MesaDozen => "Dozen", + DriverId.MesaNvk => "NVK", + DriverId.ImaginationOpenSourceMesa => "Imagination (Open)", + DriverId.MesaAgxv => "Honeykrisp", + _ => id.ToString(), + }; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferState.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferState.cs new file mode 100644 index 000000000..a98b709bf --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferState.cs @@ -0,0 +1,139 @@ +using Ryujinx.Graphics.GAL; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal struct VertexBufferState + { + private const int VertexBufferMaxMirrorable = 0x20000; + + public static VertexBufferState Null => new(null, 0, 0, 0); + + private readonly int _offset; + private readonly int _size; + private readonly int _stride; + + private readonly BufferHandle _handle; + private Auto _buffer; + + internal readonly int DescriptorIndex; + internal int AttributeScalarAlignment; + + public VertexBufferState(Auto buffer, int descriptorIndex, int offset, int size, int stride = 0) + { + _buffer = buffer; + _handle = BufferHandle.Null; + + _offset = offset; + _size = size; + _stride = stride; + + DescriptorIndex = descriptorIndex; + AttributeScalarAlignment = 1; + + buffer?.IncrementReferenceCount(); + } + + public VertexBufferState(BufferHandle handle, int descriptorIndex, int offset, int size, int stride = 0) + { + // This buffer state may be rewritten at bind time, so it must be retrieved on bind. + + _buffer = null; + _handle = handle; + + _offset = offset; + _size = size; + _stride = stride; + + DescriptorIndex = descriptorIndex; + AttributeScalarAlignment = 1; + } + + public void BindVertexBuffer(VulkanRenderer gd, CommandBufferScoped cbs, uint binding, ref PipelineState state, VertexBufferUpdater updater) + { + var autoBuffer = _buffer; + + if (_handle != BufferHandle.Null) + { + // May need to restride the vertex buffer. + // + // Fix divide by zero when recovering from missed draw (Oct. 16 2024) + // (fixes crash in 'Baldo: The Guardian Owls' opening cutscene) + if (gd.NeedsVertexBufferAlignment(AttributeScalarAlignment, out int alignment) && alignment != 0 && (_stride % alignment) != 0) + { + autoBuffer = gd.BufferManager.GetAlignedVertexBuffer(cbs, _handle, _offset, _size, _stride, alignment); + + if (autoBuffer != null) + { + int stride = (_stride + (alignment - 1)) & -alignment; + int newSize = (_size / _stride) * stride; + + var buffer = autoBuffer.Get(cbs, 0, newSize).Value; + + updater.BindVertexBuffer(cbs, binding, buffer, 0, (ulong)newSize, (ulong)stride); + + _buffer = autoBuffer; + + state.Internal.VertexBindingDescriptions[DescriptorIndex].Stride = (uint)stride; + } + + return; + } + + autoBuffer = gd.BufferManager.GetBuffer(cbs.CommandBuffer, _handle, false, out int size); + + // The original stride must be reapplied in case it was rewritten. + state.Internal.VertexBindingDescriptions[DescriptorIndex].Stride = (uint)_stride; + + if (_offset >= size) + { + autoBuffer = null; + } + } + + if (autoBuffer != null) + { + int offset = _offset; + bool mirrorable = _size <= VertexBufferMaxMirrorable; + var buffer = mirrorable ? autoBuffer.GetMirrorable(cbs, ref offset, _size, out _).Value : autoBuffer.Get(cbs, offset, _size).Value; + + updater.BindVertexBuffer(cbs, binding, buffer, (ulong)offset, (ulong)_size, (ulong)_stride); + } + } + + public readonly bool BoundEquals(Auto buffer) + { + return _buffer == buffer; + } + + public readonly bool Overlaps(Auto buffer, int offset, int size) + { + return buffer == _buffer && offset < _offset + _size && offset + size > _offset; + } + + public readonly bool Matches(Auto buffer, int descriptorIndex, int offset, int size, int stride = 0) + { + return _buffer == buffer && DescriptorIndex == descriptorIndex && _offset == offset && _size == size && _stride == stride; + } + + public void Swap(Auto from, Auto to) + { + if (_buffer == from) + { + _buffer.DecrementReferenceCount(); + to.IncrementReferenceCount(); + + _buffer = to; + } + } + + public readonly void Dispose() + { + // Only dispose if this buffer is not refetched on each bind. + + if (_handle == BufferHandle.Null) + { + _buffer?.DecrementReferenceCount(); + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferUpdater.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferUpdater.cs new file mode 100644 index 000000000..04444e416 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VertexBufferUpdater.cs @@ -0,0 +1,82 @@ +using System; +using VkBuffer = Silk.NET.Vulkan.Buffer; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal class VertexBufferUpdater : IDisposable + { + private readonly VulkanRenderer _gd; + + private uint _baseBinding; + private uint _count; + + private readonly NativeArray _buffers; + private readonly NativeArray _offsets; + private readonly NativeArray _sizes; + private readonly NativeArray _strides; + + public VertexBufferUpdater(VulkanRenderer gd) + { + _gd = gd; + + _buffers = new NativeArray(Constants.MaxVertexBuffers); + _offsets = new NativeArray(Constants.MaxVertexBuffers); + _sizes = new NativeArray(Constants.MaxVertexBuffers); + _strides = new NativeArray(Constants.MaxVertexBuffers); + } + + public void BindVertexBuffer(CommandBufferScoped cbs, uint binding, VkBuffer buffer, ulong offset, ulong size, ulong stride) + { + if (_count == 0) + { + _baseBinding = binding; + } + else if (_baseBinding + _count != binding) + { + Commit(cbs); + _baseBinding = binding; + } + + int index = (int)_count; + + _buffers[index] = buffer; + _offsets[index] = offset; + _sizes[index] = size; + _strides[index] = stride; + + _count++; + } + + public unsafe void Commit(CommandBufferScoped cbs) + { + if (_count != 0) + { + if (_gd.Capabilities.SupportsExtendedDynamicState) + { + _gd.ExtendedDynamicStateApi.CmdBindVertexBuffers2( + cbs.CommandBuffer, + _baseBinding, + _count, + _buffers.Pointer, + _offsets.Pointer, + _sizes.Pointer, + _strides.Pointer); + } + else + { + _gd.Api.CmdBindVertexBuffers(cbs.CommandBuffer, _baseBinding, _count, _buffers.Pointer, _offsets.Pointer); + } + + _count = 0; + } + } + + public void Dispose() + { + _buffers.Dispose(); + _offsets.Dispose(); + _sizes.Dispose(); + _strides.Dispose(); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanConfiguration.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanConfiguration.cs new file mode 100644 index 000000000..e952586c9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanConfiguration.cs @@ -0,0 +1,12 @@ +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class VulkanConfiguration + { + public const bool UseFastBufferUpdates = true; + public const bool UseUnsafeBlit = true; + public const bool UsePushDescriptors = true; + + public const bool ForceD24S8Unsupported = false; + public const bool ForceRGB16IntFloatUnsupported = false; + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanDebugMessenger.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanDebugMessenger.cs new file mode 100644 index 000000000..28c8982e9 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanDebugMessenger.cs @@ -0,0 +1,133 @@ +using Ryujinx.Common.Configuration; +using Ryujinx.Common.Logging; +using Ryujinx.Common.Utilities; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.EXT; +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class VulkanDebugMessenger : IDisposable + { + private readonly Vk _api; + private readonly Instance _instance; + private readonly GraphicsDebugLevel _logLevel; + private readonly ExtDebugUtils _debugUtils; + private readonly DebugUtilsMessengerEXT? _debugUtilsMessenger; + private bool _disposed; + + public VulkanDebugMessenger(Vk api, Instance instance, GraphicsDebugLevel logLevel) + { + _api = api; + _instance = instance; + _logLevel = logLevel; + + _api.TryGetInstanceExtension(instance, out _debugUtils); + + Result result = TryInitialize(out _debugUtilsMessenger); + + if (result != Result.Success) + { + Logger.Error?.Print(LogClass.Gpu, $"Vulkan debug messenger initialization failed with error {result}"); + } + } + + private Result TryInitialize(out DebugUtilsMessengerEXT? debugUtilsMessengerHandle) + { + debugUtilsMessengerHandle = null; + + if (_debugUtils != null && _logLevel != GraphicsDebugLevel.None) + { + var messageType = _logLevel switch + { + GraphicsDebugLevel.Error => DebugUtilsMessageTypeFlagsEXT.ValidationBitExt, + GraphicsDebugLevel.Slowdowns => DebugUtilsMessageTypeFlagsEXT.ValidationBitExt | + DebugUtilsMessageTypeFlagsEXT.PerformanceBitExt, + GraphicsDebugLevel.All => DebugUtilsMessageTypeFlagsEXT.GeneralBitExt | + DebugUtilsMessageTypeFlagsEXT.ValidationBitExt | + DebugUtilsMessageTypeFlagsEXT.PerformanceBitExt, + _ => throw new ArgumentException($"Invalid log level \"{_logLevel}\"."), + }; + + var messageSeverity = _logLevel switch + { + GraphicsDebugLevel.Error => DebugUtilsMessageSeverityFlagsEXT.ErrorBitExt, + GraphicsDebugLevel.Slowdowns => DebugUtilsMessageSeverityFlagsEXT.ErrorBitExt | + DebugUtilsMessageSeverityFlagsEXT.WarningBitExt, + GraphicsDebugLevel.All => DebugUtilsMessageSeverityFlagsEXT.InfoBitExt | + DebugUtilsMessageSeverityFlagsEXT.WarningBitExt | + DebugUtilsMessageSeverityFlagsEXT.VerboseBitExt | + DebugUtilsMessageSeverityFlagsEXT.ErrorBitExt, + _ => throw new ArgumentException($"Invalid log level \"{_logLevel}\"."), + }; + + var debugUtilsMessengerCreateInfo = new DebugUtilsMessengerCreateInfoEXT + { + SType = StructureType.DebugUtilsMessengerCreateInfoExt, + MessageType = messageType, + MessageSeverity = messageSeverity, + }; + + unsafe + { + debugUtilsMessengerCreateInfo.PfnUserCallback = new PfnDebugUtilsMessengerCallbackEXT(UserCallback); + } + + DebugUtilsMessengerEXT messengerHandle = default; + + Result result = _debugUtils.CreateDebugUtilsMessenger(_instance, SpanHelpers.AsReadOnlySpan(ref debugUtilsMessengerCreateInfo), ReadOnlySpan.Empty, SpanHelpers.AsSpan(ref messengerHandle)); + + if (result == Result.Success) + { + debugUtilsMessengerHandle = messengerHandle; + } + + return result; + } + + return Result.Success; + } + + private unsafe static uint UserCallback( + DebugUtilsMessageSeverityFlagsEXT messageSeverity, + DebugUtilsMessageTypeFlagsEXT messageTypes, + DebugUtilsMessengerCallbackDataEXT* pCallbackData, + void* pUserData) + { + var msg = Marshal.PtrToStringAnsi((nint)pCallbackData->PMessage); + + if (messageSeverity.HasFlag(DebugUtilsMessageSeverityFlagsEXT.ErrorBitExt)) + { + Logger.Error?.Print(LogClass.Gpu, msg); + } + else if (messageSeverity.HasFlag(DebugUtilsMessageSeverityFlagsEXT.WarningBitExt)) + { + Logger.Warning?.Print(LogClass.Gpu, msg); + } + else if (messageSeverity.HasFlag(DebugUtilsMessageSeverityFlagsEXT.InfoBitExt)) + { + Logger.Info?.Print(LogClass.Gpu, msg); + } + else // if (messageSeverity.HasFlag(DebugUtilsMessageSeverityFlagsEXT.VerboseBitExt)) + { + Logger.Debug?.Print(LogClass.Gpu, msg); + } + + return 0; + } + + public void Dispose() + { + if (!_disposed) + { + if (_debugUtilsMessenger.HasValue) + { + _debugUtils.DestroyDebugUtilsMessenger(_instance, _debugUtilsMessenger.Value, Span.Empty); + } + + _disposed = true; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanException.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanException.cs new file mode 100644 index 000000000..37b901182 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanException.cs @@ -0,0 +1,43 @@ +using Silk.NET.Vulkan; +using System; +using System.Runtime.Serialization; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + static class ResultExtensions + { + public static bool IsError(this Result result) + { + // Only negative result codes are errors. + return result < Result.Success; + } + + public static void ThrowOnError(this Result result) + { + // Only negative result codes are errors. + if (result.IsError()) + { + throw new VulkanException(result); + } + } + } + + class VulkanException : Exception + { + public VulkanException() + { + } + + public VulkanException(Result result) : base($"Unexpected API error \"{result}\".") + { + } + + public VulkanException(string message) : base(message) + { + } + + public VulkanException(string message, Exception innerException) : base(message, innerException) + { + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInitialization.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInitialization.cs new file mode 100644 index 000000000..ddaa28980 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInitialization.cs @@ -0,0 +1,618 @@ +using Ryujinx.Common.Configuration; +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.EXT; +using Silk.NET.Vulkan.Extensions.KHR; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + public unsafe static class VulkanInitialization + { + private const uint InvalidIndex = uint.MaxValue; + private static readonly uint _minimalVulkanVersion = Vk.Version11.Value; + private static readonly uint _minimalInstanceVulkanVersion = Vk.Version12.Value; + private static readonly uint _maximumVulkanVersion = Vk.Version12.Value; + private const string AppName = "Ryujinx.Graphics.Rdna3Vulkan"; + private const int QueuesCount = 2; + + private static readonly string[] _desirableExtensions = { + ExtConditionalRendering.ExtensionName, + ExtExtendedDynamicState.ExtensionName, + ExtTransformFeedback.ExtensionName, + KhrDrawIndirectCount.ExtensionName, + KhrPushDescriptor.ExtensionName, + ExtExternalMemoryHost.ExtensionName, + "VK_EXT_blend_operation_advanced", + "VK_EXT_custom_border_color", + "VK_EXT_descriptor_indexing", // Enabling this works around an issue with disposed buffer bindings on RADV. + "VK_EXT_fragment_shader_interlock", + "VK_EXT_index_type_uint8", + "VK_EXT_primitive_topology_list_restart", + "VK_EXT_robustness2", + "VK_EXT_shader_stencil_export", + "VK_KHR_shader_float16_int8", + "VK_EXT_shader_subgroup_ballot", + "VK_NV_geometry_shader_passthrough", + "VK_NV_viewport_array2", + "VK_EXT_depth_clip_control", + "VK_KHR_portability_subset", // As per spec, we should enable this if present. + "VK_EXT_4444_formats", + "VK_KHR_8bit_storage", + "VK_KHR_maintenance2", + "VK_EXT_attachment_feedback_loop_layout", + "VK_EXT_attachment_feedback_loop_dynamic_state", + }; + + private static readonly string[] _requiredExtensions = { + KhrSwapchain.ExtensionName, + }; + + internal static VulkanInstance CreateInstance(Vk api, GraphicsDebugLevel logLevel, string[] requiredExtensions) + { + var enabledLayers = new List(); + + var instanceExtensions = VulkanInstance.GetInstanceExtensions(api); + var instanceLayers = VulkanInstance.GetInstanceLayers(api); + + void AddAvailableLayer(string layerName) + { + if (instanceLayers.Contains(layerName)) + { + enabledLayers.Add(layerName); + } + else + { + Logger.Warning?.Print(LogClass.Gpu, $"Missing layer {layerName}"); + } + } + + if (logLevel != GraphicsDebugLevel.None) + { + AddAvailableLayer("VK_LAYER_KHRONOS_validation"); + } + + var enabledExtensions = requiredExtensions; + + if (instanceExtensions.Contains("VK_EXT_debug_utils")) + { + enabledExtensions = enabledExtensions.Append(ExtDebugUtils.ExtensionName).ToArray(); + } + + var appName = Marshal.StringToHGlobalAnsi(AppName); + + var applicationInfo = new ApplicationInfo + { + PApplicationName = (byte*)appName, + ApplicationVersion = 1, + PEngineName = (byte*)appName, + EngineVersion = 1, + ApiVersion = _maximumVulkanVersion, + }; + + nint* ppEnabledExtensions = stackalloc nint[enabledExtensions.Length]; + nint* ppEnabledLayers = stackalloc nint[enabledLayers.Count]; + + for (int i = 0; i < enabledExtensions.Length; i++) + { + ppEnabledExtensions[i] = Marshal.StringToHGlobalAnsi(enabledExtensions[i]); + } + + for (int i = 0; i < enabledLayers.Count; i++) + { + ppEnabledLayers[i] = Marshal.StringToHGlobalAnsi(enabledLayers[i]); + } + + var instanceCreateInfo = new InstanceCreateInfo + { + SType = StructureType.InstanceCreateInfo, + PApplicationInfo = &applicationInfo, + PpEnabledExtensionNames = (byte**)ppEnabledExtensions, + PpEnabledLayerNames = (byte**)ppEnabledLayers, + EnabledExtensionCount = (uint)enabledExtensions.Length, + EnabledLayerCount = (uint)enabledLayers.Count, + }; + + Result result = VulkanInstance.Create(api, ref instanceCreateInfo, out var instance); + + Marshal.FreeHGlobal(appName); + + for (int i = 0; i < enabledExtensions.Length; i++) + { + Marshal.FreeHGlobal(ppEnabledExtensions[i]); + } + + for (int i = 0; i < enabledLayers.Count; i++) + { + Marshal.FreeHGlobal(ppEnabledLayers[i]); + } + + result.ThrowOnError(); + + return instance; + } + + internal static VulkanPhysicalDevice FindSuitablePhysicalDevice(Vk api, VulkanInstance instance, SurfaceKHR surface, string preferredGpuId) + { + instance.EnumeratePhysicalDevices(out var physicalDevices).ThrowOnError(); + + // First we try to pick the user preferred GPU. + for (int i = 0; i < physicalDevices.Length; i++) + { + if (IsPreferredAndSuitableDevice(api, physicalDevices[i], surface, preferredGpuId)) + { + return physicalDevices[i]; + } + } + + // If we fail to do that, just use the first compatible GPU. + for (int i = 0; i < physicalDevices.Length; i++) + { + if (IsSuitableDevice(api, physicalDevices[i], surface)) + { + return physicalDevices[i]; + } + } + + throw new VulkanException("Initialization failed, none of the available GPUs meets the minimum requirements."); + } + + internal static DeviceInfo[] GetSuitablePhysicalDevices(Vk api) + { + var appName = Marshal.StringToHGlobalAnsi(AppName); + + var applicationInfo = new ApplicationInfo + { + PApplicationName = (byte*)appName, + ApplicationVersion = 1, + PEngineName = (byte*)appName, + EngineVersion = 1, + ApiVersion = _maximumVulkanVersion, + }; + + var instanceCreateInfo = new InstanceCreateInfo + { + SType = StructureType.InstanceCreateInfo, + PApplicationInfo = &applicationInfo, + PpEnabledExtensionNames = null, + PpEnabledLayerNames = null, + EnabledExtensionCount = 0, + EnabledLayerCount = 0, + }; + + Result result = VulkanInstance.Create(api, ref instanceCreateInfo, out var rawInstance); + + Marshal.FreeHGlobal(appName); + + result.ThrowOnError(); + + using VulkanInstance instance = rawInstance; + + // We currently assume that the instance is compatible with Vulkan 1.2 + // TODO: Remove this once we relax our initialization codepaths. + if (instance.InstanceVersion < _minimalInstanceVulkanVersion) + { + return Array.Empty(); + } + + instance.EnumeratePhysicalDevices(out VulkanPhysicalDevice[] physicalDevices).ThrowOnError(); + + List deviceInfos = new(); + + foreach (VulkanPhysicalDevice physicalDevice in physicalDevices) + { + if (physicalDevice.PhysicalDeviceProperties.ApiVersion < _minimalVulkanVersion) + { + continue; + } + + deviceInfos.Add(physicalDevice.ToDeviceInfo()); + } + + return deviceInfos.ToArray(); + } + + private static bool IsPreferredAndSuitableDevice(Vk api, VulkanPhysicalDevice physicalDevice, SurfaceKHR surface, string preferredGpuId) + { + if (physicalDevice.Id != preferredGpuId) + { + return false; + } + + return IsSuitableDevice(api, physicalDevice, surface); + } + + private static bool IsSuitableDevice(Vk api, VulkanPhysicalDevice physicalDevice, SurfaceKHR surface) + { + int extensionMatches = 0; + + foreach (string requiredExtension in _requiredExtensions) + { + if (physicalDevice.IsDeviceExtensionPresent(requiredExtension)) + { + extensionMatches++; + } + } + + return extensionMatches == _requiredExtensions.Length && FindSuitableQueueFamily(api, physicalDevice, surface, out _) != InvalidIndex; + } + + internal static uint FindSuitableQueueFamily(Vk api, VulkanPhysicalDevice physicalDevice, SurfaceKHR surface, out uint queueCount) + { + const QueueFlags RequiredFlags = QueueFlags.GraphicsBit | QueueFlags.ComputeBit; + + var khrSurface = new KhrSurface(api.Context); + + for (uint index = 0; index < physicalDevice.QueueFamilyProperties.Length; index++) + { + ref QueueFamilyProperties property = ref physicalDevice.QueueFamilyProperties[index]; + + khrSurface.GetPhysicalDeviceSurfaceSupport(physicalDevice.PhysicalDevice, index, surface, out var surfaceSupported).ThrowOnError(); + + if (property.QueueFlags.HasFlag(RequiredFlags) && surfaceSupported) + { + queueCount = property.QueueCount; + + return index; + } + } + + queueCount = 0; + + return InvalidIndex; + } + + internal static Device CreateDevice(Vk api, VulkanPhysicalDevice physicalDevice, uint queueFamilyIndex, uint queueCount) + { + if (queueCount > QueuesCount) + { + queueCount = QueuesCount; + } + + float* queuePriorities = stackalloc float[(int)queueCount]; + + for (int i = 0; i < queueCount; i++) + { + queuePriorities[i] = 1f; + } + + var queueCreateInfo = new DeviceQueueCreateInfo + { + SType = StructureType.DeviceQueueCreateInfo, + QueueFamilyIndex = queueFamilyIndex, + QueueCount = queueCount, + PQueuePriorities = queuePriorities, + }; + + bool useRobustBufferAccess = VendorUtils.FromId(physicalDevice.PhysicalDeviceProperties.VendorID) == Vendor.Nvidia; + + PhysicalDeviceFeatures2 features2 = new() + { + SType = StructureType.PhysicalDeviceFeatures2, + }; + + PhysicalDeviceVulkan11Features supportedFeaturesVk11 = new() + { + SType = StructureType.PhysicalDeviceVulkan11Features, + PNext = features2.PNext, + }; + + features2.PNext = &supportedFeaturesVk11; + + PhysicalDeviceCustomBorderColorFeaturesEXT supportedFeaturesCustomBorderColor = new() + { + SType = StructureType.PhysicalDeviceCustomBorderColorFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_custom_border_color")) + { + features2.PNext = &supportedFeaturesCustomBorderColor; + } + + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT supportedFeaturesPrimitiveTopologyListRestart = new() + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_primitive_topology_list_restart")) + { + features2.PNext = &supportedFeaturesPrimitiveTopologyListRestart; + } + + PhysicalDeviceTransformFeedbackFeaturesEXT supportedFeaturesTransformFeedback = new() + { + SType = StructureType.PhysicalDeviceTransformFeedbackFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent(ExtTransformFeedback.ExtensionName)) + { + features2.PNext = &supportedFeaturesTransformFeedback; + } + + PhysicalDeviceRobustness2FeaturesEXT supportedFeaturesRobustness2 = new() + { + SType = StructureType.PhysicalDeviceRobustness2FeaturesExt, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_robustness2")) + { + supportedFeaturesRobustness2.PNext = features2.PNext; + + features2.PNext = &supportedFeaturesRobustness2; + } + + PhysicalDeviceDepthClipControlFeaturesEXT supportedFeaturesDepthClipControl = new() + { + SType = StructureType.PhysicalDeviceDepthClipControlFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_depth_clip_control")) + { + features2.PNext = &supportedFeaturesDepthClipControl; + } + + PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT supportedFeaturesAttachmentFeedbackLoopLayout = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_layout")) + { + features2.PNext = &supportedFeaturesAttachmentFeedbackLoopLayout; + } + + PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT supportedFeaturesDynamicAttachmentFeedbackLoopLayout = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesExt, + PNext = features2.PNext, + }; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_dynamic_state")) + { + features2.PNext = &supportedFeaturesDynamicAttachmentFeedbackLoopLayout; + } + + PhysicalDeviceVulkan12Features supportedPhysicalDeviceVulkan12Features = new() + { + SType = StructureType.PhysicalDeviceVulkan12Features, + PNext = features2.PNext, + }; + + features2.PNext = &supportedPhysicalDeviceVulkan12Features; + + api.GetPhysicalDeviceFeatures2(physicalDevice.PhysicalDevice, &features2); + + var supportedFeatures = features2.Features; + + var features = new PhysicalDeviceFeatures + { + DepthBiasClamp = supportedFeatures.DepthBiasClamp, + DepthClamp = supportedFeatures.DepthClamp, + DualSrcBlend = supportedFeatures.DualSrcBlend, + FragmentStoresAndAtomics = supportedFeatures.FragmentStoresAndAtomics, + GeometryShader = supportedFeatures.GeometryShader, + ImageCubeArray = supportedFeatures.ImageCubeArray, + IndependentBlend = supportedFeatures.IndependentBlend, + LogicOp = supportedFeatures.LogicOp, + OcclusionQueryPrecise = supportedFeatures.OcclusionQueryPrecise, + MultiViewport = supportedFeatures.MultiViewport, + PipelineStatisticsQuery = supportedFeatures.PipelineStatisticsQuery, + SamplerAnisotropy = supportedFeatures.SamplerAnisotropy, + ShaderClipDistance = supportedFeatures.ShaderClipDistance, + ShaderFloat64 = supportedFeatures.ShaderFloat64, + ShaderImageGatherExtended = supportedFeatures.ShaderImageGatherExtended, + ShaderStorageImageMultisample = supportedFeatures.ShaderStorageImageMultisample, + ShaderStorageImageReadWithoutFormat = supportedFeatures.ShaderStorageImageReadWithoutFormat, + ShaderStorageImageWriteWithoutFormat = supportedFeatures.ShaderStorageImageWriteWithoutFormat, + TessellationShader = supportedFeatures.TessellationShader, + VertexPipelineStoresAndAtomics = supportedFeatures.VertexPipelineStoresAndAtomics, + RobustBufferAccess = useRobustBufferAccess, + SampleRateShading = supportedFeatures.SampleRateShading, + }; + + void* pExtendedFeatures = null; + + PhysicalDeviceTransformFeedbackFeaturesEXT featuresTransformFeedback; + + if (physicalDevice.IsDeviceExtensionPresent(ExtTransformFeedback.ExtensionName)) + { + featuresTransformFeedback = new PhysicalDeviceTransformFeedbackFeaturesEXT + { + SType = StructureType.PhysicalDeviceTransformFeedbackFeaturesExt, + PNext = pExtendedFeatures, + TransformFeedback = supportedFeaturesTransformFeedback.TransformFeedback, + }; + + pExtendedFeatures = &featuresTransformFeedback; + } + + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT featuresPrimitiveTopologyListRestart; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_primitive_topology_list_restart")) + { + featuresPrimitiveTopologyListRestart = new PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt, + PNext = pExtendedFeatures, + PrimitiveTopologyListRestart = supportedFeaturesPrimitiveTopologyListRestart.PrimitiveTopologyListRestart, + PrimitiveTopologyPatchListRestart = supportedFeaturesPrimitiveTopologyListRestart.PrimitiveTopologyPatchListRestart, + }; + + pExtendedFeatures = &featuresPrimitiveTopologyListRestart; + } + + PhysicalDeviceRobustness2FeaturesEXT featuresRobustness2; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_robustness2")) + { + featuresRobustness2 = new PhysicalDeviceRobustness2FeaturesEXT + { + SType = StructureType.PhysicalDeviceRobustness2FeaturesExt, + PNext = pExtendedFeatures, + NullDescriptor = supportedFeaturesRobustness2.NullDescriptor, + }; + + pExtendedFeatures = &featuresRobustness2; + } + + var featuresExtendedDynamicState = new PhysicalDeviceExtendedDynamicStateFeaturesEXT + { + SType = StructureType.PhysicalDeviceExtendedDynamicStateFeaturesExt, + PNext = pExtendedFeatures, + ExtendedDynamicState = physicalDevice.IsDeviceExtensionPresent(ExtExtendedDynamicState.ExtensionName), + }; + + pExtendedFeatures = &featuresExtendedDynamicState; + + var featuresVk11 = new PhysicalDeviceVulkan11Features + { + SType = StructureType.PhysicalDeviceVulkan11Features, + PNext = pExtendedFeatures, + ShaderDrawParameters = supportedFeaturesVk11.ShaderDrawParameters, + }; + + pExtendedFeatures = &featuresVk11; + + var featuresVk12 = new PhysicalDeviceVulkan12Features + { + SType = StructureType.PhysicalDeviceVulkan12Features, + PNext = pExtendedFeatures, + DescriptorIndexing = supportedPhysicalDeviceVulkan12Features.DescriptorIndexing, + DrawIndirectCount = supportedPhysicalDeviceVulkan12Features.DrawIndirectCount, + UniformBufferStandardLayout = supportedPhysicalDeviceVulkan12Features.UniformBufferStandardLayout, + UniformAndStorageBuffer8BitAccess = supportedPhysicalDeviceVulkan12Features.UniformAndStorageBuffer8BitAccess, + StorageBuffer8BitAccess = supportedPhysicalDeviceVulkan12Features.StorageBuffer8BitAccess, + }; + + pExtendedFeatures = &featuresVk12; + + PhysicalDeviceIndexTypeUint8FeaturesEXT featuresIndexU8; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_index_type_uint8")) + { + featuresIndexU8 = new PhysicalDeviceIndexTypeUint8FeaturesEXT + { + SType = StructureType.PhysicalDeviceIndexTypeUint8FeaturesExt, + PNext = pExtendedFeatures, + IndexTypeUint8 = true, + }; + + pExtendedFeatures = &featuresIndexU8; + } + + PhysicalDeviceFragmentShaderInterlockFeaturesEXT featuresFragmentShaderInterlock; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_fragment_shader_interlock")) + { + featuresFragmentShaderInterlock = new PhysicalDeviceFragmentShaderInterlockFeaturesEXT + { + SType = StructureType.PhysicalDeviceFragmentShaderInterlockFeaturesExt, + PNext = pExtendedFeatures, + FragmentShaderPixelInterlock = true, + }; + + pExtendedFeatures = &featuresFragmentShaderInterlock; + } + + PhysicalDeviceCustomBorderColorFeaturesEXT featuresCustomBorderColor; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_custom_border_color") && + supportedFeaturesCustomBorderColor.CustomBorderColors && + supportedFeaturesCustomBorderColor.CustomBorderColorWithoutFormat) + { + featuresCustomBorderColor = new PhysicalDeviceCustomBorderColorFeaturesEXT + { + SType = StructureType.PhysicalDeviceCustomBorderColorFeaturesExt, + PNext = pExtendedFeatures, + CustomBorderColors = true, + CustomBorderColorWithoutFormat = true, + }; + + pExtendedFeatures = &featuresCustomBorderColor; + } + + PhysicalDeviceDepthClipControlFeaturesEXT featuresDepthClipControl; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_depth_clip_control") && + supportedFeaturesDepthClipControl.DepthClipControl) + { + featuresDepthClipControl = new PhysicalDeviceDepthClipControlFeaturesEXT + { + SType = StructureType.PhysicalDeviceDepthClipControlFeaturesExt, + PNext = pExtendedFeatures, + DepthClipControl = true, + }; + + pExtendedFeatures = &featuresDepthClipControl; + } + + PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT featuresAttachmentFeedbackLoopLayout; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_layout") && + supportedFeaturesAttachmentFeedbackLoopLayout.AttachmentFeedbackLoopLayout) + { + featuresAttachmentFeedbackLoopLayout = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesExt, + PNext = pExtendedFeatures, + AttachmentFeedbackLoopLayout = true, + }; + + pExtendedFeatures = &featuresAttachmentFeedbackLoopLayout; + } + + PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT featuresDynamicAttachmentFeedbackLoopLayout; + + if (physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_dynamic_state") && + supportedFeaturesDynamicAttachmentFeedbackLoopLayout.AttachmentFeedbackLoopDynamicState) + { + featuresDynamicAttachmentFeedbackLoopLayout = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesExt, + PNext = pExtendedFeatures, + AttachmentFeedbackLoopDynamicState = true, + }; + + pExtendedFeatures = &featuresDynamicAttachmentFeedbackLoopLayout; + } + + var enabledExtensions = _requiredExtensions.Union(_desirableExtensions.Intersect(physicalDevice.DeviceExtensions)).ToArray(); + + nint* ppEnabledExtensions = stackalloc nint[enabledExtensions.Length]; + + for (int i = 0; i < enabledExtensions.Length; i++) + { + ppEnabledExtensions[i] = Marshal.StringToHGlobalAnsi(enabledExtensions[i]); + } + + var deviceCreateInfo = new DeviceCreateInfo + { + SType = StructureType.DeviceCreateInfo, + PNext = pExtendedFeatures, + QueueCreateInfoCount = 1, + PQueueCreateInfos = &queueCreateInfo, + PpEnabledExtensionNames = (byte**)ppEnabledExtensions, + EnabledExtensionCount = (uint)enabledExtensions.Length, + PEnabledFeatures = &features, + }; + + api.CreateDevice(physicalDevice.PhysicalDevice, in deviceCreateInfo, null, out var device).ThrowOnError(); + + for (int i = 0; i < enabledExtensions.Length; i++) + { + Marshal.FreeHGlobal(ppEnabledExtensions[i]); + } + + return device; + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInstance.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInstance.cs new file mode 100644 index 000000000..860b197f0 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanInstance.cs @@ -0,0 +1,127 @@ +using Ryujinx.Common.Utilities; +using Silk.NET.Core; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class VulkanInstance : IDisposable + { + private readonly Vk _api; + public readonly Instance Instance; + public readonly Version32 InstanceVersion; + + private bool _disposed; + + private VulkanInstance(Vk api, Instance instance) + { + _api = api; + Instance = instance; + + if (api.GetInstanceProcAddr(instance, "vkEnumerateInstanceVersion") == nint.Zero) + { + InstanceVersion = Vk.Version10; + } + else + { + uint rawInstanceVersion = 0; + + if (api.EnumerateInstanceVersion(ref rawInstanceVersion) != Result.Success) + { + rawInstanceVersion = Vk.Version11.Value; + } + + InstanceVersion = (Version32)rawInstanceVersion; + } + } + + public static Result Create(Vk api, ref InstanceCreateInfo createInfo, out VulkanInstance instance) + { + instance = null; + + Instance rawInstance = default; + + Result result = api.CreateInstance(SpanHelpers.AsReadOnlySpan(ref createInfo), ReadOnlySpan.Empty, SpanHelpers.AsSpan(ref rawInstance)); + + if (result == Result.Success) + { + instance = new VulkanInstance(api, rawInstance); + } + + return result; + } + + public Result EnumeratePhysicalDevices(out VulkanPhysicalDevice[] physicalDevices) + { + physicalDevices = null; + + uint physicalDeviceCount = 0; + + Result result = _api.EnumeratePhysicalDevices(Instance, SpanHelpers.AsSpan(ref physicalDeviceCount), Span.Empty); + + if (result != Result.Success) + { + return result; + } + + PhysicalDevice[] rawPhysicalDevices = new PhysicalDevice[physicalDeviceCount]; + + result = _api.EnumeratePhysicalDevices(Instance, SpanHelpers.AsSpan(ref physicalDeviceCount), rawPhysicalDevices); + + if (result != Result.Success) + { + return result; + } + + physicalDevices = rawPhysicalDevices.Select(x => new VulkanPhysicalDevice(_api, x)).ToArray(); + + return Result.Success; + } + + public static IReadOnlySet GetInstanceExtensions(Vk api) + { + uint propertiesCount = 0; + + api.EnumerateInstanceExtensionProperties(ReadOnlySpan.Empty, SpanHelpers.AsSpan(ref propertiesCount), Span.Empty).ThrowOnError(); + + ExtensionProperties[] extensionProperties = new ExtensionProperties[propertiesCount]; + + api.EnumerateInstanceExtensionProperties(ReadOnlySpan.Empty, SpanHelpers.AsSpan(ref propertiesCount), extensionProperties).ThrowOnError(); + + unsafe + { + return extensionProperties.Select(x => Marshal.PtrToStringAnsi((nint)x.ExtensionName)).ToImmutableHashSet(); + } + } + + public static IReadOnlySet GetInstanceLayers(Vk api) + { + uint propertiesCount = 0; + + api.EnumerateInstanceLayerProperties(SpanHelpers.AsSpan(ref propertiesCount), Span.Empty).ThrowOnError(); + + LayerProperties[] layerProperties = new LayerProperties[propertiesCount]; + + api.EnumerateInstanceLayerProperties(SpanHelpers.AsSpan(ref propertiesCount), layerProperties).ThrowOnError(); + + unsafe + { + return layerProperties.Select(x => Marshal.PtrToStringAnsi((nint)x.LayerName)).ToImmutableHashSet(); + } + } + + public void Dispose() + { + if (!_disposed) + { + _api.DestroyInstance(Instance, ReadOnlySpan.Empty); + + _disposed = true; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanPhysicalDevice.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanPhysicalDevice.cs new file mode 100644 index 000000000..cc09147fa --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanPhysicalDevice.cs @@ -0,0 +1,97 @@ +using Ryujinx.Common.Utilities; +using Ryujinx.Graphics.GAL; +using Silk.NET.Vulkan; +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + readonly struct VulkanPhysicalDevice + { + public readonly PhysicalDevice PhysicalDevice; + public readonly PhysicalDeviceFeatures PhysicalDeviceFeatures; + public readonly PhysicalDeviceProperties PhysicalDeviceProperties; + public readonly PhysicalDeviceMemoryProperties PhysicalDeviceMemoryProperties; + public readonly QueueFamilyProperties[] QueueFamilyProperties; + public readonly string DeviceName; + public readonly IReadOnlySet DeviceExtensions; + + public VulkanPhysicalDevice(Vk api, PhysicalDevice physicalDevice) + { + PhysicalDevice = physicalDevice; + PhysicalDeviceFeatures = api.GetPhysicalDeviceFeature(PhysicalDevice); + + api.GetPhysicalDeviceProperties(PhysicalDevice, out var physicalDeviceProperties); + PhysicalDeviceProperties = physicalDeviceProperties; + + api.GetPhysicalDeviceMemoryProperties(PhysicalDevice, out PhysicalDeviceMemoryProperties); + + unsafe + { + DeviceName = Marshal.PtrToStringAnsi((nint)physicalDeviceProperties.DeviceName); + } + + uint propertiesCount = 0; + + api.GetPhysicalDeviceQueueFamilyProperties(physicalDevice, SpanHelpers.AsSpan(ref propertiesCount), Span.Empty); + + QueueFamilyProperties = new QueueFamilyProperties[propertiesCount]; + + api.GetPhysicalDeviceQueueFamilyProperties(physicalDevice, SpanHelpers.AsSpan(ref propertiesCount), QueueFamilyProperties); + + api.EnumerateDeviceExtensionProperties(PhysicalDevice, Span.Empty, SpanHelpers.AsSpan(ref propertiesCount), Span.Empty).ThrowOnError(); + + ExtensionProperties[] extensionProperties = new ExtensionProperties[propertiesCount]; + + api.EnumerateDeviceExtensionProperties(PhysicalDevice, Span.Empty, SpanHelpers.AsSpan(ref propertiesCount), extensionProperties).ThrowOnError(); + + unsafe + { + DeviceExtensions = extensionProperties.Select(x => Marshal.PtrToStringAnsi((nint)x.ExtensionName)).ToImmutableHashSet(); + } + } + + public string Id => $"0x{PhysicalDeviceProperties.VendorID:X}_0x{PhysicalDeviceProperties.DeviceID:X}"; + + public bool IsDeviceExtensionPresent(string extension) => DeviceExtensions.Contains(extension); + + public unsafe bool TryGetPhysicalDeviceDriverPropertiesKHR(Vk api, out PhysicalDeviceDriverPropertiesKHR res) + { + if (!IsDeviceExtensionPresent("VK_KHR_driver_properties")) + { + res = default; + + return false; + } + + PhysicalDeviceDriverPropertiesKHR physicalDeviceDriverProperties = new() + { + SType = StructureType.PhysicalDeviceDriverPropertiesKhr + }; + + PhysicalDeviceProperties2 physicalDeviceProperties2 = new() + { + SType = StructureType.PhysicalDeviceProperties2, + PNext = &physicalDeviceDriverProperties + }; + + api.GetPhysicalDeviceProperties2(PhysicalDevice, &physicalDeviceProperties2); + + res = physicalDeviceDriverProperties; + + return true; + } + + public DeviceInfo ToDeviceInfo() + { + return new DeviceInfo( + Id, + VendorUtils.GetNameFromId(PhysicalDeviceProperties.VendorID), + DeviceName, + PhysicalDeviceProperties.DeviceType == PhysicalDeviceType.DiscreteGpu); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanRenderer.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanRenderer.cs new file mode 100644 index 000000000..6ffe1f59b --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/VulkanRenderer.cs @@ -0,0 +1,1059 @@ +using Gommon; +using Ryujinx.Common.Configuration; +using Ryujinx.Common.Logging; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Ryujinx.Graphics.Rdna3Vulkan.Queries; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.EXT; +using Silk.NET.Vulkan.Extensions.KHR; +using System; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Threading; +using Format = Ryujinx.Graphics.GAL.Format; +using PrimitiveTopology = Ryujinx.Graphics.GAL.PrimitiveTopology; +using SamplerCreateInfo = Ryujinx.Graphics.GAL.SamplerCreateInfo; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + public sealed class VulkanRenderer : IRenderer + { + private VulkanInstance _instance; + private SurfaceKHR _surface; + private VulkanPhysicalDevice _physicalDevice; + private Device _device; + private WindowBase _window; + + private bool _initialized; + + public uint ProgramCount { get; set; } = 0; + + internal FormatCapabilities FormatCapabilities { get; private set; } + internal HardwareCapabilities Capabilities; + + internal Vk Api { get; private set; } + internal KhrSurface SurfaceApi { get; private set; } + internal KhrSwapchain SwapchainApi { get; private set; } + internal ExtConditionalRendering ConditionalRenderingApi { get; private set; } + internal ExtExtendedDynamicState ExtendedDynamicStateApi { get; private set; } + internal KhrPushDescriptor PushDescriptorApi { get; private set; } + internal ExtTransformFeedback TransformFeedbackApi { get; private set; } + internal KhrDrawIndirectCount DrawIndirectCountApi { get; private set; } + internal ExtAttachmentFeedbackLoopDynamicState DynamicFeedbackLoopApi { get; private set; } + + internal uint QueueFamilyIndex { get; private set; } + internal Queue Queue { get; private set; } + internal Queue BackgroundQueue { get; private set; } + internal Lock BackgroundQueueLock { get; private set; } + internal Lock QueueLock { get; private set; } + + internal MemoryAllocator MemoryAllocator { get; private set; } + internal HostMemoryAllocator HostMemoryAllocator { get; private set; } + internal CommandBufferPool CommandBufferPool { get; private set; } + internal PipelineLayoutCache PipelineLayoutCache { get; private set; } + internal BackgroundResources BackgroundResources { get; private set; } + internal Action InterruptAction { get; private set; } + internal SyncManager SyncManager { get; private set; } + + internal BufferManager BufferManager { get; private set; } + + internal HashSet Shaders { get; } + internal HashSet Textures { get; } + internal HashSet Samplers { get; } + + private VulkanDebugMessenger _debugMessenger; + private Counters _counters; + + private PipelineFull _pipeline; + + internal HelperShader HelperShader { get; private set; } + internal PipelineFull PipelineInternal => _pipeline; + + internal BarrierBatch Barriers { get; private set; } + + public IPipeline Pipeline => _pipeline; + + public IWindow Window => _window; + + private readonly Func _getSurface; + private readonly Func _getRequiredExtensions; + private readonly string _preferredGpuId; + + private int[] _pdReservedBindings; + private readonly static int[] _pdReservedBindingsNvn = { 3, 18, 21, 36, 30 }; + private readonly static int[] _pdReservedBindingsOgl = { 17, 18, 34, 35, 36 }; + + internal Vendor Vendor { get; private set; } + internal bool IsAmdWindows { get; private set; } + internal bool IsIntelWindows { get; private set; } + internal bool IsAmdGcn { get; private set; } + internal bool IsNvidiaPreTuring { get; private set; } + internal bool IsIntelArc { get; private set; } + internal bool IsQualcommProprietary { get; private set; } + internal bool IsMoltenVk { get; private set; } + internal bool IsTBDR { get; private set; } + internal bool IsSharedMemory { get; private set; } + + public string GpuVendor { get; private set; } + public string GpuDriver { get; private set; } + public string GpuRenderer { get; private set; } + public string GpuVersion { get; private set; } + + public bool PreferThreading => true; + + public event EventHandler ScreenCaptured; + + public VulkanRenderer(Vk api, Func getSurface, Func requiredExtensionsFunc, string preferredGpuId) + { + _getSurface = getSurface; + _getRequiredExtensions = requiredExtensionsFunc; + _preferredGpuId = preferredGpuId; + Api = api; + Shaders = []; + Textures = []; + Samplers = []; + } + + public static VulkanRenderer Create( + string preferredGpuId, + Func getSurface, + Func getRequiredExtensions + ) => new(Vk.GetApi(), getSurface, getRequiredExtensions, preferredGpuId); + + private unsafe void LoadFeatures(uint maxQueueCount, uint queueFamilyIndex) + { + FormatCapabilities = new FormatCapabilities(Api, _physicalDevice.PhysicalDevice); + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out ExtConditionalRendering conditionalRenderingApi)) + { + ConditionalRenderingApi = conditionalRenderingApi; + } + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out ExtExtendedDynamicState extendedDynamicStateApi)) + { + ExtendedDynamicStateApi = extendedDynamicStateApi; + } + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out KhrPushDescriptor pushDescriptorApi)) + { + PushDescriptorApi = pushDescriptorApi; + } + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out ExtTransformFeedback transformFeedbackApi)) + { + TransformFeedbackApi = transformFeedbackApi; + } + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out KhrDrawIndirectCount drawIndirectCountApi)) + { + DrawIndirectCountApi = drawIndirectCountApi; + } + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out ExtAttachmentFeedbackLoopDynamicState dynamicFeedbackLoopApi)) + { + DynamicFeedbackLoopApi = dynamicFeedbackLoopApi; + } + + if (maxQueueCount >= 2) + { + Api.GetDeviceQueue(_device, queueFamilyIndex, 1, out var backgroundQueue); + BackgroundQueue = backgroundQueue; + BackgroundQueueLock = new(); + } + + PhysicalDeviceProperties2 properties2 = new() + { + SType = StructureType.PhysicalDeviceProperties2, + }; + + PhysicalDeviceSubgroupProperties propertiesSubgroup = new() + { + SType = StructureType.PhysicalDeviceSubgroupProperties, + PNext = properties2.PNext, + }; + + properties2.PNext = &propertiesSubgroup; + + PhysicalDeviceBlendOperationAdvancedPropertiesEXT propertiesBlendOperationAdvanced = new() + { + SType = StructureType.PhysicalDeviceBlendOperationAdvancedPropertiesExt, + }; + + bool supportsBlendOperationAdvanced = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_blend_operation_advanced"); + + if (supportsBlendOperationAdvanced) + { + propertiesBlendOperationAdvanced.PNext = properties2.PNext; + properties2.PNext = &propertiesBlendOperationAdvanced; + } + + bool supportsTransformFeedback = _physicalDevice.IsDeviceExtensionPresent(ExtTransformFeedback.ExtensionName); + + PhysicalDeviceTransformFeedbackPropertiesEXT propertiesTransformFeedback = new() + { + SType = StructureType.PhysicalDeviceTransformFeedbackPropertiesExt, + }; + + if (supportsTransformFeedback) + { + propertiesTransformFeedback.PNext = properties2.PNext; + properties2.PNext = &propertiesTransformFeedback; + } + + PhysicalDevicePortabilitySubsetPropertiesKHR propertiesPortabilitySubset = new() + { + SType = StructureType.PhysicalDevicePortabilitySubsetPropertiesKhr, + }; + + bool supportsPushDescriptors = _physicalDevice.IsDeviceExtensionPresent(KhrPushDescriptor.ExtensionName); + + PhysicalDevicePushDescriptorPropertiesKHR propertiesPushDescriptor = new PhysicalDevicePushDescriptorPropertiesKHR() + { + SType = StructureType.PhysicalDevicePushDescriptorPropertiesKhr + }; + + if (supportsPushDescriptors) + { + propertiesPushDescriptor.PNext = properties2.PNext; + properties2.PNext = &propertiesPushDescriptor; + } + + PhysicalDeviceFeatures2 features2 = new() + { + SType = StructureType.PhysicalDeviceFeatures2, + }; + + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT featuresPrimitiveTopologyListRestart = new() + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt, + }; + + PhysicalDeviceRobustness2FeaturesEXT featuresRobustness2 = new() + { + SType = StructureType.PhysicalDeviceRobustness2FeaturesExt, + }; + + PhysicalDeviceShaderFloat16Int8FeaturesKHR featuresShaderInt8 = new() + { + SType = StructureType.PhysicalDeviceShaderFloat16Int8Features, + }; + + PhysicalDeviceCustomBorderColorFeaturesEXT featuresCustomBorderColor = new() + { + SType = StructureType.PhysicalDeviceCustomBorderColorFeaturesExt, + }; + + PhysicalDeviceDepthClipControlFeaturesEXT featuresDepthClipControl = new() + { + SType = StructureType.PhysicalDeviceDepthClipControlFeaturesExt, + }; + + PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT featuresAttachmentFeedbackLoop = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesExt, + }; + + PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT featuresDynamicAttachmentFeedbackLoop = new() + { + SType = StructureType.PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesExt, + }; + + PhysicalDevicePortabilitySubsetFeaturesKHR featuresPortabilitySubset = new() + { + SType = StructureType.PhysicalDevicePortabilitySubsetFeaturesKhr, + }; + + if (_physicalDevice.IsDeviceExtensionPresent("VK_EXT_primitive_topology_list_restart")) + { + features2.PNext = &featuresPrimitiveTopologyListRestart; + } + + if (_physicalDevice.IsDeviceExtensionPresent("VK_EXT_robustness2")) + { + featuresRobustness2.PNext = features2.PNext; + features2.PNext = &featuresRobustness2; + } + + if (_physicalDevice.IsDeviceExtensionPresent("VK_KHR_shader_float16_int8")) + { + featuresShaderInt8.PNext = features2.PNext; + features2.PNext = &featuresShaderInt8; + } + + if (_physicalDevice.IsDeviceExtensionPresent("VK_EXT_custom_border_color")) + { + featuresCustomBorderColor.PNext = features2.PNext; + features2.PNext = &featuresCustomBorderColor; + } + + bool supportsDepthClipControl = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_depth_clip_control"); + + if (supportsDepthClipControl) + { + featuresDepthClipControl.PNext = features2.PNext; + features2.PNext = &featuresDepthClipControl; + } + + bool supportsAttachmentFeedbackLoop = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_layout"); + + if (supportsAttachmentFeedbackLoop) + { + featuresAttachmentFeedbackLoop.PNext = features2.PNext; + features2.PNext = &featuresAttachmentFeedbackLoop; + } + + bool supportsDynamicAttachmentFeedbackLoop = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_attachment_feedback_loop_dynamic_state"); + + if (supportsDynamicAttachmentFeedbackLoop) + { + featuresDynamicAttachmentFeedbackLoop.PNext = features2.PNext; + features2.PNext = &featuresDynamicAttachmentFeedbackLoop; + } + + bool usePortability = _physicalDevice.IsDeviceExtensionPresent("VK_KHR_portability_subset"); + + if (usePortability) + { + propertiesPortabilitySubset.PNext = properties2.PNext; + properties2.PNext = &propertiesPortabilitySubset; + + featuresPortabilitySubset.PNext = features2.PNext; + features2.PNext = &featuresPortabilitySubset; + } + + Api.GetPhysicalDeviceProperties2(_physicalDevice.PhysicalDevice, &properties2); + Api.GetPhysicalDeviceFeatures2(_physicalDevice.PhysicalDevice, &features2); + + var portabilityFlags = PortabilitySubsetFlags.None; + uint vertexBufferAlignment = 1; + + if (usePortability) + { + vertexBufferAlignment = propertiesPortabilitySubset.MinVertexInputBindingStrideAlignment; + + portabilityFlags |= featuresPortabilitySubset.TriangleFans ? 0 : PortabilitySubsetFlags.NoTriangleFans; + portabilityFlags |= featuresPortabilitySubset.PointPolygons ? 0 : PortabilitySubsetFlags.NoPointMode; + portabilityFlags |= featuresPortabilitySubset.ImageView2DOn3DImage ? 0 : PortabilitySubsetFlags.No3DImageView; + portabilityFlags |= featuresPortabilitySubset.SamplerMipLodBias ? 0 : PortabilitySubsetFlags.NoLodBias; + } + + bool supportsCustomBorderColor = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_custom_border_color") && + featuresCustomBorderColor.CustomBorderColors && + featuresCustomBorderColor.CustomBorderColorWithoutFormat; + + ref var properties = ref properties2.Properties; + + var hasDriverProperties = _physicalDevice.TryGetPhysicalDeviceDriverPropertiesKHR(Api, out var driverProperties); + + Vendor = VendorUtils.FromId(properties.VendorID); + + IsAmdWindows = Vendor == Vendor.Amd && OperatingSystem.IsWindows(); + IsIntelWindows = Vendor == Vendor.Intel && OperatingSystem.IsWindows(); + IsTBDR = + Vendor == Vendor.Apple || + Vendor == Vendor.Qualcomm || + Vendor == Vendor.ARM || + Vendor == Vendor.Broadcom || + Vendor == Vendor.ImgTec; + + GpuVendor = VendorUtils.GetNameFromId(properties.VendorID); + GpuDriver = hasDriverProperties && !OperatingSystem.IsMacOS() ? + VendorUtils.GetFriendlyDriverName(driverProperties.DriverID) : GpuVendor; // Fallback to vendor name if driver is unavailable or on MacOS where vendor is preferred. + + fixed (byte* deviceName = properties.DeviceName) + { + GpuRenderer = Marshal.PtrToStringAnsi((nint)deviceName); + } + + GpuVersion = $"Vulkan v{ParseStandardVulkanVersion(properties.ApiVersion)}, Driver v{ParseDriverVersion(ref properties)}"; + + IsAmdGcn = !IsMoltenVk && Vendor == Vendor.Amd && VendorUtils.AmdGcnRegex().IsMatch(GpuRenderer); + + if (Vendor == Vendor.Nvidia) + { + var match = VendorUtils.NvidiaConsumerClassRegex().Match(GpuRenderer); + + if (match != null && int.TryParse(match.Groups[2].Value, out int gpuNumber)) + { + IsNvidiaPreTuring = gpuNumber < 2000; + } + else if (GpuRenderer.Contains("TITAN") && !GpuRenderer.Contains("RTX")) + { + IsNvidiaPreTuring = true; + } + } + else if (Vendor == Vendor.Intel) + { + IsIntelArc = GpuRenderer.StartsWith("Intel(R) Arc(TM)"); + } + + IsQualcommProprietary = hasDriverProperties && driverProperties.DriverID == DriverId.QualcommProprietary; + + ulong minResourceAlignment = Math.Max( + Math.Max( + properties.Limits.MinStorageBufferOffsetAlignment, + properties.Limits.MinUniformBufferOffsetAlignment), + properties.Limits.MinTexelBufferOffsetAlignment + ); + + SampleCountFlags supportedSampleCounts = + properties.Limits.FramebufferColorSampleCounts & + properties.Limits.FramebufferDepthSampleCounts & + properties.Limits.FramebufferStencilSampleCounts; + + Capabilities = new HardwareCapabilities( + _physicalDevice.IsDeviceExtensionPresent("VK_EXT_index_type_uint8"), + supportsCustomBorderColor, + supportsBlendOperationAdvanced, + propertiesBlendOperationAdvanced.AdvancedBlendCorrelatedOverlap, + propertiesBlendOperationAdvanced.AdvancedBlendNonPremultipliedSrcColor, + propertiesBlendOperationAdvanced.AdvancedBlendNonPremultipliedDstColor, + _physicalDevice.IsDeviceExtensionPresent(KhrDrawIndirectCount.ExtensionName), + _physicalDevice.IsDeviceExtensionPresent("VK_EXT_fragment_shader_interlock"), + _physicalDevice.IsDeviceExtensionPresent("VK_NV_geometry_shader_passthrough"), + features2.Features.ShaderFloat64, + featuresShaderInt8.ShaderInt8, + _physicalDevice.IsDeviceExtensionPresent("VK_EXT_shader_stencil_export"), + features2.Features.ShaderStorageImageMultisample, + _physicalDevice.IsDeviceExtensionPresent(ExtConditionalRendering.ExtensionName), + _physicalDevice.IsDeviceExtensionPresent(ExtExtendedDynamicState.ExtensionName), + features2.Features.MultiViewport && !(IsMoltenVk && Vendor == Vendor.Amd), // Workaround for AMD on MoltenVK issue + featuresRobustness2.NullDescriptor || IsMoltenVk, + supportsPushDescriptors && !IsMoltenVk, + propertiesPushDescriptor.MaxPushDescriptors, + featuresPrimitiveTopologyListRestart.PrimitiveTopologyListRestart, + featuresPrimitiveTopologyListRestart.PrimitiveTopologyPatchListRestart, + supportsTransformFeedback, + propertiesTransformFeedback.TransformFeedbackQueries, + features2.Features.OcclusionQueryPrecise, + _physicalDevice.PhysicalDeviceFeatures.PipelineStatisticsQuery, + _physicalDevice.PhysicalDeviceFeatures.GeometryShader, + _physicalDevice.PhysicalDeviceFeatures.TessellationShader, + _physicalDevice.IsDeviceExtensionPresent("VK_NV_viewport_array2"), + _physicalDevice.IsDeviceExtensionPresent(ExtExternalMemoryHost.ExtensionName), + supportsDepthClipControl && featuresDepthClipControl.DepthClipControl, + supportsAttachmentFeedbackLoop && featuresAttachmentFeedbackLoop.AttachmentFeedbackLoopLayout, + supportsDynamicAttachmentFeedbackLoop && featuresDynamicAttachmentFeedbackLoop.AttachmentFeedbackLoopDynamicState, + propertiesSubgroup.SubgroupSize, + supportedSampleCounts, + portabilityFlags, + vertexBufferAlignment, + properties.Limits.SubTexelPrecisionBits, + minResourceAlignment); + + IsSharedMemory = MemoryAllocator.IsDeviceMemoryShared(_physicalDevice); + + MemoryAllocator = new MemoryAllocator(Api, _physicalDevice, _device); + + Api.TryGetDeviceExtension(_instance.Instance, _device, out ExtExternalMemoryHost hostMemoryApi); + HostMemoryAllocator = new HostMemoryAllocator(MemoryAllocator, Api, hostMemoryApi, _device); + + CommandBufferPool = new CommandBufferPool(Api, _device, Queue, QueueLock, queueFamilyIndex, IsQualcommProprietary); + + PipelineLayoutCache = new PipelineLayoutCache(); + + BackgroundResources = new BackgroundResources(this, _device); + + BufferManager = new BufferManager(this, _device); + + SyncManager = new SyncManager(this, _device); + _pipeline = new PipelineFull(this, _device); + _pipeline.Initialize(); + + HelperShader = new HelperShader(this, _device); + + Barriers = new BarrierBatch(this); + + _counters = new Counters(this, _device, _pipeline); + } + + private void SetupContext(GraphicsDebugLevel logLevel) + { + _instance = VulkanInitialization.CreateInstance(Api, logLevel, _getRequiredExtensions()); + _debugMessenger = new VulkanDebugMessenger(Api, _instance.Instance, logLevel); + + if (Api.TryGetInstanceExtension(_instance.Instance, out KhrSurface surfaceApi)) + { + SurfaceApi = surfaceApi; + } + + _surface = _getSurface(_instance.Instance, Api); + _physicalDevice = VulkanInitialization.FindSuitablePhysicalDevice(Api, _instance, _surface, _preferredGpuId); + + var queueFamilyIndex = VulkanInitialization.FindSuitableQueueFamily(Api, _physicalDevice, _surface, out uint maxQueueCount); + + _device = VulkanInitialization.CreateDevice(Api, _physicalDevice, queueFamilyIndex, maxQueueCount); + + if (Api.TryGetDeviceExtension(_instance.Instance, _device, out KhrSwapchain swapchainApi)) + { + SwapchainApi = swapchainApi; + } + + Api.GetDeviceQueue(_device, queueFamilyIndex, 0, out var queue); + Queue = queue; + QueueLock = new(); + + LoadFeatures(maxQueueCount, queueFamilyIndex); + + QueueFamilyIndex = queueFamilyIndex; + + _window = new Window(this, _surface, _physicalDevice.PhysicalDevice, _device); + + _initialized = true; + } + + internal int[] GetPushDescriptorReservedBindings(bool isOgl) + { + // The first call of this method determines what push descriptor layout is used for all shaders on this renderer. + // This is chosen to minimize shaders that can't fit their uniforms on the device's max number of push descriptors. + if (_pdReservedBindings == null) + { + if (Capabilities.MaxPushDescriptors <= Constants.MaxUniformBuffersPerStage * 2) + { + _pdReservedBindings = isOgl ? _pdReservedBindingsOgl : _pdReservedBindingsNvn; + } + else + { + _pdReservedBindings = Array.Empty(); + } + } + + return _pdReservedBindings; + } + + public BufferHandle CreateBuffer(int size, BufferAccess access) + { + return BufferManager.CreateWithHandle(this, size, access.HasFlag(BufferAccess.SparseCompatible), access.Convert(), access.HasFlag(BufferAccess.Stream)); + } + + public BufferHandle CreateBuffer(nint pointer, int size) + { + return BufferManager.CreateHostImported(this, pointer, size); + } + + public BufferHandle CreateBufferSparse(ReadOnlySpan storageBuffers) + { + return BufferManager.CreateSparse(this, storageBuffers); + } + + public IImageArray CreateImageArray(int size, bool isBuffer) + { + return new ImageArray(this, size, isBuffer); + } + + public IProgram CreateProgram(ShaderSource[] sources, ShaderInfo info) + { + ProgramCount++; + + bool isCompute = sources.Length == 1 && sources[0].Stage == ShaderStage.Compute; + + if (info.State.HasValue || isCompute) + { + return new ShaderCollection(this, _device, sources, info.ResourceLayout, info.State ?? default, info.FromCache); + } + + return new ShaderCollection(this, _device, sources, info.ResourceLayout); + } + + internal ShaderCollection CreateProgramWithMinimalLayout(ShaderSource[] sources, ResourceLayout resourceLayout, SpecDescription[] specDescription = null) + { + return new ShaderCollection(this, _device, sources, resourceLayout, specDescription, isMinimal: true); + } + + public ISampler CreateSampler(SamplerCreateInfo info) + { + return new SamplerHolder(this, _device, info); + } + + public ITexture CreateTexture(TextureCreateInfo info) + { + if (info.Target == Target.TextureBuffer) + { + return new TextureBuffer(this, info); + } + + return CreateTextureView(info); + } + + public ITextureArray CreateTextureArray(int size, bool isBuffer) + { + return new TextureArray(this, size, isBuffer); + } + + internal TextureView CreateTextureView(TextureCreateInfo info) + { + // This should be disposed when all views are destroyed. + var storage = CreateTextureStorage(info); + return storage.CreateView(info, 0, 0); + } + + internal TextureStorage CreateTextureStorage(TextureCreateInfo info) + { + return new TextureStorage(this, _device, info); + } + + public void DeleteBuffer(BufferHandle buffer) + { + BufferManager.Delete(buffer); + } + + internal void FlushAllCommands() + { + _pipeline?.FlushCommandsImpl(); + } + + internal void RegisterFlush() + { + SyncManager.RegisterFlush(); + + // Periodically free unused regions of the staging buffer to avoid doing it all at once. + BufferManager.StagingBuffer.FreeCompleted(); + } + + public PinnedSpan GetBufferData(BufferHandle buffer, int offset, int size) + { + return BufferManager.GetData(buffer, offset, size); + } + + public unsafe Capabilities GetCapabilities() + { + FormatFeatureFlags compressedFormatFeatureFlags = + FormatFeatureFlags.SampledImageBit | + FormatFeatureFlags.SampledImageFilterLinearBit | + FormatFeatureFlags.BlitSrcBit | + FormatFeatureFlags.TransferSrcBit | + FormatFeatureFlags.TransferDstBit; + + bool supportsBc123CompressionFormat = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.Bc1RgbaSrgb, + Format.Bc1RgbaUnorm, + Format.Bc2Srgb, + Format.Bc2Unorm, + Format.Bc3Srgb, + Format.Bc3Unorm); + + bool supportsBc45CompressionFormat = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.Bc4Snorm, + Format.Bc4Unorm, + Format.Bc5Snorm, + Format.Bc5Unorm); + + bool supportsBc67CompressionFormat = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.Bc6HSfloat, + Format.Bc6HUfloat, + Format.Bc7Srgb, + Format.Bc7Unorm); + + bool supportsEtc2CompressionFormat = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.Etc2RgbaSrgb, + Format.Etc2RgbaUnorm, + Format.Etc2RgbPtaSrgb, + Format.Etc2RgbPtaUnorm, + Format.Etc2RgbSrgb, + Format.Etc2RgbUnorm); + + bool supports5BitComponentFormat = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.R5G6B5Unorm, + Format.R5G5B5A1Unorm, + Format.R5G5B5X1Unorm, + Format.B5G6R5Unorm, + Format.B5G5R5A1Unorm, + Format.A1B5G5R5Unorm); + + bool supportsR4G4B4A4Format = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.R4G4B4A4Unorm); + + bool supportsAstcFormats = FormatCapabilities.OptimalFormatsSupport(compressedFormatFeatureFlags, + Format.Astc4x4Unorm, + Format.Astc5x4Unorm, + Format.Astc5x5Unorm, + Format.Astc6x5Unorm, + Format.Astc6x6Unorm, + Format.Astc8x5Unorm, + Format.Astc8x6Unorm, + Format.Astc8x8Unorm, + Format.Astc10x5Unorm, + Format.Astc10x6Unorm, + Format.Astc10x8Unorm, + Format.Astc10x10Unorm, + Format.Astc12x10Unorm, + Format.Astc12x12Unorm, + Format.Astc4x4Srgb, + Format.Astc5x4Srgb, + Format.Astc5x5Srgb, + Format.Astc6x5Srgb, + Format.Astc6x6Srgb, + Format.Astc8x5Srgb, + Format.Astc8x6Srgb, + Format.Astc8x8Srgb, + Format.Astc10x5Srgb, + Format.Astc10x6Srgb, + Format.Astc10x8Srgb, + Format.Astc10x10Srgb, + Format.Astc12x10Srgb, + Format.Astc12x12Srgb); + + PhysicalDeviceVulkan12Features featuresVk12 = new() + { + SType = StructureType.PhysicalDeviceVulkan12Features, + }; + + PhysicalDeviceFeatures2 features2 = new() + { + SType = StructureType.PhysicalDeviceFeatures2, + PNext = &featuresVk12, + }; + + Api.GetPhysicalDeviceFeatures2(_physicalDevice.PhysicalDevice, &features2); + + var limits = _physicalDevice.PhysicalDeviceProperties.Limits; + var mainQueueProperties = _physicalDevice.QueueFamilyProperties[QueueFamilyIndex]; + + SystemMemoryType memoryType; + + if (IsSharedMemory) + { + memoryType = SystemMemoryType.UnifiedMemory; + } + else + { + memoryType = Vendor == Vendor.Nvidia ? + SystemMemoryType.DedicatedMemorySlowStorage : + SystemMemoryType.DedicatedMemory; + } + + return new Capabilities( + api: TargetApi.Vulkan, + GpuVendor, + memoryType: memoryType, + hasFrontFacingBug: IsIntelWindows, + hasVectorIndexingBug: IsQualcommProprietary, + needsFragmentOutputSpecialization: IsMoltenVk, + reduceShaderPrecision: IsMoltenVk, + supportsAstcCompression: features2.Features.TextureCompressionAstcLdr && supportsAstcFormats, + supportsBc123Compression: supportsBc123CompressionFormat, + supportsBc45Compression: supportsBc45CompressionFormat, + supportsBc67Compression: supportsBc67CompressionFormat, + supportsEtc2Compression: supportsEtc2CompressionFormat, + supports3DTextureCompression: true, + supportsBgraFormat: true, + supportsR4G4Format: false, + supportsR4G4B4A4Format: supportsR4G4B4A4Format, + supportsScaledVertexFormats: FormatCapabilities.SupportsScaledVertexFormats(), + supportsSnormBufferTextureFormat: true, + supports5BitComponentFormat: supports5BitComponentFormat, + supportsSparseBuffer: features2.Features.SparseBinding && mainQueueProperties.QueueFlags.HasFlag(QueueFlags.SparseBindingBit), + supportsBlendEquationAdvanced: Capabilities.SupportsBlendEquationAdvanced, + supportsFragmentShaderInterlock: Capabilities.SupportsFragmentShaderInterlock, + supportsFragmentShaderOrderingIntel: false, + supportsGeometryShader: Capabilities.SupportsGeometryShader, + supportsGeometryShaderPassthrough: Capabilities.SupportsGeometryShaderPassthrough, + supportsTransformFeedback: Capabilities.SupportsTransformFeedback, + supportsImageLoadFormatted: features2.Features.ShaderStorageImageReadWithoutFormat, + supportsLayerVertexTessellation: featuresVk12.ShaderOutputLayer, + supportsMismatchingViewFormat: true, + supportsCubemapView: !IsAmdGcn, + supportsNonConstantTextureOffset: false, + supportsQuads: false, + supportsSeparateSampler: true, + supportsShaderBallot: false, + supportsShaderBarrierDivergence: Vendor != Vendor.Intel, + supportsShaderFloat64: Capabilities.SupportsShaderFloat64, + supportsTextureGatherOffsets: features2.Features.ShaderImageGatherExtended && !IsMoltenVk, + supportsTextureShadowLod: false, + supportsVertexStoreAndAtomics: features2.Features.VertexPipelineStoresAndAtomics, + supportsViewportIndexVertexTessellation: featuresVk12.ShaderOutputViewportIndex, + supportsViewportMask: Capabilities.SupportsViewportArray2, + supportsViewportSwizzle: false, + supportsIndirectParameters: true, + supportsDepthClipControl: Capabilities.SupportsDepthClipControl, + uniformBufferSetIndex: PipelineBase.UniformSetIndex, + storageBufferSetIndex: PipelineBase.StorageSetIndex, + textureSetIndex: PipelineBase.TextureSetIndex, + imageSetIndex: PipelineBase.ImageSetIndex, + extraSetBaseIndex: PipelineBase.DescriptorSetLayouts, + maximumExtraSets: Math.Max(0, (int)limits.MaxBoundDescriptorSets - PipelineBase.DescriptorSetLayouts), + maximumUniformBuffersPerStage: Constants.MaxUniformBuffersPerStage, + maximumStorageBuffersPerStage: Constants.MaxStorageBuffersPerStage, + maximumTexturesPerStage: Constants.MaxTexturesPerStage, + maximumImagesPerStage: Constants.MaxImagesPerStage, + maximumComputeSharedMemorySize: (int)limits.MaxComputeSharedMemorySize, + maximumSupportedAnisotropy: (int)limits.MaxSamplerAnisotropy, + shaderSubgroupSize: (int)Capabilities.SubgroupSize, + storageBufferOffsetAlignment: (int)limits.MinStorageBufferOffsetAlignment, + textureBufferOffsetAlignment: (int)limits.MinTexelBufferOffsetAlignment, + gatherBiasPrecision: IsIntelWindows || IsAmdWindows ? (int)Capabilities.SubTexelPrecisionBits : 0, + maximumGpuMemory: GetTotalGPUMemory()); + } + + private ulong GetTotalGPUMemory() + { + ulong totalMemory = 0; + + Api.GetPhysicalDeviceMemoryProperties(_physicalDevice.PhysicalDevice, out PhysicalDeviceMemoryProperties memoryProperties); + + for (int i = 0; i < memoryProperties.MemoryHeapCount; i++) + { + var heap = memoryProperties.MemoryHeaps[i]; + if ((heap.Flags & MemoryHeapFlags.DeviceLocalBit) == MemoryHeapFlags.DeviceLocalBit) + { + totalMemory += heap.Size; + } + } + + return totalMemory; + } + + public HardwareInfo GetHardwareInfo() + { + return new HardwareInfo(GpuVendor, GpuRenderer, GpuDriver); + } + + /// + /// Gets the available Vulkan devices using the default Vulkan API + /// object returned by + /// + /// + public static DeviceInfo[] GetPhysicalDevices() + { + try + { + return VulkanInitialization.GetSuitablePhysicalDevices(Vk.GetApi()); + } + catch (Exception ex) + { + Logger.Error?.PrintMsg(LogClass.Gpu, $"Error querying Vulkan devices: {ex.Message}"); + + return Array.Empty(); + } + } + + public static DeviceInfo[] GetPhysicalDevices(Vk api) + { + try + { + return VulkanInitialization.GetSuitablePhysicalDevices(api); + } + catch (Exception) + { + // If we got an exception here, Vulkan is most likely not supported. + return Array.Empty(); + } + } + + private static string ParseStandardVulkanVersion(uint version) + { + return $"{version >> 22}.{(version >> 12) & 0x3FF}.{version & 0xFFF}"; + } + + private static string ParseDriverVersion(ref PhysicalDeviceProperties properties) + { + uint driverVersionRaw = properties.DriverVersion; + + // NVIDIA differ from the standard here and uses a different format. + if (properties.VendorID == 0x10DE) + { + return $"{(driverVersionRaw >> 22) & 0x3FF}.{(driverVersionRaw >> 14) & 0xFF}.{(driverVersionRaw >> 6) & 0xFF}.{driverVersionRaw & 0x3F}"; + } + + return ParseStandardVulkanVersion(driverVersionRaw); + } + + internal PrimitiveTopology TopologyRemap(PrimitiveTopology topology) + { + return topology switch + { + PrimitiveTopology.Quads => PrimitiveTopology.Triangles, + PrimitiveTopology.QuadStrip => PrimitiveTopology.TriangleStrip, + PrimitiveTopology.TriangleFan or PrimitiveTopology.Polygon => Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.NoTriangleFans) + ? PrimitiveTopology.Triangles + : topology, + _ => topology, + }; + } + + internal bool TopologyUnsupported(PrimitiveTopology topology) + { + return topology switch + { + PrimitiveTopology.Quads => true, + PrimitiveTopology.TriangleFan or PrimitiveTopology.Polygon => Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.NoTriangleFans), + _ => false, + }; + } + + private void PrintGpuInformation() + { + string gpuInfoMessage = $"{GpuRenderer} ({GpuVersion})"; + if (!GpuRenderer.StartsWithIgnoreCase(GpuVendor)) + gpuInfoMessage = gpuInfoMessage.Prepend(GpuVendor); + + Logger.Notice.Print(LogClass.Gpu, gpuInfoMessage); + + Logger.Notice.Print(LogClass.Gpu, $"GPU Memory: {GetTotalGPUMemory() / (1024 * 1024)} MiB"); + } + + public void Initialize(GraphicsDebugLevel logLevel) + { + SetupContext(logLevel); + + PrintGpuInformation(); + } + + internal bool NeedsVertexBufferAlignment(int attrScalarAlignment, out int alignment) + { + if (Capabilities.VertexBufferAlignment > 1) + { + alignment = (int)Capabilities.VertexBufferAlignment; + + return true; + } + else if (Vendor != Vendor.Nvidia) + { + // Vulkan requires that vertex attributes are globally aligned by their component size, + // so buffer strides that don't divide by the largest scalar element are invalid. + // Guest applications do this, NVIDIA GPUs are OK with it, others are not. + + alignment = attrScalarAlignment; + + return true; + } + + alignment = 1; + + return false; + } + + public void PreFrame() + { + SyncManager.Cleanup(); + } + + public ICounterEvent ReportCounter(CounterType type, EventHandler resultHandler, float divisor, bool hostReserved) + { + return _counters.QueueReport(type, resultHandler, divisor, hostReserved); + } + + public void ResetCounter(CounterType type) + { + _counters.QueueReset(type); + } + + public void SetBufferData(BufferHandle buffer, int offset, ReadOnlySpan data) + { + BufferManager.SetData(buffer, offset, data, _pipeline.CurrentCommandBuffer, _pipeline.EndRenderPassDelegate); + } + + public void UpdateCounters() + { + _counters.Update(); + } + + public void ResetCounterPool() + { + _counters.ResetCounterPool(); + } + + public void ResetFutureCounters(CommandBuffer cmd, int count) + { + _counters?.ResetFutureCounters(cmd, count); + } + + public void BackgroundContextAction(Action action, bool alwaysBackground = false) + { + action(); + } + + public void CreateSync(ulong id, bool strict) + { + SyncManager.Create(id, strict); + } + + public IProgram LoadProgramBinary(byte[] programBinary, bool isFragment, ShaderInfo info) + { + throw new NotImplementedException(); + } + + public void WaitSync(ulong id) + { + SyncManager.Wait(id); + } + + public ulong GetCurrentSync() + { + return SyncManager.GetCurrent(); + } + + public void SetInterruptAction(Action interruptAction) + { + InterruptAction = interruptAction; + } + + public void Screenshot() + { + _window.ScreenCaptureRequested = true; + } + + public void OnScreenCaptured(ScreenCaptureImageInfo bitmap) + { + ScreenCaptured?.Invoke(this, bitmap); + } + + public bool SupportsRenderPassBarrier(PipelineStageFlags flags) + { + return !(IsMoltenVk || IsQualcommProprietary); + } + + public unsafe void Dispose() + { + if (!_initialized) + { + return; + } + + CommandBufferPool.Dispose(); + BackgroundResources.Dispose(); + _counters.Dispose(); + _window.Dispose(); + HelperShader.Dispose(); + _pipeline.Dispose(); + BufferManager.Dispose(); + PipelineLayoutCache.Dispose(); + Barriers.Dispose(); + + MemoryAllocator.Dispose(); + + foreach (var shader in Shaders) + { + shader.Dispose(); + } + + foreach (var texture in Textures) + { + texture.Release(); + } + + foreach (var sampler in Samplers) + { + sampler.Dispose(); + } + + SurfaceApi.DestroySurface(_instance.Instance, _surface, null); + + Api.DestroyDevice(_device, null); + + _debugMessenger.Dispose(); + + // Last step destroy the instance + _instance.Dispose(); + } + + public bool PrepareHostMapping(nint address, ulong size) + { + return Capabilities.SupportsHostImportedMemory && + HostMemoryAllocator.TryImport(BufferManager.HostImportedBufferMemoryRequirements, BufferManager.DefaultBufferMemoryFlags, address, size); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/Window.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/Window.cs new file mode 100644 index 000000000..f3fe10b90 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/Window.cs @@ -0,0 +1,679 @@ +using Ryujinx.Common.Configuration; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Rdna3Vulkan.Effects; +using Silk.NET.Vulkan; +using Silk.NET.Vulkan.Extensions.KHR; +using System; +using System.Linq; +using VkFormat = Silk.NET.Vulkan.Format; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + class Window : WindowBase, IDisposable + { + private const int SurfaceWidth = 1280; + private const int SurfaceHeight = 720; + + private readonly VulkanRenderer _gd; + private readonly SurfaceKHR _surface; + private readonly PhysicalDevice _physicalDevice; + private readonly Device _device; + private SwapchainKHR _swapchain; + + private Image[] _swapchainImages; + private TextureView[] _swapchainImageViews; + + private Semaphore[] _imageAvailableSemaphores; + private Semaphore[] _renderFinishedSemaphores; + + private int _frameIndex; + + private int _width; + private int _height; + private VSyncMode _vSyncMode; + private bool _swapchainIsDirty; + private VkFormat _format; + private AntiAliasing _currentAntiAliasing; + private bool _updateEffect; + private IPostProcessingEffect _effect; + private IScalingFilter _scalingFilter; + private bool _isLinear; + private float _scalingFilterLevel; + private bool _updateScalingFilter; + private ScalingFilter _currentScalingFilter; + private bool _colorSpacePassthroughEnabled; + + public unsafe Window(VulkanRenderer gd, SurfaceKHR surface, PhysicalDevice physicalDevice, Device device) + { + _gd = gd; + _physicalDevice = physicalDevice; + _device = device; + _surface = surface; + + CreateSwapchain(); + } + + private void RecreateSwapchain() + { + var oldSwapchain = _swapchain; + _swapchainIsDirty = false; + + for (int i = 0; i < _swapchainImageViews.Length; i++) + { + _swapchainImageViews[i].Dispose(); + } + + // Destroy old Swapchain. + + _gd.Api.DeviceWaitIdle(_device); + + unsafe + { + for (int i = 0; i < _imageAvailableSemaphores.Length; i++) + { + _gd.Api.DestroySemaphore(_device, _imageAvailableSemaphores[i], null); + } + + for (int i = 0; i < _renderFinishedSemaphores.Length; i++) + { + _gd.Api.DestroySemaphore(_device, _renderFinishedSemaphores[i], null); + } + } + + _gd.SwapchainApi.DestroySwapchain(_device, oldSwapchain, Span.Empty); + + CreateSwapchain(); + } + + private unsafe void CreateSwapchain() + { + _gd.SurfaceApi.GetPhysicalDeviceSurfaceCapabilities(_physicalDevice, _surface, out var capabilities); + + uint surfaceFormatsCount; + + _gd.SurfaceApi.GetPhysicalDeviceSurfaceFormats(_physicalDevice, _surface, &surfaceFormatsCount, null); + + var surfaceFormats = new SurfaceFormatKHR[surfaceFormatsCount]; + + fixed (SurfaceFormatKHR* pSurfaceFormats = surfaceFormats) + { + _gd.SurfaceApi.GetPhysicalDeviceSurfaceFormats(_physicalDevice, _surface, &surfaceFormatsCount, pSurfaceFormats); + } + + uint presentModesCount; + + _gd.SurfaceApi.GetPhysicalDeviceSurfacePresentModes(_physicalDevice, _surface, &presentModesCount, null); + + var presentModes = new PresentModeKHR[presentModesCount]; + + fixed (PresentModeKHR* pPresentModes = presentModes) + { + _gd.SurfaceApi.GetPhysicalDeviceSurfacePresentModes(_physicalDevice, _surface, &presentModesCount, pPresentModes); + } + + uint imageCount = capabilities.MinImageCount + 1; + if (capabilities.MaxImageCount > 0 && imageCount > capabilities.MaxImageCount) + { + imageCount = capabilities.MaxImageCount; + } + + var surfaceFormat = ChooseSwapSurfaceFormat(surfaceFormats, _colorSpacePassthroughEnabled); + + var extent = ChooseSwapExtent(capabilities); + + _width = (int)extent.Width; + _height = (int)extent.Height; + _format = surfaceFormat.Format; + + var oldSwapchain = _swapchain; + + var swapchainCreateInfo = new SwapchainCreateInfoKHR + { + SType = StructureType.SwapchainCreateInfoKhr, + Surface = _surface, + MinImageCount = imageCount, + ImageFormat = surfaceFormat.Format, + ImageColorSpace = surfaceFormat.ColorSpace, + ImageExtent = extent, + ImageUsage = ImageUsageFlags.ColorAttachmentBit | ImageUsageFlags.TransferDstBit | ImageUsageFlags.StorageBit, + ImageSharingMode = SharingMode.Exclusive, + ImageArrayLayers = 1, + PreTransform = capabilities.CurrentTransform, + CompositeAlpha = ChooseCompositeAlpha(capabilities.SupportedCompositeAlpha), + PresentMode = ChooseSwapPresentMode(presentModes, _vSyncMode), + Clipped = true, + }; + + var textureCreateInfo = new TextureCreateInfo( + _width, + _height, + 1, + 1, + 1, + 1, + 1, + 1, + FormatTable.GetFormat(surfaceFormat.Format), + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + _gd.SwapchainApi.CreateSwapchain(_device, in swapchainCreateInfo, null, out _swapchain).ThrowOnError(); + + _gd.SwapchainApi.GetSwapchainImages(_device, _swapchain, &imageCount, null); + + _swapchainImages = new Image[imageCount]; + + fixed (Image* pSwapchainImages = _swapchainImages) + { + _gd.SwapchainApi.GetSwapchainImages(_device, _swapchain, &imageCount, pSwapchainImages); + } + + _swapchainImageViews = new TextureView[imageCount]; + + for (int i = 0; i < _swapchainImageViews.Length; i++) + { + _swapchainImageViews[i] = CreateSwapchainImageView(_swapchainImages[i], surfaceFormat.Format, textureCreateInfo); + } + + var semaphoreCreateInfo = new SemaphoreCreateInfo + { + SType = StructureType.SemaphoreCreateInfo, + }; + + _imageAvailableSemaphores = new Semaphore[imageCount]; + + for (int i = 0; i < _imageAvailableSemaphores.Length; i++) + { + _gd.Api.CreateSemaphore(_device, in semaphoreCreateInfo, null, out _imageAvailableSemaphores[i]).ThrowOnError(); + } + + _renderFinishedSemaphores = new Semaphore[imageCount]; + + for (int i = 0; i < _renderFinishedSemaphores.Length; i++) + { + _gd.Api.CreateSemaphore(_device, in semaphoreCreateInfo, null, out _renderFinishedSemaphores[i]).ThrowOnError(); + } + } + + private unsafe TextureView CreateSwapchainImageView(Image swapchainImage, VkFormat format, TextureCreateInfo info) + { + var componentMapping = new ComponentMapping( + ComponentSwizzle.R, + ComponentSwizzle.G, + ComponentSwizzle.B, + ComponentSwizzle.A); + + var aspectFlags = ImageAspectFlags.ColorBit; + + var subresourceRange = new ImageSubresourceRange(aspectFlags, 0, 1, 0, 1); + + var imageCreateInfo = new ImageViewCreateInfo + { + SType = StructureType.ImageViewCreateInfo, + Image = swapchainImage, + ViewType = ImageViewType.Type2D, + Format = format, + Components = componentMapping, + SubresourceRange = subresourceRange, + }; + + _gd.Api.CreateImageView(_device, in imageCreateInfo, null, out var imageView).ThrowOnError(); + + return new TextureView(_gd, _device, new DisposableImageView(_gd.Api, _device, imageView), info, format); + } + + private static SurfaceFormatKHR ChooseSwapSurfaceFormat(SurfaceFormatKHR[] availableFormats, bool colorSpacePassthroughEnabled) + { + if (availableFormats.Length == 1 && availableFormats[0].Format == VkFormat.Undefined) + { + return new SurfaceFormatKHR(VkFormat.B8G8R8A8Unorm, ColorSpaceKHR.PaceSrgbNonlinearKhr); + } + + var formatToReturn = availableFormats[0]; + if (colorSpacePassthroughEnabled) + { + foreach (var format in availableFormats) + { + if (format.Format == VkFormat.B8G8R8A8Unorm && format.ColorSpace == ColorSpaceKHR.SpacePassThroughExt) + { + formatToReturn = format; + break; + } + else if (format.Format == VkFormat.B8G8R8A8Unorm && format.ColorSpace == ColorSpaceKHR.PaceSrgbNonlinearKhr) + { + formatToReturn = format; + } + } + } + else + { + foreach (var format in availableFormats) + { + if (format.Format == VkFormat.B8G8R8A8Unorm && format.ColorSpace == ColorSpaceKHR.PaceSrgbNonlinearKhr) + { + formatToReturn = format; + break; + } + } + } + + return formatToReturn; + } + + private static CompositeAlphaFlagsKHR ChooseCompositeAlpha(CompositeAlphaFlagsKHR supportedFlags) + { + if (supportedFlags.HasFlag(CompositeAlphaFlagsKHR.OpaqueBitKhr)) + { + return CompositeAlphaFlagsKHR.OpaqueBitKhr; + } + else if (supportedFlags.HasFlag(CompositeAlphaFlagsKHR.PreMultipliedBitKhr)) + { + return CompositeAlphaFlagsKHR.PreMultipliedBitKhr; + } + else + { + return CompositeAlphaFlagsKHR.InheritBitKhr; + } + } + + private static PresentModeKHR ChooseSwapPresentMode(PresentModeKHR[] availablePresentModes, VSyncMode vSyncMode) + { + if (vSyncMode == VSyncMode.Unbounded && availablePresentModes.Contains(PresentModeKHR.ImmediateKhr)) + { + return PresentModeKHR.ImmediateKhr; + } + else if (availablePresentModes.Contains(PresentModeKHR.MailboxKhr)) + { + return PresentModeKHR.MailboxKhr; + } + else + { + return PresentModeKHR.FifoKhr; + } + } + + public static Extent2D ChooseSwapExtent(SurfaceCapabilitiesKHR capabilities) + { + if (capabilities.CurrentExtent.Width != uint.MaxValue) + { + return capabilities.CurrentExtent; + } + + uint width = Math.Max(capabilities.MinImageExtent.Width, Math.Min(capabilities.MaxImageExtent.Width, SurfaceWidth)); + uint height = Math.Max(capabilities.MinImageExtent.Height, Math.Min(capabilities.MaxImageExtent.Height, SurfaceHeight)); + + return new Extent2D(width, height); + } + + public unsafe override void Present(ITexture texture, ImageCrop crop, Action swapBuffersCallback) + { + _gd.PipelineInternal.AutoFlush.Present(); + + uint nextImage = 0; + int semaphoreIndex = _frameIndex++ % _imageAvailableSemaphores.Length; + + while (true) + { + var acquireResult = _gd.SwapchainApi.AcquireNextImage( + _device, + _swapchain, + ulong.MaxValue, + _imageAvailableSemaphores[semaphoreIndex], + new Fence(), + ref nextImage); + + if (acquireResult == Result.ErrorOutOfDateKhr || + acquireResult == Result.SuboptimalKhr || + _swapchainIsDirty) + { + RecreateSwapchain(); + semaphoreIndex = (_frameIndex - 1) % _imageAvailableSemaphores.Length; + } + else + { + acquireResult.ThrowOnError(); + break; + } + } + + var swapchainImage = _swapchainImages[nextImage]; + + _gd.FlushAllCommands(); + + var cbs = _gd.CommandBufferPool.Rent(); + + Transition( + cbs.CommandBuffer, + swapchainImage, + 0, + AccessFlags.TransferWriteBit, + ImageLayout.Undefined, + ImageLayout.General); + + var view = (TextureView)texture; + + UpdateEffect(); + + if (_effect != null) + { + view = _effect.Run(view, cbs, _width, _height); + } + + int srcX0, srcX1, srcY0, srcY1; + + if (crop.Left == 0 && crop.Right == 0) + { + srcX0 = 0; + srcX1 = view.Width; + } + else + { + srcX0 = crop.Left; + srcX1 = crop.Right; + } + + if (crop.Top == 0 && crop.Bottom == 0) + { + srcY0 = 0; + srcY1 = view.Height; + } + else + { + srcY0 = crop.Top; + srcY1 = crop.Bottom; + } + + if (ScreenCaptureRequested) + { + if (_effect != null) + { + _gd.CommandBufferPool.Return( + cbs, + null, + stackalloc[] { PipelineStageFlags.ColorAttachmentOutputBit }, + null); + _gd.FlushAllCommands(); + cbs.GetFence().Wait(); + cbs = _gd.CommandBufferPool.Rent(); + } + + CaptureFrame(view, srcX0, srcY0, srcX1 - srcX0, srcY1 - srcY0, view.Info.Format.IsBgr(), crop.FlipX, crop.FlipY); + + ScreenCaptureRequested = false; + } + + float ratioX = crop.IsStretched ? 1.0f : MathF.Min(1.0f, _height * crop.AspectRatioX / (_width * crop.AspectRatioY)); + float ratioY = crop.IsStretched ? 1.0f : MathF.Min(1.0f, _width * crop.AspectRatioY / (_height * crop.AspectRatioX)); + + int dstWidth = (int)(_width * ratioX); + int dstHeight = (int)(_height * ratioY); + + int dstPaddingX = (_width - dstWidth) / 2; + int dstPaddingY = (_height - dstHeight) / 2; + + int dstX0 = crop.FlipX ? _width - dstPaddingX : dstPaddingX; + int dstX1 = crop.FlipX ? dstPaddingX : _width - dstPaddingX; + + int dstY0 = crop.FlipY ? dstPaddingY : _height - dstPaddingY; + int dstY1 = crop.FlipY ? _height - dstPaddingY : dstPaddingY; + + if (_scalingFilter != null) + { + _scalingFilter.Run( + view, + cbs, + _swapchainImageViews[nextImage].GetImageViewForAttachment(), + _format, + _width, + _height, + new Extents2D(srcX0, srcY0, srcX1, srcY1), + new Extents2D(dstX0, dstY0, dstX1, dstY1) + ); + } + else + { + _gd.HelperShader.BlitColor( + _gd, + cbs, + view, + _swapchainImageViews[nextImage], + new Extents2D(srcX0, srcY0, srcX1, srcY1), + new Extents2D(dstX0, dstY1, dstX1, dstY0), + _isLinear, + true); + } + + Transition( + cbs.CommandBuffer, + swapchainImage, + 0, + 0, + ImageLayout.General, + ImageLayout.PresentSrcKhr); + + _gd.CommandBufferPool.Return( + cbs, + stackalloc[] { _imageAvailableSemaphores[semaphoreIndex] }, + stackalloc[] { PipelineStageFlags.ColorAttachmentOutputBit }, + stackalloc[] { _renderFinishedSemaphores[semaphoreIndex] }); + + // TODO: Present queue. + var semaphore = _renderFinishedSemaphores[semaphoreIndex]; + var swapchain = _swapchain; + + Result result; + + var presentInfo = new PresentInfoKHR + { + SType = StructureType.PresentInfoKhr, + WaitSemaphoreCount = 1, + PWaitSemaphores = &semaphore, + SwapchainCount = 1, + PSwapchains = &swapchain, + PImageIndices = &nextImage, + PResults = &result, + }; + + lock (_gd.QueueLock) + { + _gd.SwapchainApi.QueuePresent(_gd.Queue, in presentInfo); + } + } + + public override void SetAntiAliasing(AntiAliasing effect) + { + if (_currentAntiAliasing == effect && _effect != null) + { + return; + } + + _currentAntiAliasing = effect; + + _updateEffect = true; + } + + public override void SetScalingFilter(ScalingFilter type) + { + if (_currentScalingFilter == type && _effect != null) + { + return; + } + + _currentScalingFilter = type; + + _updateScalingFilter = true; + } + + public override void SetColorSpacePassthrough(bool colorSpacePassthroughEnabled) + { + _colorSpacePassthroughEnabled = colorSpacePassthroughEnabled; + _swapchainIsDirty = true; + } + + private void UpdateEffect() + { + if (_updateEffect) + { + _updateEffect = false; + + switch (_currentAntiAliasing) + { + case AntiAliasing.Fxaa: + _effect?.Dispose(); + _effect = new FxaaPostProcessingEffect(_gd, _device); + break; + case AntiAliasing.None: + _effect?.Dispose(); + _effect = null; + break; + case AntiAliasing.SmaaLow: + case AntiAliasing.SmaaMedium: + case AntiAliasing.SmaaHigh: + case AntiAliasing.SmaaUltra: + var quality = _currentAntiAliasing - AntiAliasing.SmaaLow; + if (_effect is SmaaPostProcessingEffect smaa) + { + smaa.Quality = quality; + } + else + { + _effect?.Dispose(); + _effect = new SmaaPostProcessingEffect(_gd, _device, quality); + } + break; + } + } + + if (_updateScalingFilter) + { + _updateScalingFilter = false; + + switch (_currentScalingFilter) + { + case ScalingFilter.Bilinear: + case ScalingFilter.Nearest: + _scalingFilter?.Dispose(); + _scalingFilter = null; + _isLinear = _currentScalingFilter == ScalingFilter.Bilinear; + break; + case ScalingFilter.Fsr: + if (_scalingFilter is not FsrScalingFilter) + { + _scalingFilter?.Dispose(); + _scalingFilter = new FsrScalingFilter(_gd, _device); + } + + _scalingFilter.Level = _scalingFilterLevel; + break; + case ScalingFilter.Area: + if (_scalingFilter is not AreaScalingFilter) + { + _scalingFilter?.Dispose(); + _scalingFilter = new AreaScalingFilter(_gd, _device); + } + break; + } + } + } + + public override void SetScalingFilterLevel(float level) + { + _scalingFilterLevel = level; + _updateScalingFilter = true; + } + + private unsafe void Transition( + CommandBuffer commandBuffer, + Image image, + AccessFlags srcAccess, + AccessFlags dstAccess, + ImageLayout srcLayout, + ImageLayout dstLayout) + { + var subresourceRange = new ImageSubresourceRange(ImageAspectFlags.ColorBit, 0, 1, 0, 1); + + var barrier = new ImageMemoryBarrier + { + SType = StructureType.ImageMemoryBarrier, + SrcAccessMask = srcAccess, + DstAccessMask = dstAccess, + OldLayout = srcLayout, + NewLayout = dstLayout, + SrcQueueFamilyIndex = Vk.QueueFamilyIgnored, + DstQueueFamilyIndex = Vk.QueueFamilyIgnored, + Image = image, + SubresourceRange = subresourceRange, + }; + + _gd.Api.CmdPipelineBarrier( + commandBuffer, + PipelineStageFlags.TopOfPipeBit, + PipelineStageFlags.AllCommandsBit, + 0, + 0, + null, + 0, + null, + 1, + in barrier); + } + + private void CaptureFrame(TextureView texture, int x, int y, int width, int height, bool isBgra, bool flipX, bool flipY) + { + byte[] bitmap = texture.GetData(x, y, width, height); + + _gd.OnScreenCaptured(new ScreenCaptureImageInfo(width, height, isBgra, bitmap, flipX, flipY)); + } + + public override void SetSize(int width, int height) + { + // We don't need to use width and height as we can get the size from the surface. + _swapchainIsDirty = true; + } + + public override void ChangeVSyncMode(VSyncMode vSyncMode) + { + _vSyncMode = vSyncMode; + //present mode may change, so mark the swapchain for recreation + _swapchainIsDirty = true; + } + + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + unsafe + { + for (int i = 0; i < _swapchainImageViews.Length; i++) + { + _swapchainImageViews[i].Dispose(); + } + + for (int i = 0; i < _imageAvailableSemaphores.Length; i++) + { + _gd.Api.DestroySemaphore(_device, _imageAvailableSemaphores[i], null); + } + + for (int i = 0; i < _renderFinishedSemaphores.Length; i++) + { + _gd.Api.DestroySemaphore(_device, _renderFinishedSemaphores[i], null); + } + + _gd.SwapchainApi.DestroySwapchain(_device, _swapchain, null); + } + + _effect?.Dispose(); + _scalingFilter?.Dispose(); + } + } + + public override void Dispose() + { + Dispose(true); + } + } +} diff --git a/src/Ryujinx.Graphics.Rdna3Vulkan/WindowBase.cs b/src/Ryujinx.Graphics.Rdna3Vulkan/WindowBase.cs new file mode 100644 index 000000000..1cc05c781 --- /dev/null +++ b/src/Ryujinx.Graphics.Rdna3Vulkan/WindowBase.cs @@ -0,0 +1,20 @@ +using Ryujinx.Common.Configuration; +using Ryujinx.Graphics.GAL; +using System; + +namespace Ryujinx.Graphics.Rdna3Vulkan +{ + internal abstract class WindowBase : IWindow + { + public bool ScreenCaptureRequested { get; set; } + + public abstract void Dispose(); + public abstract void Present(ITexture texture, ImageCrop crop, Action swapBuffersCallback); + public abstract void SetSize(int width, int height); + public abstract void ChangeVSyncMode(VSyncMode vSyncMode); + public abstract void SetAntiAliasing(AntiAliasing effect); + public abstract void SetScalingFilter(ScalingFilter scalerType); + public abstract void SetScalingFilterLevel(float scale); + public abstract void SetColorSpacePassthrough(bool colorSpacePassthroughEnabled); + } +}